diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 48de111f8301..8786471a7bdd 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -85,7 +85,7 @@ fi CMAKE_VERSION=3.18.5 _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb -_UCC_COMMIT=1522ccff5e107451d747b1085b3f84714a6c2eea +_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b # It's annoying to rename jobs every time you want to rewrite a # configuration, so we hardcode everything here rather than do it @@ -149,6 +149,21 @@ case "$image" in TRITON=yes INDUCTOR_BENCHMARKS=yes ;; + pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks) + CUDA_VERSION=12.1.1 + CUDNN_VERSION=8 + ANACONDA_PYTHON_VERSION=3.12 + GCC_VERSION=9 + PROTOBUF=yes + DB=yes + VISION=yes + KATEX=yes + UCX_COMMIT=${_UCX_COMMIT} + UCC_COMMIT=${_UCC_COMMIT} + CONDA_CMAKE=yes + TRITON=yes + INDUCTOR_BENCHMARKS=yes + ;; pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9) CUDA_VERSION=11.8.0 CUDNN_VERSION=8 diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh index f5e5ce92af4a..8a6dc4d1c79c 100644 --- a/.ci/docker/common/install_acl.sh +++ b/.ci/docker/common/install_acl.sh @@ -1,6 +1,6 @@ set -euo pipefail -readonly version=v23.08 +readonly version=v24.04 readonly src_host=https://review.mlplatform.org/ml readonly src_repo=ComputeLibrary diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index 19d28eeefd9a..9483bb630d4e 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -588,6 +588,15 @@ test_inductor_torchbench_smoketest_perf() { "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \ --expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv done + + # Perform some "warm-start" runs for a few huggingface models. + for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do + python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \ + --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" + python benchmarks/dynamo/check_accuracy.py \ + --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \ + --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv" + done } test_inductor_torchbench_cpu_smoketest_perf(){ @@ -1269,6 +1278,10 @@ elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHAR elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then install_torchvision test_dynamo_shard "${SHARD_NUMBER}" +elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then + install_torchvision + test_python_shard "$SHARD_NUMBER" + test_aten elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then test_without_numpy install_torchvision @@ -1298,10 +1311,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then test_libtorch elif [[ "${TEST_CONFIG}" = docs_test ]]; then test_docs_test -elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then - install_torchvision - test_python - test_aten elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then install_torchvision test_python diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 9830a3ce9650..98cd949f9713 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -ea437b31ce316ea3d66fe73768c0dcb94edb79ad +1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0 diff --git 
a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 9f0dfe973dc9..bb356dce5da9 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -42,6 +42,7 @@ jobs: pytorch-linux-focal-cuda12.4-cudnn8-py3-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9, pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks, + pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks, pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9, pytorch-linux-focal-py3.8-clang10, pytorch-linux-focal-py3.11-clang10, diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 0ad799a80bcc..3d1c3a539686 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -107,6 +107,27 @@ jobs: secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + linux-focal-cuda12_1-py3_12-gcc9-inductor-build: + name: cuda12.1-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86 + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3.12-gcc9-inductor-benchmarks + cuda-arch-list: '8.6' + test-matrix: | + { include: [ + { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, + ]} + + linux-focal-cuda12_1-py3_12-gcc9-inductor-test: + name: cuda12.1-py3.12-gcc9-sm86 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-cuda12_1-py3_12-gcc9-inductor-build + with: + build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86 + docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }} + linux-jammy-cpu-py3_8-gcc11-inductor-build: name: linux-jammy-cpu-py3.8-gcc11-inductor uses: ./.github/workflows/_linux-build.yml @@ -125,7 +146,7 @@ jobs: { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, - { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, + { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" }, ]} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 3414f23f690f..00813edd3d91 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -192,7 +192,9 @@ jobs: sync-tag: rocm-build test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, + { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, + { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, ]} linux-focal-rocm6_1-py3_8-test: @@ -208,4 +210,4 @@ jobs: build-environment: linux-focal-rocm6.1-py3.8 docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" + tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs 
test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl" \ No newline at end of file diff --git a/.github/workflows/upload_test_stats_intermediate.yml b/.github/workflows/upload_test_stats_intermediate.yml new file mode 100644 index 000000000000..14b65f6a75ef --- /dev/null +++ b/.github/workflows/upload_test_stats_intermediate.yml @@ -0,0 +1,43 @@ +name: Upload test stats intermediate + +on: + workflow_dispatch: + inputs: + workflow_id: + description: workflow_id of the run + required: true + workflow_run_attempt: + description: workflow_run_attempt of the run + required: true + +jobs: + intermediate_upload_test_stats: + name: Intermediate upload test stats for ${{ inputs.workflow_id }} + runs-on: ubuntu-22.04 + environment: upload-stats + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + fetch-depth: 1 + submodules: false + + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + - run: | + pip3 install requests==2.26 rockset==1.0.3 boto3==1.19.12 + + - name: Upload test stats + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + WORKFLOW_RUN_ID: ${{ inputs.workflow_id }} + WORKFLOW_RUN_ATTEMPT: ${{ inputs.workflow_run_attempt }} + run: | + python3 -m tools.stats.upload_test_stats_intermediate \ + --workflow-run-id "${WORKFLOW_RUN_ID}" \ + --workflow-run-attempt "${WORKFLOW_RUN_ATTEMPT}" \ diff --git a/.gitmodules b/.gitmodules index c9b84a370167..db7698876a29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,10 +2,6 @@ ignore = dirty path = third_party/pybind11 url = https://github.com/pybind/pybind11.git -[submodule "third_party/cub"] - ignore = dirty - path = third_party/cub - url = https://github.com/NVlabs/cub.git [submodule "third_party/eigen"] ignore = dirty path = third_party/eigen @@ -50,10 +46,6 @@ ignore = dirty path = third_party/psimd url = https://github.com/Maratyszcza/psimd.git -[submodule "third_party/zstd"] - ignore = dirty - path = third_party/zstd - url = https://github.com/facebook/zstd.git [submodule "third_party/cpuinfo"] ignore = dirty path = third_party/cpuinfo @@ -152,3 +144,7 @@ [submodule "third_party/opentelemetry-cpp"] path = third_party/opentelemetry-cpp url = https://github.com/open-telemetry/opentelemetry-cpp.git +[submodule "third_party/cpp-httplib"] + path = third_party/cpp-httplib + url = https://github.com/yhirose/cpp-httplib.git + branch = v0.15.3 diff --git a/.lintrunner.toml b/.lintrunner.toml index 938e9521f72d..50eb09984fec 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1052,13 +1052,13 @@ exclude_patterns = [ 'test/quantization/fx/test_numeric_suite_fx.py', 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', - 'test/test_datapipe.py', 'test/test_fake_tensor.py', 'test/test_flop_counter.py', 'test/test_function_schema.py', 'test/test_functional_autograd_benchmark.py', 'test/test_functional_optim.py', 'test/test_functionalization_of_rng_ops.py', + 'test/test_datapipe.py', 'test/test_futures.py', 'test/test_fx.py', 'test/test_fx_experimental.py', @@ -1143,7 +1143,6 @@ exclude_patterns = [ 'test/test_transformers.py', 'test/test_type_promotion.py', 'test/test_unary_ufuncs.py', - 'test/test_utils.py', 'test/test_vulkan.py', 'test/test_xnnpack_integration.py', 'test/torch_np/numpy_test/**/*.py', @@ -1930,8 +1929,6 @@ exclude_patterns = [ 
'torch/utils/_mode_utils.py', 'torch/utils/_python_dispatch.py', 'torch/utils/_stats.py', - 'torch/utils/_sympy/__init__.py', - 'torch/utils/_sympy/functions.py', 'torch/utils/_traceback.py', 'torch/utils/_zip.py', 'torch/utils/backcompat/__init__.py', diff --git a/BUILD.bazel b/BUILD.bazel index d3084d9ebd44..831d64b44c2f 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -663,6 +663,7 @@ cu_library( name = "torch_cuda", srcs = [ "torch/csrc/distributed/c10d/intra_node_comm.cu", + "torch/csrc/distributed/c10d/Utils.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], copts = torch_cuda_half_options, @@ -771,7 +772,7 @@ cc_library( [ "torch/*.h", "torch/csrc/**/*.h", - "torch/csrc/distributed/c10d/*.hpp", + "torch/csrc/distributed/c10d/**/*.hpp", "torch/lib/libshm/*.h", ], exclude = [ @@ -830,6 +831,7 @@ cc_library( "torch/csrc/cuda/python_nccl.cpp", "torch/csrc/cuda/nccl.cpp", "torch/csrc/distributed/c10d/intra_node_comm.cu", + "torch/csrc/distributed/c10d/Utils.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], )) + torch_sources, diff --git a/CMakeLists.txt b/CMakeLists.txt index f7561d606cbd..02cf8dedc79e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -279,11 +279,13 @@ endif() option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF) option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON) option(USE_LITE_INTERPRETER_PROFILER "Enable" ON) +cmake_dependent_option( + USE_LITE_AOTI "Include AOTI sources" OFF + "BUILD_LITE_INTERPRETER" OFF) option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) # option USE_XNNPACK: try to enable xnnpack by default. option(USE_XNNPACK "Use XNNPACK" ON) -option(USE_ZSTD "Use ZSTD" OFF) option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF) # Ensure that an ITT build is the default for x86 CPUs cmake_dependent_option( @@ -413,7 +415,6 @@ option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF) option(USE_SYSTEM_BENCHMARK "Use system-provided google benchmark." OFF) option(USE_SYSTEM_ONNX "Use system-provided onnx." OFF) option(USE_SYSTEM_XNNPACK "Use system-provided xnnpack." OFF) -option(USE_SYSTEM_ZSTD "Use system-provided zstd." 
OFF) option(USE_GOLD_LINKER "Use ld.gold to link" OFF) if(USE_SYSTEM_LIBS) set(USE_SYSTEM_CPUINFO ON) @@ -435,9 +436,6 @@ if(USE_SYSTEM_LIBS) if(USE_TBB) set(USE_SYSTEM_TBB ON) endif() - if(USE_ZSTD) - set(USE_SYSTEM_ZSTD ON) - endif() endif() # Used when building Caffe2 through setup.py diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 6c505f8b656c..3086fa18add6 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -887,12 +887,12 @@ c10::intrusive_ptr ivalue::Object::create( } IValue IValue::deepcopy(std::optional device) const { - IValue::HashAliasedIValueMap memo; + IValue::HashIdentityIValueMap memo; return deepcopy(memo, device); } IValue IValue::deepcopy( - IValue::HashAliasedIValueMap& memo, + IValue::HashIdentityIValueMap& memo, std::optional device) const { if (memo.count(*this)) { return memo.at(*this); } @@ -1028,12 +1028,12 @@ c10::intrusive_ptr ivalue::Object::copy_to_weak_compilation_ref( c10::intrusive_ptr ivalue::Object::deepcopy( std::optional device) const { - IValue::HashAliasedIValueMap memo; + IValue::HashIdentityIValueMap memo; return deepcopy(memo, device); } c10::intrusive_ptr ivalue::Object::deepcopy( - IValue::HashAliasedIValueMap& memo, + IValue::HashIdentityIValueMap& memo, std::optional device) const { auto cu = type_.cu_; auto object = ivalue::Object::create(WeakOrStrongTypePtr(type_.cu_, type_.type_), type()->numAttributes()); diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 7715ffbe3c31..922b10b8efeb 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -1117,6 +1117,23 @@ struct TORCH_API IValue final { using HashAliasedIValueMap = std::unordered_map; + struct HashIdentityIValue { + size_t operator()(const IValue& val) const { + return val.payload.u.as_int; + } + }; + + struct CompIdentityIValues { + bool operator()(const IValue& lhs, const IValue& rhs) const { + return lhs.is(rhs); + } + }; + + using HashIdentityIValues = + std::unordered_set; + using HashIdentityIValueMap = + std::unordered_map; + + // Checks if this and rhs have subvalues in common. // [t1,t2] and [t2, t3] returns true.
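
The memo-table switch above changes IValue::deepcopy from alias-based to identity-based memoization: HashIdentityIValue hashes the raw payload bits and CompIdentityIValues compares with lhs.is(rhs), so a previously made copy is reused only when the very same object is encountered again, not when a merely aliasing value is. A minimal standalone C++ sketch of identity-keyed deepcopy memoization (the Node type and names here are illustrative, not part of this change):

#include <cstdio>
#include <memory>
#include <unordered_map>
#include <vector>

struct Node {
  int value;
  std::vector<std::shared_ptr<Node>> children;
};

// The memo is keyed on object identity (the pointer itself), mirroring
// HashIdentityIValueMap: hash = raw payload, equality = IValue::is().
using IdentityMemo = std::unordered_map<const Node*, std::shared_ptr<Node>>;

std::shared_ptr<Node> deepcopy(const std::shared_ptr<Node>& src, IdentityMemo& memo) {
  if (auto it = memo.find(src.get()); it != memo.end()) {
    return it->second;  // same object seen before: reuse its copy
  }
  auto dst = std::make_shared<Node>(Node{src->value, {}});
  memo.emplace(src.get(), dst);  // record before recursing so shared/cyclic graphs terminate
  for (const auto& child : src->children) {
    dst->children.push_back(deepcopy(child, memo));
  }
  return dst;
}

int main() {
  auto shared = std::make_shared<Node>(Node{1, {}});
  auto root = std::make_shared<Node>(Node{0, {shared, shared}});
  IdentityMemo memo;
  auto copy = deepcopy(root, memo);
  // Sharing survives the copy: both children of the copy are the same new object.
  std::printf("%d\n", copy->children[0].get() == copy->children[1].get());
  return 0;
}
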
bool overlaps(const IValue& rhs) const; @@ -1130,7 +1147,7 @@ struct TORCH_API IValue final { void visit(const std::function& visitor) const; IValue deepcopy(std::optional device = c10::nullopt) const; IValue deepcopy( - HashAliasedIValueMap& memo, + HashIdentityIValueMap& memo, std::optional device = c10::nullopt) const; private: diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index b1124c12cfb3..b99229f2759c 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -1589,7 +1589,7 @@ struct C10_EXPORT ivalue::Object final : c10::intrusive_ptr_target { std::optional device = c10::nullopt) const; c10::intrusive_ptr deepcopy( - IValue::HashAliasedIValueMap& memo, + IValue::HashIdentityIValueMap& memo, std::optional device = c10::nullopt) const; bool is_weak_compilation_ref() const { diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 2502456e285b..ce991a9bcad4 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -1422,10 +1422,13 @@ void scaled_gemm( computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60200) + // Amax support in ROCm as of 6.2 + if (isFloat8Type(result_dtype)) { + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr); + } +#endif #ifndef USE_ROCM -if (isFloat8Type(result_dtype)) { - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr); -} computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); #endif CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't'); diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index ce4b4d15b796..2801b2a2b950 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -117,6 +117,8 @@ #include #include #include +#include +#include #endif // First the required LAPACK implementations are registered here. @@ -1556,7 +1558,7 @@ void _linalg_check_errors( ": The algorithm failed to converge because the input matrix is ill-conditioned or has too many repeated eigenvalues (error code: ", info, ")."); } else if (api_name.find("lstsq") != api_name.npos) { TORCH_CHECK_LINALG(false, api_name, batch_str, - ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, ")."); + ": The least squares solution could not be computed because the input matrix does not have full rank (error code: ", info, "). Specify SVD in the driver if you would like to do this."); } else if (api_name.find("lu_factor") != api_name.npos) { TORCH_CHECK(false, api_name, batch_str, ": U[", info, ",", info, "] is zero and using it on lu_solve would result in a division by zero. 
" @@ -3427,8 +3429,30 @@ static void linalg_lstsq_out_info( auto input_working_copy = copyBatchedColumnMajor(input); // now the actual call that computes the result in-place (apply_lstsq) - lstsq_stub(input.device().type(), input_working_copy, solution, rank, singular_values, infos, rcond, driver); - + if (driver == "gelss" && input.device() != at::kCPU) { + if (input.numel() == 0) { + auto output_shape = input.sizes().vec(); + output_shape.back() = other.size(-1); + solution.zero_(); + } else { + auto [U, S, Vh] = at::_linalg_svd(input, false, true, "gesvd"); + rank = at::zeros({1}, at::kLong); + + auto S_pinv = S.reciprocal(); + auto s1 = at::narrow(S, /*dim=*/-1, /*start=*/0, /*length=*/1); // singular values are sorted in descending order + S_pinv.masked_fill_(S < rcond * s1, 0); + rank[0] = (S != 0).sum(); + auto uhOther = at::matmul(U.adjoint(), other); + if(S_pinv.dim() != uhOther.dim()) { + S_pinv = S_pinv.unsqueeze(-1); + } + auto S_pinv_other = S_pinv * uhOther; + solution = at::matmul(Vh.adjoint(), S_pinv_other); + } + } + else { + lstsq_stub(input.device().type(), input_working_copy, solution, rank, singular_values, infos, rcond, driver); + } // residuals are available only if m > n and drivers other than gelsy used if (m > n && driver != "gelsy") { // if the driver is gelss or gelsd then the residuals are available only if rank == n @@ -3456,9 +3480,15 @@ static void linalg_lstsq_out_info( at::sum_out(residuals, raw_residuals, /*dim=*/-2, /*keepdim=*/false, /*dtype*/real_dtype); } } - auto solution_view = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n); - // manually restride original - solution.set_(solution.storage(), solution_view.storage_offset(), solution_view.sizes(), solution_view.strides()); + if (solution.size(-2) >= n) { + auto solution_view = solution.narrow(/*dim=*/-2, /*start=*/0, /*length*/n); + + // manually restride original + solution.set_(solution.storage(), solution_view.storage_offset(), + solution_view.sizes(), solution_view.strides()); + } else { + solution = at::zeros({solution.size(-1), n}, solution.options()); + } if (m == 0) { solution.zero_(); } @@ -3490,8 +3520,8 @@ static std::string get_default_lstsq_driver(std::optional driv ); } else { // else if (input.is_cuda()) TORCH_CHECK( - driver_str == "gels", - "torch.linalg.lstsq: `driver` other than `gels` is not supported on CUDA" + (driver_str == "gelss" || driver_str == "gels"), + "torch.linalg.lstsq: `driver` other than `gels` or `gelss` is not supported on CUDA" ); } } else { diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index 48a077814880..af34ae5c582a 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -215,6 +215,87 @@ static inline float16_t reduce(float16x8_t x) { return reduce(vadd_f16(vget_low_f16(x), vget_high_f16(x))); } +/* + * The below reduce overload and + * fp16_gemv_trans_fp16_arith_by_dot_products function is adapted from + * llama.cpp's ggml_vec_dot_f16 and surrounding utility functions, so + * here is the required copyright notice: + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is 
+ * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define F16_ELEMENTS_PER_ITERATION 32 +#define F16_ELEMENTS_PER_REGISTER 8 +#define F16_REGISTERS_PER_ITERATION (F16_ELEMENTS_PER_ITERATION / F16_ELEMENTS_PER_REGISTER) +static inline double reduce(float16x8_t x[F16_REGISTERS_PER_ITERATION]) { + int offset = F16_REGISTERS_PER_ITERATION / 2; + for (int i = 0; i < offset; ++i) { + x[i] = vaddq_f16(x[i], x[offset + i]); + } + offset /= 2; + for (int i = 0; i < offset; ++i) { + x[i] = vaddq_f16(x[i], x[offset + i]); + } + offset /= 2; + for (int i = 0; i < offset; ++i) { + x[i] = vaddq_f16(x[i], x[offset + i]); + } + const float32x4_t t0 = vcvt_f32_f16(vget_low_f16(x[0])); + const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); + return (double)vaddvq_f32(vaddq_f32(t0, t1)); + +} + +static inline float16x8_t f16_fma(float16x8_t a, float16x8_t b, float16x8_t c) { +#ifdef __ARM_FEATURE_FMA + return vfmaq_f16(a, b, c); +#else + return vaddq_f16(a, vmulq_f16(b, c)); +#endif +} + +// Rather than unrolling to process multiple rows (transposed columns) +// of matrix A at once as done in fp16_gemv_trans_fp16_arith, unroll +// along an individual dot product. +static void fp16_gemv_trans_fp16_arith_by_dot_products(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) { + parallel_for(0, n, 1, [&](int begin, int end) { + for (int i = begin; i < end; ++i) { + float16x8_t sum[F16_REGISTERS_PER_ITERATION] = {vdupq_n_f16(0)}; + float16x8_t ax[F16_REGISTERS_PER_ITERATION]; + float16x8_t ay[F16_REGISTERS_PER_ITERATION]; + + for (int j = 0; j < m; j += F16_ELEMENTS_PER_ITERATION) { + for (int k = 0; k < F16_REGISTERS_PER_ITERATION; ++k) { + ax[k] = vld1q_f16(x + j + k * F16_ELEMENTS_PER_REGISTER); + ay[k] = vld1q_f16(a + lda * i + j + k * F16_ELEMENTS_PER_REGISTER); + sum[k] = f16_fma(sum[k], ax[k], ay[k]); + } + } + // TODO: add a tail fixup so we don't have to have such a + // restrictive gate to enter this path. 
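
// At this point the four fp16 accumulators in sum[] hold the 32 partial
// lanes of the dot product between x and row i of a (one transposed
// column of A). The reduce(float16x8_t[4]) overload above collapses them
// pairwise in fp16 (4 -> 2 -> 1 vaddq_f16), then converts the surviving
// register's low and high halves to fp32 for the final horizontal add,
// so widening happens once per output element rather than once per lane.
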
+ y[i * incy] = reduce(sum); + } + }); +} static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) { parallel_for(0, n / 4, 1, [&](int begin, int end) { @@ -230,13 +311,13 @@ static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t for (auto j = 0; j < m; j += 8) { float16x8_t xVec = vld1q_f16(x + j); float16x8_t a0Vec = vld1q_f16(row0 + j); - sum0Vec = vaddq_f16(sum0Vec, vmulq_f16(a0Vec, xVec)); + sum0Vec = f16_fma(sum0Vec, a0Vec, xVec); float16x8_t a1Vec = vld1q_f16(row1 + j); - sum1Vec = vaddq_f16(sum1Vec, vmulq_f16(a1Vec, xVec)); + sum1Vec = f16_fma(sum1Vec, a1Vec, xVec); float16x8_t a2Vec = vld1q_f16(row2 + j); - sum2Vec = vaddq_f16(sum2Vec, vmulq_f16(a2Vec, xVec)); + sum2Vec = f16_fma(sum2Vec, a2Vec, xVec); float16x8_t a3Vec = vld1q_f16(row3 + j); - sum3Vec = vaddq_f16(sum3Vec, vmulq_f16(a3Vec, xVec)); + sum3Vec = f16_fma(sum3Vec, a3Vec, xVec); } y[(i + 0) * incy] = reduce(sum0Vec); y[(i + 1) * incy] = reduce(sum1Vec); @@ -245,6 +326,7 @@ static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t } }); } + #endif static inline float reduce(float32x4_t x) { @@ -252,6 +334,14 @@ static inline float reduce(float32x4_t x) { return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); } +static inline float32x4_t f32_fma(float32x4_t a, float32x4_t b, float32x4_t c) { +#ifdef __ARM_FEATURE_FMA + return vfmaq_f32(a, b, c); +#else + return vaddq_f32(a, vmulq_f32(b, c)); +#endif +} + static void fp16_gemv_trans_fp32_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) { parallel_for(0, n / 4, 1, [&](int begin, int end) { for (auto i = begin * 4 ; i < end * 4; i += 4) { @@ -266,13 +356,13 @@ static void fp16_gemv_trans_fp32_arith(const int m, const int n, const float16_t for (auto j = 0; j < m; j += 4) { float32x4_t xVec = vcvt_f32_f16(vld1_f16(x + j)); float32x4_t a0Vec = vcvt_f32_f16(vld1_f16(row0 + j)); - sum0Vec = vaddq_f32(sum0Vec, vmulq_f32(a0Vec, xVec)); + sum0Vec = f32_fma(sum0Vec, a0Vec, xVec); float32x4_t a1Vec = vcvt_f32_f16(vld1_f16(row1 + j)); - sum1Vec = vaddq_f32(sum1Vec, vmulq_f32(a1Vec, xVec)); + sum1Vec = f32_fma(sum1Vec, a1Vec, xVec); float32x4_t a2Vec = vcvt_f32_f16(vld1_f16(row2 + j)); - sum2Vec = vaddq_f32(sum2Vec, vmulq_f32(a2Vec, xVec)); + sum2Vec = f32_fma(sum2Vec, a2Vec, xVec); float32x4_t a3Vec = vcvt_f32_f16(vld1_f16(row3 + j)); - sum3Vec = vaddq_f32(sum3Vec, vmulq_f32(a3Vec, xVec)); + sum3Vec = f32_fma(sum3Vec, a3Vec, xVec); } y[(i + 0) * incy] = reduce(sum0Vec); y[(i + 1) * incy] = reduce(sum1Vec); @@ -295,11 +385,16 @@ void fp16_gemv_trans( const int incy) { if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && n % 4 == 0) { #ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC - return at::globalContext().allowFP16ReductionCPU() && m % 8 == 0 ? 
fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy) - : fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); -#else - return fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); + if (at::globalContext().allowFP16ReductionCPU()) { + if (m % 32 == 0 && n % 32 == 0) { + return fp16_gemv_trans_fp16_arith_by_dot_products(m, n, a, lda, x, y, incy); + } + if (m % 8 == 0) { + return fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy); + } + } #endif + return fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); } for (const auto i : c10::irange(n)) { float sum = 0; diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 686948584c72..10ab4a70f091 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -543,6 +543,11 @@ Tensor& slow_conv2d_forward_out_cpu( IntArrayRef padding, Tensor& output) { // See [Note: hacky wrapper removal for optional tensor] + + TORCH_CHECK(kernel_size.size() == 2, "2D kernel_size expected"); + TORCH_CHECK(stride.size() == 2, "2D stride expected"); + TORCH_CHECK(padding.size() == 2, "2D padding expected"); + c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index e6aa8493905d..c5f81e98906d 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -325,7 +325,7 @@ Tensor copy(const Tensor& self, const Tensor& src, bool non_blocking) { // Instead, generate an empty tensor with the right sizes/strides, since we should be able to assume // that copy_() will fully overwrite all data with that of src if (self_storage->nbytes() == 0) { - r = at::empty_strided(self.sizes(), self.strides()); + r = at::empty_strided(self.sizes(), self.strides(), self.options()); } else { r = clone_preserve_strides(self); } diff --git a/aten/src/ATen/native/FusedAdagrad.cpp b/aten/src/ATen/native/FusedAdagrad.cpp new file mode 100644 index 000000000000..1c5f553e6854 --- /dev/null +++ b/aten/src/ATen/native/FusedAdagrad.cpp @@ -0,0 +1,59 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif +namespace at { + +namespace native { + +void _fused_adagrad_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList state_sums, + at::TensorList state_steps, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + TORCH_CHECK(state_sums.size() == n_tensors); + TORCH_CHECK(state_steps.size() == n_tensors); + for (size_t i = 0; i < n_tensors; i++){ + fused_adagrad_stub( + kCPU, + params[i], + grads[i], + state_sums[i], + state_steps[i], + lr, + lr_decay, + weight_decay, + eps, + maximize, + grad_scale_ptr); + } +} + +DEFINE_DISPATCH(fused_adagrad_stub); + +} +} diff --git a/aten/src/ATen/native/FusedAdagrad.h b/aten/src/ATen/native/FusedAdagrad.h new file mode 100644 index 000000000000..395cbdd43aa8 --- /dev/null +++ b/aten/src/ATen/native/FusedAdagrad.h @@ -0,0 +1,23 @@ +#include +#include + +namespace at { + +namespace native { + +using fused_adagrad_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& state_sum, + const at::Tensor& state_step, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_adagrad_fn, fused_adagrad_stub); + +} +} diff --git a/aten/src/ATen/native/ReduceAllOps.cpp b/aten/src/ATen/native/ReduceAllOps.cpp index 34a4b58cbce0..2ac14a76fbc6 100644 --- a/aten/src/ATen/native/ReduceAllOps.cpp +++ b/aten/src/ATen/native/ReduceAllOps.cpp @@ -8,6 +8,7 @@ #include #include #else +#include #include #include #include @@ -65,4 +66,11 @@ Tensor& max_unary_out(const Tensor &self, Tensor& out) { return out; } +// DEPRECATED: Use at::aminmax instead +std::tuple _aminmax_all(const Tensor &self) { + TORCH_WARN_ONCE("_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." + " This warning will only appear once per process."); + return at::aminmax(self); +} + } // namespace at::native diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index cbb79dfabc7e..974ad302ca0c 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -20,6 +20,7 @@ #include #include #else +#include #include #include #include @@ -681,6 +682,13 @@ std::tuple qmin(const Tensor& self, int64_t dim, bool keepdim) { at::_make_per_tensor_quantized_tensor(min, self.q_scale(), self.q_zero_point()), min_indices); } +// DEPRECATED: Use at::aminmax instead +std::tuple _aminmax(const Tensor& self, int64_t dim, bool keepdim) { + TORCH_WARN_ONCE("_aminmax is deprecated as of PyTorch 1.11 and will be removed in a future release. Use aminmax instead." 
+ " This warning will only appear once per process."); + return at::aminmax(self, dim, keepdim); +} + TORCH_IMPL_FUNC(clamp_out) ( const Tensor& /*self*/, diff --git a/aten/src/ATen/native/TypeProperties.cpp b/aten/src/ATen/native/TypeProperties.cpp index 7091e4f78aef..4afc7619c2eb 100644 --- a/aten/src/ATen/native/TypeProperties.cpp +++ b/aten/src/ATen/native/TypeProperties.cpp @@ -191,8 +191,8 @@ ScalarType result_type(const Scalar& scalar1, const Scalar& scalar2) { return result_type(state); } -bool can_cast(const at::ScalarType from, const at::ScalarType to) { - return at::canCast(from, to); +bool can_cast(const at::ScalarType from_, const at::ScalarType to) { + return at::canCast(from_, to); } ScalarType promote_types(ScalarType type1, ScalarType type2) { diff --git a/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp b/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp new file mode 100644 index 000000000000..70085fde1e90 --- /dev/null +++ b/aten/src/ATen/native/cpu/FusedAdagradKernel.cpp @@ -0,0 +1,225 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +namespace at::native { + +namespace{ + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adagrad_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* state_sum_ptr, + const double clr, + const double eps, + const double weight_decay, + const bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + using lpVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + lpVec grad_vec_to_store; + fVec param_vec1, param_vec2; + fVec grad_vec1, grad_vec2; + fVec state_sum_vec1, state_sum_vec2; + int64_t d = 0; + for (; d < size - (size % lpVec::size()); d += lpVec::size()) { + lpVec param_lpvec = lpVec::loadu(param_ptr + d); + std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); + lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); + std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); + if (grad_scale_ptr) { + grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); + grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); + grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize){ + grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); + grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); + } + if (weight_decay != 0.0){ + grad_vec1 += param_vec1 * fVec(scalar_t(weight_decay)); + grad_vec2 += param_vec2 * fVec(scalar_t(weight_decay)); + } + std::tie(state_sum_vec1, state_sum_vec2) = vec::convert_to_float(lpVec::loadu(state_sum_ptr + d)); + state_sum_vec1 += grad_vec1 * grad_vec1; + state_sum_vec2 += grad_vec2 * grad_vec2; + vec::convert_from_float(state_sum_vec1, state_sum_vec2).store(state_sum_ptr + d); + + fVec std_vec1 = state_sum_vec1.sqrt() + fVec(scalar_t(eps)); + fVec std_vec2 = state_sum_vec2.sqrt() + fVec(scalar_t(eps)); + param_vec1 = param_vec1 - fVec(scalar_t(clr)) * grad_vec1 / std_vec1; + param_vec2 = param_vec2 - fVec(scalar_t(clr)) * grad_vec2 / std_vec2; + vec::convert_from_float(param_vec1, param_vec2).store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + opmath_t grad_val = grad_ptr[d]; + opmath_t param_val = param_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_val * 
opmath_t(weight_decay); + } + opmath_t state_sum_val = state_sum_ptr[d]; + state_sum_val += grad_val * grad_val; + state_sum_ptr[d] = state_sum_val; + opmath_t std_val = std::sqrt(state_sum_val) + opmath_t(eps); + param_val -= opmath_t(clr) * grad_val / std_val; + param_ptr[d] = param_val; + } +} + + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adagrad_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* state_sum_ptr, + const double clr, + const double eps, + const double weight_decay, + const bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + using Vec = at::vec::Vectorized; + Vec grad_vec_to_store; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec param_vec = Vec::loadu(param_ptr + d); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (grad_scale_ptr) { + grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); + grad_vec_to_store = grad_vec; + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); + if (weight_decay != 0.0){ + grad_vec += param_vec * Vec(scalar_t(weight_decay)); + } + + Vec sum_vec = Vec::loadu(state_sum_ptr + d) + grad_vec * grad_vec; + sum_vec.store(state_sum_ptr + d); + + Vec std_vec = sum_vec.sqrt() + Vec(scalar_t(eps)); + param_vec = param_vec - Vec(scalar_t(clr)) * grad_vec / std_vec; + param_vec.store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + scalar_t grad_val = grad_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_ptr[d] * scalar_t(weight_decay); + } + state_sum_ptr[d] += grad_val * grad_val; + + scalar_t std_val = std::sqrt(state_sum_ptr[d]) + scalar_t(eps); + param_ptr[d] -= scalar_t(clr) * grad_val / std_val; + } +} + +template +void adagrad_fused_step_impl( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& state_sum, + const at::Tensor& state_step, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + scalar_t* param_data = param.data_ptr(); + scalar_t* grad_data = grad.data_ptr(); + scalar_t* state_sum_data = state_sum.data_ptr(); + double step = state_step.item(); + double clr = lr / (1.0 + (step - 1.0) * lr_decay); + + constexpr size_t cache_line_size = 64; + constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); + size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); + + auto adagrad_fn = [&](int64_t begin, int64_t end) { + // local pointers + begin *= cache_line_aligned_task_unit; + end = std::min(end * cache_line_aligned_task_unit, param.numel()); + scalar_t* param_ptr = param_data + begin; + scalar_t* grad_ptr = grad_data + begin; + scalar_t* state_sum_ptr = state_sum_data + begin; + + const int64_t size = end - begin; + adagrad_math( + param_ptr, + grad_ptr, + state_sum_ptr, + clr, + eps, + weight_decay, + maximize, + grad_scale_ptr, + size + ); + }; + at::parallel_for( + 0, num_units, 0, adagrad_fn); +} + +void fused_adagrad_kernel( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& state_sum, + const at::Tensor& state_step, + const double lr, + const double lr_decay, + const double weight_decay, + const double eps, + const bool maximize, + 
const float* grad_scale_ptr + ) { + Tensor grad_contiguous = grad.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adagrad_kernel", [&] { + adagrad_fused_step_impl( + param, + grad, + state_sum, + state_step, + lr, + lr_decay, + weight_decay, + eps, + maximize, + grad_scale_ptr); + }); +} + +} + +REGISTER_DISPATCH(fused_adagrad_stub, &fused_adagrad_kernel); +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp index acb4b927f23f..2ffef25a10ff 100644 --- a/aten/src/ATen/native/cpu/int4mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -341,12 +341,46 @@ inline void tinygemm_kernel( #if !defined(C10_MOBILE) && defined(__aarch64__) #include -template -inline void tinygemm_kernel( - const Half* RESTRICT A, + +inline float32x4x2_t load_as_float32x4x2(const Half* ptr) { + float16x4x2_t f16_val = vld2_f16(reinterpret_cast(ptr)); + auto val_low = vcvt_f32_f16(f16_val.val[0]); + auto val_high = vcvt_f32_f16(f16_val.val[1]); + return {val_low, val_high}; +} + +inline void store_float32x4(Half* ptr, float32x4_t val) { + vst1_f16(reinterpret_cast(ptr), vcvt_f16_f32(val)); +} + +inline float32x4x2_t load_as_float32x4x2(const BFloat16* ptr) { + int32x4_t shift = vdupq_n_s32(16); + uint16x4x2_t u16_val = vld2_u16(reinterpret_cast(ptr)); + uint32x4_t int_low = vmovl_u16(u16_val.val[0]); + uint32x4_t int_high = vmovl_u16(u16_val.val[1]); + return {vreinterpretq_f32_u32(vshlq_u32(int_low, shift)), vreinterpretq_f32_u32(vshlq_u32(int_high, shift))}; +} + +inline void store_float32x4(BFloat16* ptr, float32x4_t val) { + int32x4_t shift = vdupq_n_s32(-16); + uint32x4_t uint32_val = vshlq_u32(vreinterpretq_u32_f32(val), shift); + vst1_u16(reinterpret_cast(ptr), vmovn_u32(uint32_val)); +} + +inline float32x4x2_t load_as_float32x4x2(const float* ptr) { + return vld2q_f32(ptr); +} + +inline void store_float32x4(float* ptr, float32x4_t val) { + vst1q_f32(ptr, val); +} + +template +inline void tinygemm_kernel_( + const T* RESTRICT A, const uint8_t* RESTRICT B, - const Half* RESTRICT ScaleAndZeros, - Half* RESTRICT C, + const T* RESTRICT ScaleAndZeros, + T* RESTRICT C, int lda, int ldb, int ldc, @@ -368,9 +402,9 @@ inline void tinygemm_kernel( if (is_block_start(k, BLOCK_K)) { int kb = k / BLOCK_K; c10::ForcedUnroll<4>{}([&](auto i) { - auto scales_and_zeros = vld2_f16(reinterpret_cast(ScaleAndZeros + kb * ldc * 2 + n * 2 + i * 8)); - scales[i] = vcvt_f32_f16(scales_and_zeros.val[0]); - zeros[i] = vcvt_f32_f16(scales_and_zeros.val[1]); + auto scales_and_zeros = load_as_float32x4x2(ScaleAndZeros + kb * ldc * 2 + n * 2 + i * 8); + scales[i] = scales_and_zeros.val[0]; + zeros[i] = scales_and_zeros.val[1]; }); } c10::ForcedUnroll<4>{}([&](auto i) { @@ -383,11 +417,53 @@ inline void tinygemm_kernel( }); } c10::ForcedUnroll<4>{}([&](auto i) { - vst1_f16(reinterpret_cast(C + m * ldc + n + i * 4), vcvt_f16_f32(c_val[i])); + store_float32x4(C + m * ldc + n + i * 4, c_val[i]); }); } } } + +template +inline void tinygemm_kernel( + const Half* RESTRICT A, + const uint8_t* RESTRICT B, + const Half* RESTRICT ScaleAndZeros, + Half* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); +} + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const uint8_t* RESTRICT B, + const BFloat16* RESTRICT ScaleAndZeros, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + 
tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); +} + +template +inline void tinygemm_kernel( + const float* RESTRICT A, + const uint8_t* RESTRICT B, + const float* RESTRICT ScaleAndZeros, + float* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + tinygemm_kernel_(A, B, ScaleAndZeros, C, lda, ldb, ldc, K, BLOCK_K); +} #endif template diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp index bd266030b256..d61a1933afc7 100644 --- a/aten/src/ATen/native/cpu/int8mm_kernel.cpp +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -250,10 +250,18 @@ inline void tinygemm_kernel_( }); } +#if __OPTIMIZE__ float32x4_t scale_val = load_as_float32x4(scales); c10::ForcedUnroll{}([&](auto i) { C[m * ldc + i] = reduce(c_val[i]) * vgetq_lane_f32(scale_val, i); }); +#else + // Workaround GCCs inability to infer lane index at compile time + // See https://github.com/pytorch/pytorch/issues/126283 + c10::ForcedUnroll{}([&](auto i) { + C[m * ldc + i] = reduce(c_val[i]) * float(scales[i]); + }); +#endif } } diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index c0ed650cf021..84c59a4fd0d7 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #endif @@ -988,6 +989,11 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, else #endif { +#if defined(USE_ROCM) && ROCM_VERSION >= 60200 + // hipBlasLT requires scaleD to be set to something in order to use AMAX + auto dummy_options = TensorOptions().dtype(kFloat).device(kCUDA); + auto dummy_scale = at::ones(1, dummy_options); +#endif at::cuda::blas::scaled_gemm( args.transa, args.transb, @@ -1005,15 +1011,19 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, bias ? bias->data_ptr(): nullptr, bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_, args.result->data_ptr(), +#if defined(USE_ROCM) && ROCM_VERSION >= 60200 + scale_result ? scale_result->data_ptr() : dummy_scale.data_ptr(), +#else scale_result ? 
scale_result->data_ptr() : nullptr, +#endif args.result_ld, out_dtype_, amax.data_ptr(), use_fast_accum); } -#if defined(USE_ROCM) - // rocm's hipblaslt does not yet support amax, so calculate separately +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && ROCM_VERSION < 60200 + // ROCm's hipBLASLt does not support amax before 6.2, so calculate separately amax = at::max(at::abs(out.to(kFloat))); #endif diff --git a/aten/src/ATen/native/cuda/FusedSgdKernel.cu b/aten/src/ATen/native/cuda/FusedSgdKernel.cu index 61da02ce0b88..e644b1048c9b 100644 --- a/aten/src/ATen/native/cuda/FusedSgdKernel.cu +++ b/aten/src/ATen/native/cuda/FusedSgdKernel.cu @@ -86,12 +86,8 @@ struct FusedSgdMathFunctor { init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; -#ifndef USE_ROCM const auto use_faster_load_store = (n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned; -#else - const auto use_faster_load_store{false}; -#endif if (use_faster_load_store) { for (auto i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; diff --git a/aten/src/ATen/native/mps/operations/Quantized.mm b/aten/src/ATen/native/mps/operations/Quantized.mm index 3c77ec67b42d..4d0f569ea062 100644 --- a/aten/src/ATen/native/mps/operations/Quantized.mm +++ b/aten/src/ATen/native/mps/operations/Quantized.mm @@ -12,6 +12,8 @@ #include #include +// #define _CAPTURE_KERNEL 1 + namespace at::native { using namespace mps; @@ -82,6 +84,85 @@ kernel void int4pack_mm( INSTANTIATE_INT4MM(bfloat, 128); INSTANTIATE_INT4MM(bfloat, 256); #endif + +template +struct Vec4Type {}; + +template<> +struct Vec4Type { + using type = float4; +}; + +template<> +struct Vec4Type { + using type = half4; +}; + +#if __METAL_VERSION__ >= 310 +template<> +struct Vec4Type { + using type = bfloat4; +}; +#endif + +template +kernel void +int8pack_mm(constant T *A [[buffer(0)]], constant char *B [[buffer(1)]], + constant T *scales [[buffer(2)]], + device T *outputData [[buffer(3)]], + constant int3 &sizes [[buffer(4)]], + uint2 group_index [[threadgroup_position_in_grid]], + uint2 threadgroup_index [[thread_position_in_threadgroup]]) { + using vecT = typename Vec4Type::type; + const uint lda = sizes.y; + const uint ldc = sizes.z; + int out_idx = (group_index.x * blockSize + threadgroup_index.x) * 4; + int n = out_idx % sizes.z; + int m = out_idx / sizes.z; + // Offset pointers + A += m * lda; + B += n * lda; + outputData += m *ldc; + + float4 rc = 0; + for (unsigned k = threadgroup_index.y * 4; k < sizes.y; k += 4 * blockSize) { + threadgroup_barrier(mem_flags::mem_none); + auto a_val = float4(*reinterpret_cast(A + k)); + float4x4 b_val; + for (int i = 0; i < 4; ++i) { + b_val[i] = float4(*reinterpret_cast(B + i * lda + k)); + } + rc += transpose(b_val) * a_val; + } + + // Accumulate results across SIMD group?
(8 threads using vec4) + threadgroup float4 tgp_memory[blockSize][blockSize]; + tgp_memory[threadgroup_index.x][threadgroup_index.y] = rc; + threadgroup_barrier(mem_flags::mem_threadgroup); + if (threadgroup_index.y == 0) { + for (int i = 1; i < blockSize; i++) { + rc += tgp_memory[threadgroup_index.x][i]; + } + *reinterpret_cast(outputData + n) = + vecT(rc * float4(*reinterpret_cast(scales + n))); + } +} + +#define INSTANTIATE_INT8MM(DTYPE) \ + template [[host_name("int8pack_mm_" #DTYPE)]] kernel void \ + int8pack_mm( \ + constant DTYPE * A [[buffer(0)]], constant char *B [[buffer(1)]], \ + constant DTYPE *scales [[buffer(2)]], \ + device DTYPE *outputData [[buffer(3)]], \ + constant int3 &sizes [[buffer(4)]], \ + uint2 group_index [[threadgroup_position_in_grid]], \ + uint2 threadgroup_index [[thread_position_in_threadgroup]]); + +INSTANTIATE_INT8MM(half); +INSTANTIATE_INT8MM(float); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_INT8MM(bfloat); +#endif )METAL_QUANTIZED"); Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupSize, const Tensor& qScaleAndZeros) { @@ -114,8 +195,7 @@ Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupS auto C = at::empty({M, N}, A.options()); MPSStream* mpsStream = getCurrentMPSStream(); - std::array sizes = {static_cast(M), static_cast(K), static_cast(N)}; - static bool firstCapture = false; + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { #if _CAPTURE_KERNEL @@ -163,7 +243,35 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, __func__, " : expect scales to be 1d tensor with size ", N); auto C = at::empty({M, N}, A.options()); - + TORCH_CHECK(N % 32 == 0 && K % 32 == 0); +#if 1 + MPSStream* mpsStream = getCurrentMPSStream(); + std::array sizes = {static_cast(M), static_cast(K), static_cast(N), 0}; + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCaptureEnabled()) { + getMPSProfiler().startCapture(fmt::format("int8pack_mm_{}x{}x{}", M, N, K), mpsStream); + } +#endif + id computeEncoder = mpsStream->commandEncoder(); + const std::string kernel = fmt::format("int8pack_mm_{}", scalarToMetalTypeString(A)); + id quantizedPSO = lib.getPipelineStateForFunc(kernel); + [computeEncoder setComputePipelineState:quantizedPSO]; + mtl_setBuffer(computeEncoder, A, 0); + mtl_setBuffer(computeEncoder, B, 1); + mtl_setBuffer(computeEncoder, scales, 2); + mtl_setBuffer(computeEncoder, C, 3); + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + [computeEncoder dispatchThreads:MTLSizeMake(M * N / 4, 8, 1) threadsPerThreadgroup:MTLSizeMake(8, 8, 1)]; +#if _CAPTURE_KERNEL + if (getMPSProfiler().isCapturing()) { + getMPSProfiler().stopCapture(mpsStream); + } +#endif + } + }); +#else struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} MPSGraphTensor *ATensor = nil, *BTensor = nil, *scalesTensor = nil; @@ -193,6 +301,7 @@ Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& s dictionaryFromPlaceholders(APlaceholder, BPlaceholder, scalesPlaceholder), outputPlaceholder); } +#endif return C; } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 1ea973f93261..10d8b1ad79ca 100644 --- 
a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -3762,6 +3762,18 @@ # This function should be deprecated in favor of differential_analytic_matrix_function in FunctionsManual.cpp - func: matrix_exp_backward(Tensor self, Tensor grad) -> Tensor +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax(Tensor self) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax_all + autogen: _aminmax.out + +# DEPRECATED: Use torch.aminmax instead +- func: _aminmax.dim(Tensor self, int dim, bool keepdim=False) -> (Tensor, Tensor) + dispatch: + CPU, CUDA: _aminmax + autogen: _aminmax.dim_out + - func: aminmax(Tensor self, *, int? dim=None, bool keepdim=False) -> (Tensor min, Tensor max) device_check: NoCheck # TensorIterator structured_delegate: aminmax.out @@ -7702,7 +7714,7 @@ - func: result_type.Scalar_Scalar(Scalar scalar1, Scalar scalar2) -> ScalarType -- func: can_cast(ScalarType from, ScalarType to) -> bool +- func: can_cast(ScalarType from_, ScalarType to) -> bool variants: function - func: promote_types(ScalarType type1, ScalarType type2) -> ScalarType @@ -14708,13 +14720,13 @@ CUDA: _scaled_dot_product_cudnn_attention_backward_cuda tags: nondeterministic_seeded -- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) +- func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) variants: function dispatch: CUDA: _flash_attention_forward tags: nondeterministic_seeded -- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) +- func: _flash_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None, SymInt? window_size_left=None, SymInt? window_size_right=None) -> (Tensor, Tensor, Tensor) device_check: NoCheck variants: function dispatch: @@ -15527,7 +15539,6 @@ CPU: foobar autogen: _foobar.out -# Fused Optimizer CUDA kernels. - func: _fused_adam_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] exp_avgs, Tensor(d!)[] exp_avg_sqs, Tensor(e!)[] max_exp_avg_sqs, Tensor[] state_steps, *, float lr, float beta1, float beta2, float weight_decay, float eps, bool amsgrad, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). 
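
# The _fused_adagrad_ entry added below is backed by the new CPU kernel in
# FusedAdagradKernel.cpp. Per tensor, with step read from state_steps, it
# applies (an eager-mode sketch of that kernel's math, not a separate API):
#   clr = lr / (1 + (step - 1) * lr_decay)
#   grad = grad / grad_scale              # unscaled value is stored back
#   grad = -grad                          # when maximize
#   grad = grad + weight_decay * param    # when weight_decay != 0
#   state_sum += grad * grad
#   param -= clr * grad / (sqrt(state_sum) + eps)
# The whole step is skipped when found_inf is set, matching the early
# return in _fused_adagrad_kernel_cpu_.
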
variants: function @@ -15582,6 +15593,12 @@ CUDA: _fused_sgd_kernel_cuda_ autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out +- func: _fused_adagrad_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] state_sums, Tensor(d!)[] state_steps, *, float lr, float lr_decay, float weight_decay, float eps, bool maximize, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + variants: function + dispatch: + CPU: _fused_adagrad_kernel_cpu_ + autogen: _fused_adagrad, _fused_adagrad.out + # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. - func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function diff --git a/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu b/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu index c28f095bb907..d75a10c0db89 100644 --- a/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu +++ b/aten/src/ATen/native/quantized/cuda/FusedObsFakeQuant.cu @@ -7,7 +7,7 @@ #ifndef AT_PER_OPERATOR_HEADERS #include #else -#include +#include #include #include #include @@ -148,7 +148,7 @@ void _calculate_moving_average( cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); if (per_row_fq) { - std::tie(x_min, x_max) = at::aminmax(x, 1); + std::tie(x_min, x_max) = at::_aminmax(x, 1); float* x_min_data = x_min.data_ptr(); float* x_max_data = x_max.data_ptr(); int num_threads = std::min(size, (int64_t)512); @@ -165,7 +165,7 @@ void _calculate_moving_average( size); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { - std::tie(x_min, x_max) = at::aminmax(x); + std::tie(x_min, x_max) = at::_aminmax(x); float* x_min_data = x_min.data_ptr(); float* x_max_data = x_max.data_ptr(); // Moving Average Min/Max observer for activations diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu index e55560791a08..8207735e9677 100644 --- a/aten/src/ATen/native/transformers/cuda/attention.cu +++ b/aten/src/ATen/native/transformers/cuda/attention.cu @@ -841,7 +841,9 @@ _flash_attention_forward( double dropout_p, bool is_causal, bool return_debug_mask, - std::optional scale) { + std::optional scale, + std::optional window_size_left, + std::optional window_size_right) { #if defined(USE_FLASH_ATTENTION) const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked(); @@ -852,6 +854,9 @@ _flash_attention_forward( std::optional seqused_k = c10::nullopt; std::optional alibi_slopes = c10::nullopt; + const int non_null_window_left = window_size_left.has_value() ? window_size_left.value() : -1; + const int non_null_window_right = window_size_right.has_value() ? window_size_right.value() : -1; + // We are going to have two paths: // 1. The standard MHA path for dense tensors // 2. 
diff --git a/aten/src/ATen/native/transformers/cuda/attention.cu b/aten/src/ATen/native/transformers/cuda/attention.cu
index e55560791a08..8207735e9677 100644
--- a/aten/src/ATen/native/transformers/cuda/attention.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention.cu
@@ -841,7 +841,9 @@ _flash_attention_forward(
     double dropout_p,
     bool is_causal,
     bool return_debug_mask,
-    std::optional<double> scale) {
+    std::optional<double> scale,
+    std::optional<int64_t> window_size_left,
+    std::optional<int64_t> window_size_right) {
 #if defined(USE_FLASH_ATTENTION)
   const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
@@ -852,6 +854,9 @@
   std::optional<Tensor> seqused_k = c10::nullopt;
   std::optional<Tensor> alibi_slopes = c10::nullopt;

+  const int non_null_window_left = window_size_left.has_value() ? window_size_left.value() : -1;
+  const int non_null_window_right = window_size_right.has_value() ? window_size_right.value() : -1;
+
   // We are going to have two paths:
   // 1. The standard MHA path for dense tensors
   // 2. The Varseqlen path
@@ -886,8 +891,8 @@ _flash_attention_forward(
         softmax_scale,
         false /*zero_tensors*/,
         is_causal,
-        -1, /*window_size_left*/
-        -1, /*window_size_right*/
+        non_null_window_left,
+        non_null_window_right,
         return_debug_mask,
         c10::nullopt /*gen_*/);
   } else {
@@ -909,8 +914,8 @@
         dropout_p,
         softmax_scale,
         is_causal,
-        -1, /*window_size_left*/
-        -1, /*window_size_right*/
+        non_null_window_left,
+        non_null_window_right,
         return_debug_mask, /*return_softmax (this is used for testing)*/
         c10::nullopt);
   }
diff --git a/aten/src/ATen/native/transformers/cuda/attention_backward.cu b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
index 78c2d54fdc8a..690f433aa5f2 100644
--- a/aten/src/ATen/native/transformers/cuda/attention_backward.cu
+++ b/aten/src/ATen/native/transformers/cuda/attention_backward.cu
@@ -66,13 +66,18 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_backward(
     bool is_causal,
     const Tensor& philox_seed,
     const Tensor& philox_offset,
-    std::optional<double> scale) {
+    std::optional<double> scale,
+    std::optional<int64_t> window_size_left,
+    std::optional<int64_t> window_size_right) {
 #if defined(USE_FLASH_ATTENTION)
   const auto softmax_scale = sdp::calculate_scale(query, scale).as_float_unchecked();
   // CUDA code assumes that dout is contiguous
   auto contiguous_grad_out = grad_out.contiguous();
   auto contiguous_out = out.contiguous();
+  const int non_null_window_left = window_size_left.has_value() ? window_size_left.value() : -1;
+  const int non_null_window_right = window_size_right.has_value() ? window_size_right.value() : -1;
+
   std::optional<Tensor> dq{c10::nullopt};
   std::optional<Tensor> dk{c10::nullopt};
   std::optional<Tensor> dv{c10::nullopt};
@@ -118,8 +123,8 @@ std::tuple<Tensor, Tensor, Tensor> _flash_attention_backward(
         softmax_scale,
         false /*zero_tensors*/,
         is_causal,
-        -1, /*window_size_left*/
-        -1, /*window_size_right*/
+        non_null_window_left,
+        non_null_window_right,
         determinisitic,
         philox_seed,
         philox_offset);
@@ -140,8 +145,8 @@
         dropout_p,
         softmax_scale,
         is_causal,
-        -1, /*window_size_left*/
-        -1, /*window_size_right*/
+        non_null_window_left,
+        non_null_window_right,
         determinisitic,
         philox_seed,
         philox_offset);
diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp
index 687691a370bf..e41d3d3d6abe 100644
--- a/aten/src/ATen/test/vulkan_api_test.cpp
+++ b/aten/src/ATen/test/vulkan_api_test.cpp
@@ -788,7 +788,7 @@ TEST_F(VulkanAPITest, avg_pool2d) {
   ASSERT_TRUE(check);
 }

-TEST_F(VulkanAPITest, batch_norm_invalid_inputs) {
+TEST_F(VulkanAPITest, DISABLED_batch_norm_invalid_inputs) {
   c10::InferenceMode mode;

   // Act: Vulkan batchnorm only supports evaluation mode
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
index 5b5646e85487..20fb340690ac 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_inference.csv
@@ -150,7 +150,7 @@ hf_Bert_large,pass,0

-hf_BigBird,pass,0
+hf_BigBird,pass,46
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
index 0dd9ce3482f4..5131c2e9ade4 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_eager_torchbench_training.csv
@@ -98,7 +98,7 @@ hf_Bert_large,pass,6

-hf_BigBird,pass,6
+hf_BigBird,pass,52
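Stepping back to the attention changes above: the new window_size_left/window_size_right arguments thread FlashAttention's sliding-window support through the ATen wrappers, and an unset value is encoded as -1, which the kernel treats as an unbounded window. A pure-Python sketch of that defaulting convention, mirroring the non_null_window_* logic in the hunks above:

def resolve_window_sizes(window_size_left=None, window_size_right=None):
    # None means "no limit"; the FlashAttention kernels expect -1 in that case.
    left = window_size_left if window_size_left is not None else -1
    right = window_size_right if window_size_right is not None else -1
    return left, right

assert resolve_window_sizes() == (-1, -1)        # dense/global attention, the old behavior
assert resolve_window_sizes(128, 0) == (128, 0)  # e.g. a causal sliding window over 128 keys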
diff --git a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
index 3e0af614a38c..40382a4f277c 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/aot_inductor_torchbench_inference.csv
@@ -138,7 +138,7 @@ hf_Bert_large,pass,0

-hf_BigBird,fail_accuracy,0
+hf_BigBird,fail_to_run,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
index 07bbe765f161..431a91d10669 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_inference.csv
@@ -150,7 +150,7 @@ hf_Bert_large,pass,0

-hf_BigBird,fail_to_run,0
+hf_BigBird,pass,46
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
index 80035c453fbf..1e1a4be4149e 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_aot_eager_torchbench_training.csv
@@ -98,7 +98,7 @@ hf_Bert_large,pass,6

-hf_BigBird,fail_to_run,3
+hf_BigBird,pass,52
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
index 07bbe765f161..f652e5ffa91a 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_inference.csv
@@ -150,7 +150,7 @@ hf_Bert_large,pass,0

-hf_BigBird,fail_to_run,0
+hf_BigBird,fail_accuracy,46
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
index eb1195caa9a1..ee58808c0bb0 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_inductor_torchbench_training.csv
@@ -98,7 +98,7 @@ hf_Bert_large,pass,6

-hf_BigBird,fail_to_run,3
+hf_BigBird,pass,52
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
index 5b5646e85487..20fb340690ac 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_inference.csv
@@ -150,7 +150,7 @@ hf_Bert_large,pass,0

-hf_BigBird,pass,0
+hf_BigBird,pass,46
diff --git a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
index 0dd9ce3482f4..cfc524426644 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamo_eager_torchbench_training.csv
@@ -98,7 +98,7 @@ hf_Bert_large,pass,6

-hf_BigBird,pass,6
+hf_BigBird,pass,52
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
index 4ced1b19f245..108bc6543aa9 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv
@@ -150,7 +150,7 @@ hf_Bert_large,pass,0

-hf_BigBird,fail_accuracy,0
+hf_BigBird,fail_accuracy,46
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
index 0dd9ce3482f4..cfc524426644 100644
--- a/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_training.csv
@@ -98,7 +98,7 @@ hf_Bert_large,pass,6

-hf_BigBird,pass,6
+hf_BigBird,pass,52
diff --git a/benchmarks/dynamo/common.py b/benchmarks/dynamo/common.py
index 1987a60f64fb..6ea7a31a3915 100644
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@@ -108,6 +108,7 @@
 current_onnx_compiler = ""
 current_batch_size = None
 output_filename = None
+disable_output = False

 MAX_DOWNLOAD_ATTEMPTS = 5
@@ -306,6 +307,9 @@ def load_model_from_path(path_and_class_str):

 def output_csv(filename, headers, row):
+    global disable_output
+    if disable_output:
+        return
     if os.path.exists(filename):
         with open(filename) as fd:
             lines = list(csv.reader(fd)) or [[]]
@@ -350,6 +354,24 @@ def deterministic_torch_manual_seed(*args, **kwargs):
     torch.manual_seed = deterministic_torch_manual_seed


+def empty_gpu_cache(device):
+    """
+    Explicitly empty gpu cache to avoid OOM in subsequent run.
+    """
+
+    if device not in ["cuda", "xpu"]:
+        log.warning(
+            "Trying to call the empty_gpu_cache for device: %s, which is not in list [cuda, xpu]",
+            device,
+        )
+        return
+
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    elif device == "xpu":
+        torch.xpu.empty_cache()
+
+
 def synchronize():
     pass
@@ -1230,7 +1252,7 @@ def wrapper(self, *args, **kwargs) -> Any:
                     )
                     time.sleep(wait)
             else:
-                raise RuntimeError(  # noqa: TRY200
+                raise RuntimeError(  # noqa: B904
                     f"Failed to load model '{args}' with following error(s): {str(e)}."
                 )
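The empty_gpu_cache helper above generalizes the hard-coded torch.cuda.empty_cache() calls that the hunks below replace, so the same benchmark code also releases cached memory on Intel XPU. Usage follows directly from the definition; a short sketch:

empty_gpu_cache("cuda")  # frees cached blocks via torch.cuda.empty_cache()
empty_gpu_cache("xpu")   # same idea via torch.xpu.empty_cache()
empty_gpu_cache("cpu")   # not a listed device: logs a warning and does nothing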
@@ -2274,7 +2296,7 @@ def decay_batch_exp(self, batch_size, factor=0.5, divisor=2):
     def batch_size_finder(self, device, model_name, initial_batch_size=1024):
         batch_size = initial_batch_size
         while batch_size >= 1:
-            torch.cuda.empty_cache()
+            empty_gpu_cache(current_device)
             try:
                 device, name, model, example_inputs, _ = self.load_model(
                     device,
@@ -2464,7 +2486,7 @@ def record_status(accuracy_status, dynamo_start_stats):
                 fp64_outputs = None
             finally:
                 del model_fp64, inputs_fp64
-                torch.cuda.empty_cache()
+                empty_gpu_cache(current_device)

         tolerance, cos_similarity = self.get_tolerance_and_cosine_flag(
             self.args.training, current_device, name
@@ -2493,7 +2515,7 @@
                 return record_status(accuracy_status, dynamo_start_stats=start_stats)
             finally:
                 del model_copy
-                torch.cuda.empty_cache()
+                empty_gpu_cache(current_device)

         # Rerun native pytorch
         reset_rng_state()
@@ -2514,7 +2536,7 @@
                 return record_status(accuracy_status, dynamo_start_stats=start_stats)
             finally:
                 del model_copy
-                torch.cuda.empty_cache()
+                empty_gpu_cache(current_device)

         # Two eager runs should have exactly same result
         is_same = True
@@ -2715,7 +2737,7 @@ def warmup(fn, model, example_inputs, mode, niters=5):
     try:
         if current_device == "cuda":
             torch.cuda.reset_peak_memory_stats()
-            torch.cuda.empty_cache()
+            empty_gpu_cache(current_device)
         t0 = time.perf_counter()
         for _ in range(niters):
             fn(model, example_inputs)
@@ -2945,7 +2967,7 @@ def run_one_model(
             name, model, example_inputs, optimize_ctx, experiment, tag
         )
         print(status)
-        torch.cuda.empty_cache()
+        empty_gpu_cache(current_device)

         self.maybe_preserve_compile_debug(name, status)
@@ -3212,6 +3234,11 @@ def get_example_inputs(self):
         "--output-directory",
         help="Overrides the directory to place output files.",
     )
+    parser.add_argument(
+        "--disable-output",
+        action="store_true",
+        help="Disable writing of output files, e.g., for warm-up runs",
+    )
     parser.add_argument(
         "--baseline",
         help="Compare with a prior --output",
@@ -3391,6 +3418,7 @@
     )
     group_latency.add_argument(
         "--warm-start-latency",
+        "--warm_start_latency",
         action="store_true",
         help="Run model(s) twice and preserve caches in between to enable a 'warm start' on the 2nd run",
     )
@@ -3610,10 +3638,11 @@ def main(runner, original_dir=None, args=None):
             cmd = [sys.executable] + sys.argv
             cmd.remove("--warm-start-latency")

-            print(f"Executing cold-start run for {args.only}")
-            subprocess.check_call(cmd, timeout=args.timeout, env=env)
+            print(f"Performing cold-start run for {args.only}")
+            warmup_cmd = cmd + ["--repeat=1", "--disable-output"]
+            subprocess.check_call(warmup_cmd, timeout=args.timeout, env=env)

-            print(f"Executing warm-start run for {args.only}")
+            print(f"Performing warm-start run for {args.only}")
             subprocess.check_call(cmd, timeout=args.timeout, env=env)
         else:
             # single process path just uses the main process
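The warm-start path above launches the benchmark twice in subprocesses: a cold run with --repeat=1 and the new --disable-output flag, so compilation caches get populated without writing CSV rows, followed by the measured warm run. A condensed sketch of that control flow, assuming cmd is the original command line with --warm-start-latency already stripped, exactly as in the hunk above:

import subprocess

def warm_start_latency(cmd, timeout, env):
    # Cold run: populate caches (e.g. Inductor's FX graph cache), suppress output.
    subprocess.check_call(cmd + ["--repeat=1", "--disable-output"], timeout=timeout, env=env)
    # Warm run: identical command; this one hits the caches and writes results.
    subprocess.check_call(cmd, timeout=timeout, env=env)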
@@ -3666,7 +3695,7 @@ def run(runner, args, original_dir=None):
     if args.ci:
         if args.accuracy:
             # Run fewer iterations when checking accuracy
-            args.repeat = 2
+            args.repeat = min(args.repeat, 2)

             # Set translation validation on by default on CI accuracy runs.
             torch.fx.experimental._config.translation_validation = True
@@ -3820,9 +3849,12 @@ def run(runner, args, original_dir=None):
         runner.skip_models.clear()

     experiment = null_experiment
-    global current_name, current_device, current_batch_size, output_filename, optimize_ctx, current_onnx_compiler
+    global current_name, current_device, current_batch_size, output_filename, disable_output, optimize_ctx, current_onnx_compiler
     optimize_ctx = contextlib.nullcontext()

+    if args.disable_output:
+        disable_output = True
+
     if args.overhead:
         optimize_ctx = torch._dynamo.optimize(dummy_fx_compile, nopython=args.nopython)
         experiment = speedup_experiment
diff --git a/benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv b/benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
index d3359f5c2f6f..f2f8c1b26176 100644
--- a/benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
+++ b/benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
@@ -1,12 +1,22 @@
-#name,data_type,shape,wrapper,perf_speedup_target_c5_12xlarge
-#timm_vision_transformer,float32,static,default,1.1585628
-phlippe_densenet,float32,static,default,1.99590617
-basic_gnn_gcn,float32,dynamic,default,1.24639561
-llama_v2_7b_16h,float32,dynamic,default,1.27455818
-resnet50,float32,dynamic,default,2.28794694
-timm_efficientnet,float32,static,cpp,2.72195686
-mobilenet_v3_large,float32,static,cpp,3.02274304
-timm_resnest,float32,dynamic,cpp,2.10118744
-shufflenet_v2_x1_0,float32,dynamic,cpp,1.8976929
-#hf_GPT2,float32,dynamic,cpp,1.6702305
-hf_GPT2,float32,dynamic,cpp,1.1183002
+#name,data_type,shape,wrapper,perf_speedup_target_c7i_metal_24xl
+#timm_vision_transformer,float32,static,default,1.039510755
+phlippe_densenet,float32,static,default,1.3988316
+basic_gnn_gcn,float32,dynamic,default,1.074576405
+llama_v2_7b_16h,float32,dynamic,default,1.211740245
+resnet50,float32,dynamic,default,1.65984261
+timm_efficientnet,float32,static,cpp,2.271561735
+mobilenet_v3_large,float32,static,cpp,2.63375628
+timm_resnest,float32,dynamic,cpp,1.67998548
+pyhpc_turbulent_kinetic_energy,float32,dynamic,cpp,1.59968463
+#hf_GPT2,float32,dynamic,cpp,
+hf_GPT2,float32,dynamic,cpp,1.379885175
+resnext50_32x4d,amp,static,default,1.461687045
+vgg16,amp,static,default,1.267194285
+hf_Longformer,amp,dynamic,default,0.997006035
+hf_Bert_large,amp,dynamic,default,0.99391146
+llama,amp,static,default,1.32950568
+timm_regnet,amp,static,cpp,1.157188305
+lennard_jones,amp,static,cpp,2.240104485
+hf_T5_generate,amp,dynamic,cpp,1.447656135
+timm_vovnet,amp,dynamic,cpp,1.07856471
+mobilenet_v2,amp,dynamic,cpp,2.27774577
diff --git a/benchmarks/dynamo/huggingface.py b/benchmarks/dynamo/huggingface.py
index d6014706479e..a998d10bf33c 100755
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@@ -15,6 +15,10 @@
 log = logging.getLogger(__name__)

+# Enable FX graph caching
+if "TORCHINDUCTOR_FX_GRAPH_CACHE" not in os.environ:
+    torch._inductor.config.fx_graph_cache = True
+

 def pip_install(package):
     subprocess.check_call([sys.executable, "-m", "pip", "install", package])
diff --git a/benchmarks/dynamo/timm_models.py b/benchmarks/dynamo/timm_models.py
index ed5132001827..1d291e8d1d75 100755
--- a/benchmarks/dynamo/timm_models.py
+++ b/benchmarks/dynamo/timm_models.py
@@ -13,6 +13,10 @@
 from torch._dynamo.testing import collect_results, reduce_to_scalar_loss
 from torch._dynamo.utils import clone_inputs

+# Enable FX graph caching
+if "TORCHINDUCTOR_FX_GRAPH_CACHE" not in os.environ:
+    torch._inductor.config.fx_graph_cache
= True + def pip_install(package): subprocess.check_call([sys.executable, "-m", "pip", "install", package]) diff --git a/benchmarks/transformer/score_mod.py b/benchmarks/transformer/score_mod.py index 0e9e8d11a35b..57088c45f8a0 100644 --- a/benchmarks/transformer/score_mod.py +++ b/benchmarks/transformer/score_mod.py @@ -1,8 +1,9 @@ +import argparse import itertools from collections import defaultdict from dataclasses import asdict, dataclass from functools import partial -from typing import Callable, List +from typing import Callable, List, Optional, Tuple import numpy as np import torch @@ -28,28 +29,32 @@ def benchmark_torch_function_in_microseconds(func: Callable, *args, **kwargs) -> @dataclass(frozen=True) class ExperimentConfig: - batch_size: int - num_heads: int - q_seq_len: int - k_seq_len: int - head_dim: int + shape: Tuple[int] score_mod: Callable dtype: torch.dtype + calculate_bwd_time: bool + + def __post_init__(self): + assert len(self.shape) == 4, "Shape must be of length 4" def asdict(self): - return asdict(self) + # Convert the dataclass instance to a dictionary + d = asdict(self) + # Remove the 'calculate_bwd_time' key + d.pop("calculate_bwd_time", None) + return d @dataclass(frozen=True) -class ExperimentResults: +class Times: eager_time: float compiled_time: float - def get_entries(self) -> List: - return [ - f"{self.eager_time:2f}", - f"{self.compiled_time:2f}", - ] + +@dataclass(frozen=True) +class ExperimentResults: + fwd_times: Times + bwd_times: Optional[Times] @dataclass(frozen=True) @@ -57,29 +62,31 @@ class Experiment: config: ExperimentConfig results: ExperimentResults - def get_entries(self) -> List: - return self.config.get_entries() + self.results.get_entries() - def asdict(self): - dict1 = asdict(self.config) + dict1 = self.config.asdict() dict2 = asdict(self.results) return {**dict1, **dict2} def generate_inputs( - batch_size, - num_heads, - q_sequence_length, - kv_sequence_length, - head_dim, - dtype, - device, + batch_size: int, + num_heads: int, + q_sequence_length: int, + kv_sequence_length: int, + head_dim: int, + dtype: torch.dtype, + device: torch.device, + requires_grad: bool, ): q_shape = (batch_size, q_sequence_length, num_heads * head_dim) kv_shape = (batch_size, kv_sequence_length, num_heads * head_dim) - make_q = partial(torch.rand, q_shape, device=device, dtype=dtype) - make_kv = partial(torch.rand, kv_shape, device=device, dtype=dtype) + make_q = partial( + torch.rand, q_shape, device=device, dtype=dtype, requires_grad=requires_grad + ) + make_kv = partial( + torch.rand, kv_shape, device=device, dtype=dtype, requires_grad=requires_grad + ) query = ( make_q() .view(batch_size, q_sequence_length, num_heads, head_dim) @@ -98,22 +105,24 @@ def generate_inputs( return query, key, value -def run_single_experiment(config: ExperimentConfig) -> ExperimentResults: +def run_single_experiment(config: ExperimentConfig, dynamic=False) -> ExperimentResults: device = torch.device("cuda") + batch_size, num_heads, q_seq_len, head_dim = config.shape query, key, value = generate_inputs( - config.batch_size, - config.num_heads, - config.q_seq_len, - config.k_seq_len, - config.head_dim, + batch_size, + num_heads, + q_seq_len, + q_seq_len, + head_dim, config.dtype, device, + requires_grad=config.calculate_bwd_time, ) def eager_sdpa(query, key, value, _): return F.scaled_dot_product_attention(query, key, value) - compiled_sdpa = torch.compile(_flex_attention) + compiled_sdpa = torch.compile(_flex_attention, dynamic=dynamic) score_mod = config.score_mod @@ -124,23 
+133,47 @@ def eager_sdpa(query, key, value, _): compiled_sdpa, query, key, value, score_mod ) - return ExperimentResults( - eager_time=forward_eager_time, - compiled_time=forward_compiled_time, - ) + if config.calculate_bwd_time: + out_eager = eager_sdpa(query, key, value, score_mod) + dOut = torch.randn_like(out_eager) + backward_eager_time = benchmark_torch_function_in_microseconds( + out_eager.backward, dOut, retain_graph=True + ) + out_compile = compiled_sdpa(query, key, value, score_mod) + dOut = torch.randn_like(out_eager) + backward_compile_time = benchmark_torch_function_in_microseconds( + out_compile.backward, dOut, retain_graph=True + ) + + return ExperimentResults( + fwd_times=Times(forward_eager_time, forward_compiled_time), + bwd_times=Times(backward_eager_time, backward_compile_time), + ) + else: + return ExperimentResults( + fwd_times=Times(forward_eager_time, forward_compiled_time), + bwd_times=None, + ) -def calculate_speedup(results: ExperimentResults) -> float: - return results.eager_time / results.compiled_time + +def calculate_speedup(results: ExperimentResults, type: str) -> float: + if type == "fwd": + return results.fwd_times.eager_time / results.fwd_times.compiled_time + elif type == "bwd": + assert results.bwd_times is not None + return results.bwd_times.eager_time / results.bwd_times.compiled_time + else: + raise ValueError(f"Invalid type {type}") def get_func_name(func): return func.__name__.split(".")[-1].split(" at ")[0] -def get_average_speedups(results: List[Experiment]): +def get_average_speedups(results: List[Experiment], type: str): # Calculate speedups - speedups = [calculate_speedup(r.results) for r in results] + speedups = [calculate_speedup(r.results, type) for r in results] # Find indices of max and min speedups max_speedup_index = np.argmax(speedups) @@ -176,20 +209,39 @@ def print_results(results: List[Experiment]): table_data = defaultdict(list) for experiment in results: for key, value in experiment.asdict().items(): - if key == "eager_time" or key == "compiled_time": - value = float(value) - table_data[key].append(value) + if key == "fwd_times": + for name, time in value.items(): + table_data[f"fwd_{name}"].append(float(time)) + elif key == "bwd_times": + if experiment.config.calculate_bwd_time: + for name, time in value.items(): + table_data[f"bwd_{name}"].append(float(time)) + else: + table_data[key].append(value) # Calculate speedups - speedups = [calculate_speedup(r.results) for r in results] - table_data["speedup"] = speedups + fwd_speedups = [calculate_speedup(r.results, type="fwd") for r in results] + table_data["fwd_speedup"] = fwd_speedups + if results[0].config.calculate_bwd_time: + bwd_speedups = [calculate_speedup(r.results, type="bwd") for r in results] + table_data["bwd_speedup"] = bwd_speedups table_data["score_mod"] = [get_func_name(func) for func in table_data["score_mod"]] print(tabulate(table_data, headers="keys", tablefmt="github", floatfmt=".3f")) - average_data = get_average_speedups(results) + print("\n") + print("FWD Speedups".center(125, "=")) + print("\n") + average_data = get_average_speedups(results, type="fwd") print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f")) + if results[0].config.calculate_bwd_time: + print("\n") + print("BWD Speedups".center(125, "=")) + print("\n") + average_data = get_average_speedups(results, type="bwd") + print(tabulate(average_data, headers="keys", tablefmt="github", floatfmt=".3f")) + def generate_score_mods() -> List[Callable]: def noop(score, b, h, m, n): 
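With timings now split into forward and backward legs, calculate_speedup takes a type selector and print_results emits separate fwd_*/bwd_* columns plus two summary tables. A quick sketch of how the pieces defined above compose (the numbers are made up):

r = ExperimentResults(
    fwd_times=Times(eager_time=120.0, compiled_time=80.0),
    bwd_times=Times(eager_time=300.0, compiled_time=240.0),
)
assert calculate_speedup(r, type="fwd") == 1.5   # eager_time / compiled_time, forward
assert calculate_speedup(r, type="bwd") == 1.25  # only valid when bwd_times is set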
@@ -207,8 +259,8 @@ def head_bias(score, b, h, m, n):
     return [noop, causal_mask, relative_bias, head_bias]

-def generate_experiment_configs() -> List[ExperimentConfig]:
-    batch_sizes = [1, 8, 16]
+def generate_experiment_configs(calculate_bwd: bool) -> List[ExperimentConfig]:
+    batch_sizes = [2, 8, 16]
     num_heads = [16]
     q_kv_seq_lens = [(512, 512), (1024, 1024), (4096, 4096)]
     head_dims = [64, 128, 256]
@@ -227,31 +279,49 @@ def generate_experiment_configs() -> List[ExperimentConfig]:
     ) in itertools.product(
         batch_sizes, num_heads, q_kv_seq_lens, head_dims, score_mods, dtypes
     ):
+        assert q_seq_len == kv_seq_len, "Only equal length inputs supported for now."
         all_configs.append(
             ExperimentConfig(
-                batch_size=bsz,
-                num_heads=n_heads,
-                q_seq_len=q_seq_len,
-                k_seq_len=kv_seq_len,
-                head_dim=head_dim,
+                shape=(bsz, n_heads, q_seq_len, head_dim),
                 score_mod=score_mod,
                 dtype=dtype,
+                calculate_bwd_time=calculate_bwd,
             )
         )

     return all_configs


-def main():
+def main(dynamic: bool, calculate_bwd: bool):
     seed = 123
     np.random.seed(seed)
     torch.manual_seed(seed)

     results = []
-    for config in tqdm(generate_experiment_configs()):
-        results.append(Experiment(config, run_single_experiment(config)))
+    for config in tqdm(generate_experiment_configs(calculate_bwd)):
+        results.append(
+            Experiment(config, run_single_experiment(config, dynamic=dynamic))
+        )

     print_results(results)


 if __name__ == "__main__":
-    main()
+    # Set up the argument parser
+    parser = argparse.ArgumentParser(
+        description="Run sweep over sizes and score mods for flex attention"
+    )
+    parser.add_argument(
+        "--dynamic",
+        action="store_true",
+        help="Runs a dynamic shapes version of compiled flex attention.",
+    )
+    parser.add_argument(
+        "--calculate-bwd", action="store_true", help="Calculate backward pass times"
+    )
+
+    # Parse arguments
+    args = parser.parse_args()
+
+    main(args.dynamic, args.calculate_bwd)
diff --git a/build_variables.bzl b/build_variables.bzl
index 6ed6d5c303fb..152324a4d90c 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -106,6 +106,7 @@ libtorch_profiler_sources = [
     "torch/csrc/profiler/standalone/execution_trace_observer.cpp",
     "torch/csrc/profiler/standalone/itt_observer.cpp",
     "torch/csrc/profiler/standalone/nvtx_observer.cpp",
+    "torch/csrc/profiler/standalone/privateuse1_observer.cpp",
     "torch/csrc/profiler/stubs/base.cpp",
     "torch/csrc/profiler/orchestration/vulkan.cpp",
     "torch/csrc/profiler/perf.cpp",
@@ -486,6 +487,7 @@ libtorch_core_sources = sorted(
 # These files are the only ones that are supported on Windows.
libtorch_distributed_base_sources = [ "torch/csrc/distributed/c10d/Backend.cpp", + "torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp", "torch/csrc/distributed/c10d/FileStore.cpp", "torch/csrc/distributed/c10d/Functional.cpp", "torch/csrc/distributed/c10d/GlooDeviceFactory.cpp", @@ -679,6 +681,7 @@ libtorch_cuda_distributed_extra_sources = [ "torch/csrc/distributed/c10d/UCCUtils.cpp", "torch/csrc/distributed/c10d/intra_node_comm.cpp", "torch/csrc/distributed/c10d/intra_node_comm.cu", + "torch/csrc/distributed/c10d/Utils.cu", "torch/csrc/distributed/rpc/tensorpipe_cuda.cpp", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ] @@ -1172,6 +1175,7 @@ aten_native_source_codegen_list = [ "aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp", "aten/src/ATen/native/cpu/FusedAdamKernel.cpp", "aten/src/ATen/native/cpu/FusedSGDKernel.cpp", + "aten/src/ATen/native/cpu/FusedAdagradKernel.cpp", ] # This aten native source file list will not go through aten codegen process @@ -1408,6 +1412,7 @@ aten_native_source_non_codegen_list = [ "aten/src/ATen/native/xnnpack/Shim.cpp", "aten/src/ATen/native/FusedAdam.cpp", "aten/src/ATen/native/FusedSGD.cpp", + "aten/src/ATen/native/FusedAdagrad.cpp", # Files not in native, but depends on native symbols # "aten/src/ATen/TensorIndexing.cpp", "aten/src/ATen/TensorIterator.cpp", diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index f3a0ca3ff73a..2479f96ab30b 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -1107,6 +1107,26 @@ class DeviceCachingAllocator { .current; auto observers_local = oom_observers_; + size_t allocated_in_private_pools = 0; + auto get_size_block = [](const BlockPool& pool) { + size_t res = 0; + for (const auto& block : pool.blocks) { + res += block->size; + } + return res; + }; + for (const auto& p : graph_pools) { + allocated_in_private_pools += get_size_block(p.second->large_blocks); + allocated_in_private_pools += get_size_block(p.second->small_blocks); + } + + std::string private_pool_msg; + + if (allocated_in_private_pools > 0) { + private_pool_msg = "with " + format_size(allocated_in_private_pools) + + " allocated in private pools (e.g., CUDA Graphs), "; + } + // Make sure we do not have the device lock before calling our // observers which might need hold the GIL // It is safe to release at this point because will no longer @@ -1153,9 +1173,12 @@ class DeviceCachingAllocator { " is free. 
", proc_info, "Of the allocated memory ", - format_size(allocated_bytes), - " is allocated by PyTorch, and ", - format_size(reserved_bytes - allocated_bytes), + format_size(allocated_bytes + allocated_in_private_pools), + " is allocated by PyTorch, ", + private_pool_msg, + "and ", + format_size( + reserved_bytes - allocated_bytes - allocated_in_private_pools), " is reserved by PyTorch but unallocated.", " If reserved but unallocated memory is large try setting", " PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid" diff --git a/c10/util/Float8_e4m3fn.h b/c10/util/Float8_e4m3fn.h index 8e05e2e43bb0..e7a59e343c1f 100644 --- a/c10/util/Float8_e4m3fn.h +++ b/c10/util/Float8_e4m3fn.h @@ -19,7 +19,7 @@ #include #include -#if defined(__cplusplus) && (__cplusplus >= 201103L) +#if defined(__cplusplus) #include #include #elif !defined(__OPENCL_VERSION__) diff --git a/c10/util/Float8_e4m3fnuz.h b/c10/util/Float8_e4m3fnuz.h index 86ece9ebdadb..cf73b322e899 100644 --- a/c10/util/Float8_e4m3fnuz.h +++ b/c10/util/Float8_e4m3fnuz.h @@ -22,7 +22,7 @@ #include #include -#if defined(__cplusplus) && (__cplusplus >= 201103L) +#if defined(__cplusplus) #include #elif !defined(__OPENCL_VERSION__) #include diff --git a/c10/util/Float8_e5m2fnuz.h b/c10/util/Float8_e5m2fnuz.h index f63773914c11..145464e2cfff 100644 --- a/c10/util/Float8_e5m2fnuz.h +++ b/c10/util/Float8_e5m2fnuz.h @@ -21,7 +21,7 @@ #include #include -#if defined(__cplusplus) && (__cplusplus >= 201103L) +#if defined(__cplusplus) #include #elif !defined(__OPENCL_VERSION__) #include diff --git a/c10/util/Half.h b/c10/util/Half.h index 3d5a38cb365c..af3435941e48 100644 --- a/c10/util/Half.h +++ b/c10/util/Half.h @@ -16,7 +16,7 @@ #include #include -#if defined(__cplusplus) && (__cplusplus >= 201103L) +#if defined(__cplusplus) #include #elif !defined(__OPENCL_VERSION__) #include diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index bd2588b5aef3..369bb9b106a0 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -457,6 +457,9 @@ if(BUILD_LITE_INTERPRETER) append_filelist("libtorch_lite_cmake_sources" LIBTORCH_CMAKE_SRCS) list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_EAGER_SYMOBLICATION_SRCS}) list(APPEND LIBTORCH_CMAKE_SRCS ${LITE_PROFILER_SRCS}) + if(USE_LITE_AOTI) + append_filelist("inductor_core_resources" LIBTORCH_CMAKE_SRCS) + endif() set(CMAKE_POSITION_INDEPENDENT_CODE TRUE) else() append_filelist("libtorch_cmake_sources" LIBTORCH_CMAKE_SRCS) diff --git a/caffe2/python/CMakeLists.txt b/caffe2/python/CMakeLists.txt deleted file mode 100644 index 464aa24eadd2..000000000000 --- a/caffe2/python/CMakeLists.txt +++ /dev/null @@ -1,36 +0,0 @@ -# ---[ CPU files. 
-set(Caffe2_CPU_PYTHON_SRCS - "/pybind_state.cc" - "/pybind_workspace.cc" - "/pybind_state_dlpack.cc" - "/pybind_state_nomni.cc" - "/pybind_state_registry.cc" - "/pybind_state_int8.cc" -) - -if(USE_MKLDNN) - set(Caffe2_CPU_PYTHON_SRCS - ${Caffe2_CPU_PYTHON_SRCS} - "/pybind_state_ideep.cc" - ) -endif() - -# ---[ GPU files -set(Caffe2_GPU_PYTHON_SRCS - ${Caffe2_CPU_PYTHON_SRCS} - "/pybind_state_gpu.cc" -) - -# ---[ HIP files -set(Caffe2_HIP_PYTHON_SRCS - ${Caffe2_CPU_PYTHON_SRCS} - "/pybind_state_hip.cc" -) - -prepend(Caffe2_CPU_PYTHON_SRCS ${CMAKE_CURRENT_SOURCE_DIR} ${Caffe2_CPU_PYTHON_SRCS}) -prepend(Caffe2_GPU_PYTHON_SRCS ${CMAKE_CURRENT_SOURCE_DIR} ${Caffe2_GPU_PYTHON_SRCS}) -prepend(Caffe2_HIP_PYTHON_SRCS ${CMAKE_CURRENT_SOURCE_DIR} ${Caffe2_HIP_PYTHON_SRCS}) - -set(Caffe2_CPU_PYTHON_SRCS ${Caffe2_CPU_PYTHON_SRCS} PARENT_SCOPE) -set(Caffe2_GPU_PYTHON_SRCS ${Caffe2_GPU_PYTHON_SRCS} PARENT_SCOPE) -set(Caffe2_HIP_PYTHON_SRCS ${Caffe2_HIP_PYTHON_SRCS} PARENT_SCOPE) diff --git a/caffe2/python/__init__.py b/caffe2/python/__init__.py deleted file mode 100644 index 1e44baf28153..000000000000 --- a/caffe2/python/__init__.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import sys -import warnings - - -try: - from caffe2.proto import caffe2_pb2 -except ImportError: - warnings.warn('Caffe2 support is no longer present in PyTorch.') - raise - -# TODO: refactor & remove the following alias -caffe2_pb2.CPU = caffe2_pb2.PROTO_CPU -caffe2_pb2.CUDA = caffe2_pb2.PROTO_CUDA -caffe2_pb2.MKLDNN = caffe2_pb2.PROTO_MKLDNN -caffe2_pb2.OPENGL = caffe2_pb2.PROTO_OPENGL -caffe2_pb2.OPENCL = caffe2_pb2.PROTO_OPENCL -caffe2_pb2.IDEEP = caffe2_pb2.PROTO_IDEEP -caffe2_pb2.HIP = caffe2_pb2.PROTO_HIP -caffe2_pb2.COMPILE_TIME_MAX_DEVICE_TYPES = caffe2_pb2.PROTO_COMPILE_TIME_MAX_DEVICE_TYPES - -if sys.platform == "win32": - is_conda = os.path.exists(os.path.join(sys.prefix, 'conda-meta')) - py_dll_path = os.path.join(os.path.dirname(sys.executable), 'Library', 'bin') - th_root = os.path.join(os.path.dirname(os.path.dirname(os.path.dirname(__file__))), 'torch') - th_dll_path = os.path.join(th_root, 'lib') - - if not os.path.exists(os.path.join(th_dll_path, 'nvToolsExt64_1.dll')) and \ - not os.path.exists(os.path.join(py_dll_path, 'nvToolsExt64_1.dll')): - nvtoolsext_dll_path = os.path.join( - os.getenv('NVTOOLSEXT_PATH', 'C:\\Program Files\\NVIDIA Corporation\\NvToolsExt'), 'bin', 'x64') - else: - nvtoolsext_dll_path = '' - - import importlib.util - import glob - spec = importlib.util.spec_from_file_location('torch_version', os.path.join(th_root, 'version.py')) - torch_version = importlib.util.module_from_spec(spec) - spec.loader.exec_module(torch_version) - if torch_version.cuda and len(glob.glob(os.path.join(th_dll_path, 'cudart64*.dll'))) == 0 and \ - len(glob.glob(os.path.join(py_dll_path, 'cudart64*.dll'))) == 0: - cuda_version = torch_version.cuda - cuda_version_1 = cuda_version.replace('.', '_') - cuda_path_var = 'CUDA_PATH_V' + cuda_version_1 - default_path = 'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v' + cuda_version - cuda_path = os.path.join(os.getenv(cuda_path_var, default_path), 'bin') - else: - cuda_path = '' - - import ctypes - kernel32 = ctypes.WinDLL('kernel32.dll', use_last_error=True) - dll_paths = list(filter(os.path.exists, [th_dll_path, py_dll_path, nvtoolsext_dll_path, cuda_path])) - with_load_library_flags = hasattr(kernel32, 'AddDllDirectory') - prev_error_mode = kernel32.SetErrorMode(0x0001) - - kernel32.LoadLibraryW.restype = ctypes.c_void_p - if with_load_library_flags: - 
kernel32.LoadLibraryExW.restype = ctypes.c_void_p - - for dll_path in dll_paths: - os.add_dll_directory(dll_path) - - dlls = glob.glob(os.path.join(th_dll_path, '*.dll')) - path_patched = False - for dll in dlls: - is_loaded = False - if with_load_library_flags: - res = kernel32.LoadLibraryExW(dll, None, 0x00001100) - last_error = ctypes.get_last_error() - if res is None and last_error != 126: - err = ctypes.WinError(last_error) - err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll) - raise err - elif res is not None: - is_loaded = True - if not is_loaded: - if not path_patched: - os.environ['PATH'] = ';'.join(dll_paths + [os.environ['PATH']]) - path_patched = True - res = kernel32.LoadLibraryW(dll) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += ' Error loading "{}" or one of its dependencies.'.format(dll) - raise err - - kernel32.SetErrorMode(prev_error_mode) diff --git a/caffe2/python/_import_c_extension.py b/caffe2/python/_import_c_extension.py deleted file mode 100644 index 32b9ec34d1f8..000000000000 --- a/caffe2/python/_import_c_extension.py +++ /dev/null @@ -1,57 +0,0 @@ -## @package _import_c_extension -# Module caffe2.python._import_c_extension -import atexit -import logging -import sys -from caffe2.python import extension_loader - -# We will first try to load the gpu-enabled caffe2. If it fails, we will then -# attempt to load the cpu version. The cpu backend is the minimum required, so -# if that still fails, we will exit loud. -with extension_loader.DlopenGuard(): - has_hip_support = False - has_cuda_support = False - has_gpu_support = False - - try: - from caffe2.python.caffe2_pybind11_state_gpu import * # noqa - if num_cuda_devices(): # noqa - has_gpu_support = has_cuda_support = True - except ImportError as gpu_e: - logging.info('Failed to import cuda module: {}'.format(gpu_e)) - try: - from caffe2.python.caffe2_pybind11_state_hip import * # noqa - # we stop checking whether we have AMD GPU devices on the host, - # because we may be constructing a net on a machine without GPU, - # and run the net on another one with GPU - has_gpu_support = has_hip_support = True - logging.info('This caffe2 python run has AMD GPU support!') - except ImportError as hip_e: - logging.info('Failed to import AMD hip module: {}'.format(hip_e)) - - logging.warning( - 'This caffe2 python run failed to load cuda module:{},' - 'and AMD hip module:{}.' - 'Will run in CPU only mode.'.format(gpu_e, hip_e)) - try: - from caffe2.python.caffe2_pybind11_state import * # noqa - except ImportError as cpu_e: - logging.critical( - 'Cannot load caffe2.python. Error: {0}'.format(str(cpu_e))) - sys.exit(1) - -# libcaffe2_python contains a global Workspace that we need to properly delete -# when exiting. Otherwise, cudart will cause segfaults sometimes. -atexit.register(on_module_exit) # noqa - - -# Add functionalities for the TensorCPU interface. 
-def _TensorCPU_shape(self): - return tuple(self._shape) - - -def _TensorCPU_reshape(self, shape): - return self._reshape(list(shape)) - -TensorCPU.shape = property(_TensorCPU_shape) # noqa -TensorCPU.reshape = _TensorCPU_reshape # noqa diff --git a/caffe2/python/_import_c_extension.pyi b/caffe2/python/_import_c_extension.pyi deleted file mode 100644 index 43bbfe159621..000000000000 --- a/caffe2/python/_import_c_extension.pyi +++ /dev/null @@ -1,227 +0,0 @@ -import collections -from typing import Any, Dict, List, Optional, Protocol, Tuple, Union, overload -from typing_extensions import TypeAlias - -import numpy as np -import google.protobuf.message -import torch -from caffe2.proto import caffe2_pb2 - -from . import core - -# pybind11 will automatically accept either Python str or bytes for C++ APIs -# that accept std::string. -_PybindStr: TypeAlias = Union[str, bytes] -_PerOpEnginePrefType: TypeAlias = Dict[int, Dict[str, List[str]]] -_EnginePrefType: TypeAlias = Dict[int, List[str]] - -Int8Tensor = collections.namedtuple( - 'Int8Tensor', ['data', 'scale', 'zero_point'] -) - - -class _HasProto(Protocol): - def Proto(self) -> Any: ... - - -class TensorCPU: - def init(self, dims: List[int], caffe_type: int) -> None: ... - def to_torch(self) -> torch.Tensor: ... - - -class Blob: - def feed( - self, - arg: Any, - device_option: Union[ - None, str, bytes, google.protobuf.message.Message, _HasProto, - ] = None, - ) -> bool: ... - def is_tensor(self) -> bool: ... - def as_tensor(self) -> TensorCPU: ... - def tensor(self) -> TensorCPU: ... - def to_torch(self) -> torch.Tensor: ... - def fetch(self) -> Any: ... - - -class Net: - def run(self) -> None: ... - def cancel(self) -> None: ... - - -class Workspace: - @overload - def __init__(self) -> None: ... - @overload - def __init__(self, workspace: Workspace) -> None: ... - @property - def blobs(self) -> Dict[str, Blob]: ... - def create_blob(self, name: _PybindStr) -> Blob: ... - def fetch_blob(self, name: _PybindStr) -> Any: ... - def fetch_int8_blob( - self, name: Union[str, bytes, core.BlobReference] - ) -> Int8Tensor: ... - def _create_net(self, _def: bytes, overwrite: bool) -> Net: ... - def create_net( - self, - net: Union[str, bytes, core.Net, caffe2_pb2.NetDef], - overwrite: bool = False, - ) -> Net: ... - def _run_net(self, _def: bytes) -> None: ... - def _run_operator(self, _def: bytes) -> None: ... - def _run_plan(self, _def: bytes) -> None: ... - def run( - self, - obj: Union[ - caffe2_pb2.PlanDef, - caffe2_pb2.NetDef, - caffe2_pb2.OperatorDef, - _HasProto, - ], - ) -> None: ... - def feed_blob( - self, - name: Union[str, bytes, core.BlobReference], - arr: Union[caffe2_pb2.TensorProto, np.ndarray], - device_option: Optional[caffe2_pb2.DeviceOption] = None, - ) -> bool: ... - def remove_blob(self, blob: Any) -> None: ... - - current: Workspace - - -class Argument: - @property - def name(self) -> str: ... - @property - def description(self) -> str: ... - @property - def required(self) -> bool: ... - - -class OpSchema: - @staticmethod - def get(key: str) -> OpSchema: ... - @property - def args(self) -> List[Argument]: ... - @property - def input_desc(self) -> List[Tuple[str, str]]: ... - @property - def output_desc(self) -> List[Tuple[str, str]]: ... - @property - def max_input(self) -> int: ... - @property - def max_output(self) -> int: ... - @property - def min_input(self) -> int: ... - @property - def min_output(self) -> int: ... - def inplace_enforced(self, x: int, y: int) -> bool: ... - - -class DummyName: - ... - - -class Graph: - ... 
- - -class Node: - ... - - -class Edge: - ... - - -class NeuralNetOperator: - ... - - -class NeuralNetData: - ... - - -class NNSubgraph: - ... - - -class NNMatchGraph: - ... - - -class Annotation: - ... - - -is_asan: bool -has_mkldnn: bool -use_mkldnn: bool -has_fbgemm: bool -use_rocm: bool -use_trt: bool -define_caffe2_no_operator_schema: bool - -def registered_dbs() -> List[str]: ... -def get_build_options() -> Dict[str, str]: ... -def set_per_op_engine_pref(pref: _PerOpEnginePrefType) -> None: ... -def set_global_engine_pref(pref: _EnginePrefType) -> None: ... -def set_engine_pref( - per_op_pref: _PerOpEnginePrefType, global_pref: _EnginePrefType -) -> None: ... -def set_op_engine_pref( - op_type: _PybindStr, op_pref: _EnginePrefType -) -> None: ... -def op_registry_key(op_type: _PybindStr, engine: _PybindStr) -> str: ... -def global_init(args: List[str]) -> None: ... -def registered_operators() -> List[str]: ... -def on_module_exit() -> None: ... -@overload -def switch_workspace(ws: Workspace): ... -@overload -def switch_workspace(name: _PybindStr, create_if_missing: Optional[bool] = None): ... -def create_child_workspace( - parent_ws_name: _PybindStr, child_ws_name: _PybindStr -) -> None: ... -def root_folder() -> str: ... -def current_workspace() -> str: ... -def workspaces() -> List[str]: ... -def benchmark_net( - name: _PybindStr, warmup_runs: int, main_runs: int, run_individual: bool -) -> List[float]: ... -def benchmark_net_once(name: _PybindStr) -> float: ... - -def blobs() -> Dict[str, Blob]: ... -def has_blob(name: _PybindStr) -> bool: ... -def create_blob(name: _PybindStr) -> bool: ... -def reset_blob(name: _PybindStr) -> None: ... -@overload -def deserialize_blob(content: _PybindStr) -> Blob: ... -@overload -def deserialize_blob(name: _PybindStr, serialized: bytes) -> None: ... -def serialize_blob(name: _PybindStr) -> bytes: ... - -def get_stats() -> Dict[str, int]: ... -def is_numa_enabled() -> bool: ... -def get_num_numa_nodes() -> int: ... -def get_blob_numa_node(blob_name: _PybindStr) -> int: ... -def get_blob_size_bytes(blob_name: _PybindStr) -> int: ... -def create_offline_tensor( - name: _PybindStr, dims: List[int], datatype: int -) -> bool: ... -def fakeFp16FuseOps(net_str: bytes) -> bytes: ... - -def num_cuda_devices() -> int: ... -def get_cuda_version() -> int: ... -def get_cudnn_version() -> int: ... -def get_gpu_memory_info(device_id: int) -> Tuple[int, int]: ... -def get_device_properties(deviceid: int) -> Dict[str, Any]: ... - -def num_hip_devices() -> int: ... -def get_hip_version() -> int: ... -def get_miopen_version() -> int: ... 
- -has_hip_support: bool -has_cuda_support: bool -has_gpu_support: bool diff --git a/caffe2/python/allcompare_test.py b/caffe2/python/allcompare_test.py deleted file mode 100644 index 8733e6d3545c..000000000000 --- a/caffe2/python/allcompare_test.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 - - - - - - -from hypothesis import given, settings -import hypothesis.strategies as st -from multiprocessing import Process - -import numpy as np -import tempfile -import shutil - -import caffe2.python.hypothesis_test_util as hu - -op_engine = 'GLOO' - - -class TemporaryDirectory: - def __enter__(self): - self.tmpdir = tempfile.mkdtemp() - return self.tmpdir - - def __exit__(self, type, value, traceback): - shutil.rmtree(self.tmpdir) - - -def allcompare_process(filestore_dir, process_id, data, num_procs): - from caffe2.python import core, data_parallel_model, workspace, dyndep - from caffe2.python.model_helper import ModelHelper - from caffe2.proto import caffe2_pb2 - dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops") - - workspace.RunOperatorOnce( - core.CreateOperator( - "FileStoreHandlerCreate", [], ["store_handler"], path=filestore_dir - ) - ) - rendezvous = dict( - kv_handler="store_handler", - shard_id=process_id, - num_shards=num_procs, - engine=op_engine, - exit_nets=None - ) - - model = ModelHelper() - model._rendezvous = rendezvous - - workspace.FeedBlob("test_data", data) - - data_parallel_model._RunComparison( - model, "test_data", core.DeviceOption(caffe2_pb2.CPU, 0) - ) - - -class TestAllCompare(hu.HypothesisTestCase): - @given( - d=st.integers(1, 5), n=st.integers(2, 11), num_procs=st.integers(1, 8) - ) - @settings(deadline=10000) - def test_allcompare(self, d, n, num_procs): - dims = [] - for _ in range(d): - dims.append(np.random.randint(1, high=n)) - test_data = np.random.ranf(size=tuple(dims)).astype(np.float32) - - with TemporaryDirectory() as tempdir: - processes = [] - for idx in range(num_procs): - process = Process( - target=allcompare_process, - args=(tempdir, idx, test_data, num_procs) - ) - processes.append(process) - process.start() - - while len(processes) > 0: - process = processes.pop() - process.join() - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/attention.py b/caffe2/python/attention.py deleted file mode 100644 index 59f4a5adb6a5..000000000000 --- a/caffe2/python/attention.py +++ /dev/null @@ -1,424 +0,0 @@ -## @package attention -# Module caffe2.python.attention - - - - - -from caffe2.python import brew - - -class AttentionType: - Regular, Recurrent, Dot, SoftCoverage = tuple(range(4)) - - -def s(scope, name): - # We have to manually scope due to our internal/external blob - # relationships. 
- return "{}/{}".format(str(scope), str(name)) - - -# c_i = \sum_j w_{ij}\textbf{s}_j -def _calc_weighted_context( - model, - encoder_outputs_transposed, - encoder_output_dim, - attention_weights_3d, - scope, -): - # [batch_size, encoder_output_dim, 1] - attention_weighted_encoder_context = brew.batch_mat_mul( - model, - [encoder_outputs_transposed, attention_weights_3d], - s(scope, 'attention_weighted_encoder_context'), - ) - # [batch_size, encoder_output_dim] - attention_weighted_encoder_context, _ = model.net.Reshape( - attention_weighted_encoder_context, - [ - attention_weighted_encoder_context, - s(scope, 'attention_weighted_encoder_context_old_shape'), - ], - shape=[1, -1, encoder_output_dim], - ) - return attention_weighted_encoder_context - - -# Calculate a softmax over the passed in attention energy logits -def _calc_attention_weights( - model, - attention_logits_transposed, - scope, - encoder_lengths=None, -): - if encoder_lengths is not None: - attention_logits_transposed = model.net.SequenceMask( - [attention_logits_transposed, encoder_lengths], - ['masked_attention_logits'], - mode='sequence', - ) - - # [batch_size, encoder_length, 1] - attention_weights_3d = brew.softmax( - model, - attention_logits_transposed, - s(scope, 'attention_weights_3d'), - engine='CUDNN', - axis=1, - ) - return attention_weights_3d - - -# e_{ij} = \textbf{v}^T tanh \alpha(\textbf{h}_{i-1}, \textbf{s}_j) -def _calc_attention_logits_from_sum_match( - model, - decoder_hidden_encoder_outputs_sum, - encoder_output_dim, - scope, -): - # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Tanh( - decoder_hidden_encoder_outputs_sum, - decoder_hidden_encoder_outputs_sum, - ) - - # [encoder_length, batch_size, 1] - attention_logits = brew.fc( - model, - decoder_hidden_encoder_outputs_sum, - s(scope, 'attention_logits'), - dim_in=encoder_output_dim, - dim_out=1, - axis=2, - freeze_bias=True, - ) - - # [batch_size, encoder_length, 1] - attention_logits_transposed = brew.transpose( - model, - attention_logits, - s(scope, 'attention_logits_transposed'), - axes=[1, 0, 2], - ) - return attention_logits_transposed - - -# \textbf{W}^\alpha used in the context of \alpha_{sum}(a,b) -def _apply_fc_weight_for_sum_match( - model, - input, - dim_in, - dim_out, - scope, - name, -): - output = brew.fc( - model, - input, - s(scope, name), - dim_in=dim_in, - dim_out=dim_out, - axis=2, - ) - output = model.net.Squeeze( - output, - output, - dims=[0], - ) - return output - - -# Implement RecAtt due to section 4.1 in http://arxiv.org/abs/1601.03317 -def apply_recurrent_attention( - model, - encoder_output_dim, - encoder_outputs_transposed, - weighted_encoder_outputs, - decoder_hidden_state_t, - decoder_hidden_state_dim, - attention_weighted_encoder_context_t_prev, - scope, - encoder_lengths=None, -): - weighted_prev_attention_context = _apply_fc_weight_for_sum_match( - model=model, - input=attention_weighted_encoder_context_t_prev, - dim_in=encoder_output_dim, - dim_out=encoder_output_dim, - scope=scope, - name='weighted_prev_attention_context', - ) - - weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match( - model=model, - input=decoder_hidden_state_t, - dim_in=decoder_hidden_state_dim, - dim_out=encoder_output_dim, - scope=scope, - name='weighted_decoder_hidden_state', - ) - # [1, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum_tmp = model.net.Add( - [ - weighted_prev_attention_context, - weighted_decoder_hidden_state, - ], - s(scope, 
'decoder_hidden_encoder_outputs_sum_tmp'), - ) - # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Add( - [ - weighted_encoder_outputs, - decoder_hidden_encoder_outputs_sum_tmp, - ], - s(scope, 'decoder_hidden_encoder_outputs_sum'), - broadcast=1, - ) - attention_logits_transposed = _calc_attention_logits_from_sum_match( - model=model, - decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum, - encoder_output_dim=encoder_output_dim, - scope=scope, - ) - - # [batch_size, encoder_length, 1] - attention_weights_3d = _calc_attention_weights( - model=model, - attention_logits_transposed=attention_logits_transposed, - scope=scope, - encoder_lengths=encoder_lengths, - ) - - # [batch_size, encoder_output_dim, 1] - attention_weighted_encoder_context = _calc_weighted_context( - model=model, - encoder_outputs_transposed=encoder_outputs_transposed, - encoder_output_dim=encoder_output_dim, - attention_weights_3d=attention_weights_3d, - scope=scope, - ) - return attention_weighted_encoder_context, attention_weights_3d, [ - decoder_hidden_encoder_outputs_sum, - ] - - -def apply_regular_attention( - model, - encoder_output_dim, - encoder_outputs_transposed, - weighted_encoder_outputs, - decoder_hidden_state_t, - decoder_hidden_state_dim, - scope, - encoder_lengths=None, -): - weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match( - model=model, - input=decoder_hidden_state_t, - dim_in=decoder_hidden_state_dim, - dim_out=encoder_output_dim, - scope=scope, - name='weighted_decoder_hidden_state', - ) - - # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Add( - [weighted_encoder_outputs, weighted_decoder_hidden_state], - s(scope, 'decoder_hidden_encoder_outputs_sum'), - broadcast=1, - use_grad_hack=1, - ) - - attention_logits_transposed = _calc_attention_logits_from_sum_match( - model=model, - decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum, - encoder_output_dim=encoder_output_dim, - scope=scope, - ) - - # [batch_size, encoder_length, 1] - attention_weights_3d = _calc_attention_weights( - model=model, - attention_logits_transposed=attention_logits_transposed, - scope=scope, - encoder_lengths=encoder_lengths, - ) - - # [batch_size, encoder_output_dim, 1] - attention_weighted_encoder_context = _calc_weighted_context( - model=model, - encoder_outputs_transposed=encoder_outputs_transposed, - encoder_output_dim=encoder_output_dim, - attention_weights_3d=attention_weights_3d, - scope=scope, - ) - return attention_weighted_encoder_context, attention_weights_3d, [ - decoder_hidden_encoder_outputs_sum, - ] - - -def apply_dot_attention( - model, - encoder_output_dim, - # [batch_size, encoder_output_dim, encoder_length] - encoder_outputs_transposed, - # [1, batch_size, decoder_state_dim] - decoder_hidden_state_t, - decoder_hidden_state_dim, - scope, - encoder_lengths=None, -): - if decoder_hidden_state_dim != encoder_output_dim: - weighted_decoder_hidden_state = brew.fc( - model, - decoder_hidden_state_t, - s(scope, 'weighted_decoder_hidden_state'), - dim_in=decoder_hidden_state_dim, - dim_out=encoder_output_dim, - axis=2, - ) - else: - weighted_decoder_hidden_state = decoder_hidden_state_t - - # [batch_size, decoder_state_dim] - squeezed_weighted_decoder_hidden_state = model.net.Squeeze( - weighted_decoder_hidden_state, - s(scope, 'squeezed_weighted_decoder_hidden_state'), - dims=[0], - ) - - # [batch_size, decoder_state_dim, 1] - 
expanddims_squeezed_weighted_decoder_hidden_state = model.net.ExpandDims( - squeezed_weighted_decoder_hidden_state, - squeezed_weighted_decoder_hidden_state, - dims=[2], - ) - - # [batch_size, encoder_output_dim, 1] - attention_logits_transposed = model.net.BatchMatMul( - [ - encoder_outputs_transposed, - expanddims_squeezed_weighted_decoder_hidden_state, - ], - s(scope, 'attention_logits'), - trans_a=1, - ) - - # [batch_size, encoder_length, 1] - attention_weights_3d = _calc_attention_weights( - model=model, - attention_logits_transposed=attention_logits_transposed, - scope=scope, - encoder_lengths=encoder_lengths, - ) - - # [batch_size, encoder_output_dim, 1] - attention_weighted_encoder_context = _calc_weighted_context( - model=model, - encoder_outputs_transposed=encoder_outputs_transposed, - encoder_output_dim=encoder_output_dim, - attention_weights_3d=attention_weights_3d, - scope=scope, - ) - return attention_weighted_encoder_context, attention_weights_3d, [] - - -def apply_soft_coverage_attention( - model, - encoder_output_dim, - encoder_outputs_transposed, - weighted_encoder_outputs, - decoder_hidden_state_t, - decoder_hidden_state_dim, - scope, - encoder_lengths, - coverage_t_prev, - coverage_weights, -): - - weighted_decoder_hidden_state = _apply_fc_weight_for_sum_match( - model=model, - input=decoder_hidden_state_t, - dim_in=decoder_hidden_state_dim, - dim_out=encoder_output_dim, - scope=scope, - name='weighted_decoder_hidden_state', - ) - - # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum_tmp = model.net.Add( - [weighted_encoder_outputs, weighted_decoder_hidden_state], - s(scope, 'decoder_hidden_encoder_outputs_sum_tmp'), - broadcast=1, - ) - # [batch_size, encoder_length] - coverage_t_prev_2d = model.net.Squeeze( - coverage_t_prev, - s(scope, 'coverage_t_prev_2d'), - dims=[0], - ) - # [encoder_length, batch_size] - coverage_t_prev_transposed = brew.transpose( - model, - coverage_t_prev_2d, - s(scope, 'coverage_t_prev_transposed'), - ) - - # [encoder_length, batch_size, encoder_output_dim] - scaled_coverage_weights = model.net.Mul( - [coverage_weights, coverage_t_prev_transposed], - s(scope, 'scaled_coverage_weights'), - broadcast=1, - axis=0, - ) - - # [encoder_length, batch_size, encoder_output_dim] - decoder_hidden_encoder_outputs_sum = model.net.Add( - [decoder_hidden_encoder_outputs_sum_tmp, scaled_coverage_weights], - s(scope, 'decoder_hidden_encoder_outputs_sum'), - ) - - # [batch_size, encoder_length, 1] - attention_logits_transposed = _calc_attention_logits_from_sum_match( - model=model, - decoder_hidden_encoder_outputs_sum=decoder_hidden_encoder_outputs_sum, - encoder_output_dim=encoder_output_dim, - scope=scope, - ) - - # [batch_size, encoder_length, 1] - attention_weights_3d = _calc_attention_weights( - model=model, - attention_logits_transposed=attention_logits_transposed, - scope=scope, - encoder_lengths=encoder_lengths, - ) - - # [batch_size, encoder_output_dim, 1] - attention_weighted_encoder_context = _calc_weighted_context( - model=model, - encoder_outputs_transposed=encoder_outputs_transposed, - encoder_output_dim=encoder_output_dim, - attention_weights_3d=attention_weights_3d, - scope=scope, - ) - - # [batch_size, encoder_length] - attention_weights_2d = model.net.Squeeze( - attention_weights_3d, - s(scope, 'attention_weights_2d'), - dims=[2], - ) - - coverage_t = model.net.Add( - [coverage_t_prev, attention_weights_2d], - s(scope, 'coverage_t'), - broadcast=1, - ) - - return ( - attention_weighted_encoder_context, - 
attention_weights_3d, - [decoder_hidden_encoder_outputs_sum], - coverage_t, - ) diff --git a/caffe2/python/benchmark_generator.py b/caffe2/python/benchmark_generator.py deleted file mode 100644 index c557ebfc9536..000000000000 --- a/caffe2/python/benchmark_generator.py +++ /dev/null @@ -1,137 +0,0 @@ -#!/usr/bin/env python3 - - - - - -import string - -import argparse - -import numpy as np - -from caffe2.python.model_helper import ModelHelper -from caffe2.python.predictor import mobile_exporter -from caffe2.python import core, workspace, brew, utils - - -def parse_kwarg(kwarg_str): - key, value = map(string.strip, kwarg_str.split("=", 1)) - try: - value = int(value) - except ValueError: - try: - value = float(value) - except ValueError: - pass - return key, value - - -def main(args): - # User defined keyword arguments - kwargs = {"order": "NCHW"} - kwargs.update(dict(args.kwargs)) - - model = ModelHelper(name=args.benchmark_name) - - op_type = args.operator # assumes a brew type op name - input_name = args.input_name - output_name = args.output_name - - iters = int(args.iters) - for i in range(iters): - input_blob_name = input_name + (str(i) if i > 0 and args.chain else '') - output_blob_name = output_name + str(i + 1) - add_op = getattr(brew, op_type) - add_op(model, input_blob_name, output_blob_name, **kwargs) - if args.chain: - input_name, output_name = output_name, input_name - - workspace.RunNetOnce(model.param_init_net) - extra_init_net_ops = [] - - def make_blob_on_context(blob_name, blob_data, context): - if context.upper() != "CPU": - blob_name_modified = "{}_CPU".format(blob_name) - else: # CPU case is simple - blob_name_modified = blob_name - - fill_op = core.CreateOperator( - "GivenTensorFill", [], [blob_name_modified], - arg=[ - utils.MakeArgument("shape", blob_data.shape), - utils.MakeArgument("values", blob_data) - ] - ) - extra_init_net_ops.append(fill_op) - - # We need to create CPU blobs and add some copy operations in - # the init_net - if context.upper() == "OPENGL": - copy_op = core.CreateOperator("CopyToOpenGL", [blob_name_modified], - [blob_name]) - extra_init_net_ops.append(copy_op) - - for unparsed_blob in args.blob: - name, unparsed_dims = unparsed_blob.split('=') - dims = [int(d) for d in unparsed_dims.split(',')] - np_input = np.random.rand(*dims).astype(np.float32) - make_blob_on_context(name, np_input, args.context) - - init_net, predict_net = mobile_exporter.Export( - workspace, model.net, model.params - ) - init_net.op.extend(extra_init_net_ops) - - # Handle manual rewrite - if args.context.upper() == "OPENGL": - old_ops = [op for op in predict_net.op] - del predict_net.op[:] - for op in old_ops: - op.type = 'OpenGL{}'.format(op.type) - predict_net.op.extend(old_ops) - - if args.debug: - print("init_net:") - for op in init_net.op: - print(" ", op.type, op.input, "-->", op.output) - print("predict_net:") - for op in predict_net.op: - print(" ", op.type, op.input, "-->", op.output) - - with open(args.predict_net, 'wb') as f: - f.write(predict_net.SerializeToString()) - with open(args.init_net, 'wb') as f: - f.write(init_net.SerializeToString()) - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Utility to generate Caffe2 benchmark models.") - parser.add_argument("operator", help="Caffe2 operator to benchmark.") - parser.add_argument("-b", "--blob", - help="Instantiate a blob --blob name=dim1,dim2,dim3", - action='append') - parser.add_argument("--context", help="Context to run on.", default="CPU") - parser.add_argument("--kwargs", 
help="kwargs to pass to operator.", - nargs="*", type=parse_kwarg, default=[]) - parser.add_argument("--init_net", help="Output initialization net.", - default="init_net.pb") - parser.add_argument("--predict_net", help="Output prediction net.", - default="predict_net.pb") - parser.add_argument("--benchmark_name", - help="Name of the benchmark network", - default="benchmark") - parser.add_argument("--input_name", help="Name of the input blob.", - default="data") - parser.add_argument("--output_name", help="Name of the output blob.", - default="output") - parser.add_argument("--iters", - help="Number of iterations to run the operator.", - default="1") - parser.add_argument("-d", "--debug", help="Print debug information.", - action='store_true') - parser.add_argument("-c", "--chain", - help="Chain ops together (create data dependencies)", - action='store_true') - args = parser.parse_args() - main(args) diff --git a/caffe2/python/benchmarks/concat_benchmark.py b/caffe2/python/benchmarks/concat_benchmark.py deleted file mode 100644 index d32def6841c3..000000000000 --- a/caffe2/python/benchmarks/concat_benchmark.py +++ /dev/null @@ -1,31 +0,0 @@ -import argparse - -import numpy as np -from caffe2.python import core, workspace - - -def benchmark_concat(num_inputs, input_dim, axis, add_axis, iterations): - input_names = [f"input{i}" for i in range(num_inputs)] - for n in input_names: - workspace.FeedBlob(n, np.random.randn(*input_dim).astype(np.float32)) - - net = core.Net("benchmark_net") - net.Concat(input_names, ["output", "split_info"], axis=axis, add_axis=add_axis) - workspace.CreateNet(net) - - runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True) - print(f"{num_inputs * np.prod(input_dim) * 4 / runtimes[1] / 1e6} GB/s") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="minimal benchmark for concat.") - parser.add_argument("--num_inputs", type=int, default=2) - parser.add_argument("--input_dim", nargs="+", type=int, required=True) - parser.add_argument("--axis", type=int, default=-1) - parser.add_argument("--add_axis", type=int, default=0) - parser.add_argument("--iterations", type=int, default=64) - args, extra_args = parser.parse_known_args() - core.GlobalInit(["python"] + extra_args) - benchmark_concat( - args.num_inputs, args.input_dim, args.axis, args.add_axis, args.iterations - ) diff --git a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py b/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py deleted file mode 100644 index ce96dbc1dd63..000000000000 --- a/caffe2/python/benchmarks/fused_rowwise_nbit_conversion_bench.py +++ /dev/null @@ -1,50 +0,0 @@ - - -import argparse - -import numpy as np -from caffe2.python import core, workspace - - -def main(bit_rate): - # uncomment for debugging - # np.random.seed(0) - batchsize = 10 * 1000 - blocksize = 64 - print(batchsize, blocksize) - input_data = np.random.rand(batchsize, blocksize).astype(np.float32) - - workspace.FeedBlob("input_data", input_data) - - net = core.Net("bench") - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - engine="GREEDY", - ) - net.Proto().op.extend([op]) - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.CreateNet(net) - iterations = 10 - workspace.BenchmarkNet(net.Proto().name, 1, iterations, True) - - net2 = core.Net("bench2") - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - ) - 
net2.Proto().op.extend([op]) - - workspace.CreateNet(net2) - workspace.BenchmarkNet(net2.Proto().name, 1, iterations, True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="benchmark for row-wise 2/4-bit quantization." - ) - parser.add_argument("--bit-rate", type=int, default=4) - args = parser.parse_args() - main(args.bit_rate) diff --git a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py b/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py deleted file mode 100644 index b4cb8f2da0b4..000000000000 --- a/caffe2/python/benchmarks/sparse_lengths_sum_nbit_benchmark.py +++ /dev/null @@ -1,117 +0,0 @@ - - -import argparse -import datetime - -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace - - -def benchmark_sparse_lengths_sum( - categorical_limit, - embedding_size, - average_len, - batch_size, - iterations, - flush_cache, - bit_rate=st.sampled_from([2, 4]), -): - print("Preparing lookup table. " + str(datetime.datetime.now())) - - # We will use a constant, but non-trivial value so we save initialization - # time. - data = np.ones([categorical_limit, embedding_size], dtype=np.float32) - data *= 17.01 - - init_net = core.Net("init_net") - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", "X", "X_q" - ) - init_net.Proto().op.extend([op]) - workspace.FeedBlob("X", data) - - print("Data has shape {} {}".format(data.shape, datetime.datetime.now())) - - # In order to produce truly random lengths and indices, we will embed a - # Python operator in the net to generate them. - def f(_, outputs): - lengths = np.random.randint( - int(average_len * 0.75), int(average_len * 1.25), batch_size - ).astype(np.int32) - indices = np.random.randint(0, categorical_limit, np.sum(lengths)).astype( - np.int64 - ) - outputs[0].feed(indices) - outputs[1].feed(lengths) - - init_net.Python(f)([], ["indices", "lengths"]) - workspace.RunNetOnce(init_net) - - net = core.Net("mynet") - if flush_cache: - l3_cache_size = 30 * 2 ** 20 // 4 - workspace.FeedBlob( - "huge_blob", np.random.randn(l3_cache_size).astype(np.float32) - ) - net.Scale("huge_blob", "huge_blob_2x", value=2.0) - op = core.CreateOperator( - "SparseLengthsSumFused" + str(bit_rate) + "BitRowwise", - ["X_q", "indices", "lengths"], - "Y", - ) - net.Proto().op.extend([op]) - workspace.CreateNet(net) - - # Set random seed, so that repeated runs will keep the same sequence of - # random indices. - np.random.seed(1701) - - print("Preparation finished. " + str(datetime.datetime.now())) - - runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True) - print( - "{} billion sums per sec".format( - embedding_size - * workspace.FetchBlob("indices").size - / runtimes[2 if flush_cache else 1] - / 1e6 - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="minimal benchmark for sparse lengths sum." - ) - parser.add_argument( - "-e", "--embedding-size", type=int, default=6000000, help="Lookup table size." - ) - parser.add_argument( - "--embedding-dim", type=int, default=128, help="Embedding dimension." - ) - parser.add_argument( - "--average_len", - type=int, - default=27, - help="Sparse feature average lengths, default is 27", - ) - parser.add_argument("--batch_size", type=int, default=100, help="The batch size.") - parser.add_argument( - "-i", "--iteration", type=int, default=100000, help="The number of iterations." 
- ) - parser.add_argument( - "--flush-cache", action="store_true", help="If true, flush cache" - ) - parser.add_argument("--bit-rate", type=int, default=4) - args, extra_args = parser.parse_known_args() - core.GlobalInit(["python"] + extra_args) - benchmark_sparse_lengths_sum( - args.embedding_size, - args.embedding_dim, - args.average_len, - args.batch_size, - args.iteration, - args.flush_cache, - args.bit_rate, - ) diff --git a/caffe2/python/benchmarks/sparse_normalize_benchmark.py b/caffe2/python/benchmarks/sparse_normalize_benchmark.py deleted file mode 100644 index 91bb3a344866..000000000000 --- a/caffe2/python/benchmarks/sparse_normalize_benchmark.py +++ /dev/null @@ -1,121 +0,0 @@ -import argparse -import datetime - -# import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace - - -def benchmark_sparse_normalize( - categorical_limit, - embedding_size, - average_len, - batch_size, - iterations, - flush_cache, - fp16, -): - print("Preparing lookup table. " + str(datetime.datetime.now())) - - # We will use a constant, but non-trivial value so we save initialization - # time. - data = np.ones([categorical_limit, embedding_size], dtype=np.float32) - data *= 17.01 - - init_net = core.Net("init_net") - if fp16: - op = core.CreateOperator("FloatToHalf", "X", "X_fp16") - init_net.Proto().op.extend([op]) - l3_cache_size = 30 * 2 ** 20 // 4 - - # In order to produce truly random lengths and indices, we will embed a - # Python operator in the net to generate them. - def f(_, outputs): - lengths = np.random.randint( - int(average_len * 0.75), int(average_len * 1.25), batch_size - ).astype(np.int32) - indices = np.random.randint(0, categorical_limit, np.sum(lengths)).astype( - np.int64 - ) - outputs[0].feed(indices) - - workspace.FeedBlob("X", data) - workspace.FeedBlob("huge_blob", np.random.randn(l3_cache_size).astype(np.float32)) - - print("Data has shape {} {}".format(data.shape, datetime.datetime.now())) - - init_net.Python(f)([], ["indices"]) - workspace.RunNetOnce(init_net) - - net = core.Net("mynet") - op = core.CreateOperator( - "Float16SparseNormalize" if fp16 else "SparseNormalize", - ["X_fp16", "indices"] if fp16 else ["X", "indices"], - "X_fp16" if fp16 else "X", - ) - net.Proto().external_input.append("X") - net.Proto().external_input.append("X_fp16") - net.Proto().external_input.append("indices") - net.Proto().op.extend([op]) - if flush_cache: - net.Scale("huge_blob", "huge_blob_2x", value=2.0) - - workspace.CreateNet(net) - - # Set random seed, so that repeated runs will keep the same sequence of - # random indices. - np.random.seed(1701) - - print("Preparation finished. " + str(datetime.datetime.now())) - - runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True) - - print("{} ms".format(runtimes[2 if flush_cache else 1])) - print("indice_size: " + str(workspace.FetchBlob("indices").size)) - print( - "{} GB/sec".format( - (2 if fp16 else 4) - * embedding_size - * workspace.FetchBlob("indices").size - / runtimes[2 if flush_cache else 1] - / 1e6 - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="minimal benchmark for sparse lengths sum." - ) - parser.add_argument( - "-e", "--embedding-size", type=int, default=600000, help="Lookup table size." - ) - parser.add_argument( - "--embedding-dim", type=int, default=128, help="Embedding dimension." 
- ) - parser.add_argument( - "--average-len", - type=int, - default=27, - help="Sparse feature average lengths, default is 27", - ) - parser.add_argument("--batch_size", type=int, default=100, help="The batch size.") - parser.add_argument( - "-i", "--iteration", type=int, default=100, help="The number of iterations." - ) - parser.add_argument( - "--flush-cache", action="store_true", help="If true, flush cache" - ) - parser.add_argument("--fp16", action="store_true", help="If true, use fp16") - args, extra_args = parser.parse_known_args() - core.GlobalInit(["python"] + extra_args) - - benchmark_sparse_normalize( - args.embedding_size, - args.embedding_dim, - args.average_len, - args.batch_size, - args.iteration, - args.flush_cache, - args.fp16, - ) diff --git a/caffe2/python/binarysize.py b/caffe2/python/binarysize.py deleted file mode 100644 index 172abfed56c2..000000000000 --- a/caffe2/python/binarysize.py +++ /dev/null @@ -1,164 +0,0 @@ -"""A tool to inspect the binary size of a built binary file. - -This script prints out a tree of symbols and their corresponding sizes, using -Linux's nm functionality. - -Usage: - - python binary_size.py -- \ - --target=/path/to/your/target/binary \ - [--nm_command=/path/to/your/custom/nm] \ - [--max_depth=10] [--min_size=1024] \ - [--color] \ - -To assist visualization, pass in '--color' to make the symbols color coded to -green, assuming that you have a xterm connection that supports color. -""" - - - - - -import argparse -import subprocess -import sys - - -class Trie: - """A simple class that represents a Trie.""" - - def __init__(self, name): - """Initializes a Trie object.""" - self.name = name - self.size = 0 - self.dictionary = {} - - -def GetSymbolTrie(target, nm_command, max_depth): - """Gets a symbol trie with the passed in target. - - Args: - target: the target binary to inspect. - nm_command: the command to run nm. - max_depth: the maximum depth to create the trie. - """ - # Run nm to get a dump on the strings. - proc = subprocess.Popen( - [nm_command, '--radix=d', '--size-sort', '--print-size', target], - stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - nm_out, _ = proc.communicate() - if proc.returncode != 0: - print('NM command failed. Output is as follows:') - print(nm_out) - sys.exit(1) - # Run c++filt to get proper symbols. - proc = subprocess.Popen(['c++filt'], - stdin=subprocess.PIPE, stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - out, _ = proc.communicate(input=nm_out) - if proc.returncode != 0: - print('c++filt failed. Output is as follows:') - print(out) - sys.exit(1) - # Splits the output to size and function name. - data = [] - for line in out.split('\n'): - if line: - content = line.split(' ') - if len(content) < 4: - # This is a line not representing symbol sizes. skip. - continue - data.append([int(content[1]), ' '.join(content[3:])]) - symbol_trie = Trie('') - for size, name in data: - curr = symbol_trie - for c in name: - if c not in curr.dictionary: - curr.dictionary[c] = Trie(curr.name + c) - curr = curr.dictionary[c] - curr.size += size - if len(curr.name) > max_depth: - break - symbol_trie.size = sum(t.size for t in symbol_trie.dictionary.values()) - return symbol_trie - - -def MaybeAddColor(s, color): - """Wrap the input string to the xterm green color, if color is set. 
- """ - if color: - return '\033[92m{0}\033[0m'.format(s) - else: - return s - - -def ReadableSize(num): - """Get a human-readable size.""" - for unit in ['B', 'KB', 'MB', 'GB']: - if abs(num) <= 1024.0: - return '%3.2f%s' % (num, unit) - num /= 1024.0 - return '%.1f TB' % (num,) - - -# Note(jiayq): I know, I know, this is a recursive function, but it is -# convenient to write. -def PrintTrie(trie, prefix, max_depth, min_size, color): - """Prints the symbol trie in a readable manner. - """ - if len(trie.name) == max_depth or not trie.dictionary.keys(): - # If we are reaching a leaf node or the maximum depth, we will print the - # result. - if trie.size > min_size: - print('{0}{1} {2}'.format( - prefix, - MaybeAddColor(trie.name, color), - ReadableSize(trie.size))) - elif len(trie.dictionary.keys()) == 1: - # There is only one child in this dictionary, so we will just delegate - # to the downstream trie to print stuff. - PrintTrie( - trie.dictionary.values()[0], prefix, max_depth, min_size, color) - elif trie.size > min_size: - print('{0}{1} {2}'.format( - prefix, - MaybeAddColor(trie.name, color), - ReadableSize(trie.size))) - keys_with_sizes = [ - (k, trie.dictionary[k].size) for k in trie.dictionary.keys()] - keys_with_sizes.sort(key=lambda x: x[1]) - for k, _ in keys_with_sizes[::-1]: - PrintTrie( - trie.dictionary[k], prefix + ' |', max_depth, min_size, color) - - -def main(argv): - if not sys.platform.startswith('linux'): - raise RuntimeError('Currently this tool only supports Linux.') - parser = argparse.ArgumentParser( - description="Tool to inspect binary size.") - parser.add_argument( - '--max_depth', type=int, default=10, - help='The maximum depth to print the symbol tree.') - parser.add_argument( - '--min_size', type=int, default=1024, - help='The mininum symbol size to print.') - parser.add_argument( - '--nm_command', type=str, default='nm', - help='The path to the nm command that the tool needs.') - parser.add_argument( - '--color', action='store_true', - help='If set, use ascii color for output.') - parser.add_argument( - '--target', type=str, - help='The binary target to inspect.') - args = parser.parse_args(argv) - if not args.target: - raise RuntimeError('You must specify a target to inspect.') - symbol_trie = GetSymbolTrie( - args.target, args.nm_command, args.max_depth) - PrintTrie(symbol_trie, '', args.max_depth, args.min_size, args.color) - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/caffe2/python/brew.py b/caffe2/python/brew.py deleted file mode 100644 index f78b885150dd..000000000000 --- a/caffe2/python/brew.py +++ /dev/null @@ -1,139 +0,0 @@ -## @package model_helper_api -# Module caffe2.python.model_helper_api - - - - - -import sys -import copy -import inspect -from past.builtins import basestring -from caffe2.python.model_helper import ModelHelper - -# flake8: noqa -from caffe2.python.helpers.algebra import * -from caffe2.python.helpers.arg_scope import * -from caffe2.python.helpers.array_helpers import * -from caffe2.python.helpers.control_ops import * -from caffe2.python.helpers.conv import * -from caffe2.python.helpers.db_input import * -from caffe2.python.helpers.dropout import * -from caffe2.python.helpers.elementwise_linear import * -from caffe2.python.helpers.fc import * -from caffe2.python.helpers.nonlinearity import * -from caffe2.python.helpers.normalization import * -from caffe2.python.helpers.pooling import * -from caffe2.python.helpers.quantization import * -from caffe2.python.helpers.tools import * -from 
caffe2.python.helpers.train import * - - -class HelperWrapper(object): - _registry = { - 'arg_scope': arg_scope, - 'fc': fc, - 'packed_fc': packed_fc, - 'fc_decomp': fc_decomp, - 'fc_sparse': fc_sparse, - 'fc_prune': fc_prune, - 'dropout': dropout, - 'max_pool': max_pool, - 'average_pool': average_pool, - 'max_pool_with_index' : max_pool_with_index, - 'lrn': lrn, - 'softmax': softmax, - 'instance_norm': instance_norm, - 'spatial_bn': spatial_bn, - 'spatial_gn': spatial_gn, - 'moments_with_running_stats': moments_with_running_stats, - 'relu': relu, - 'prelu': prelu, - 'tanh': tanh, - 'concat': concat, - 'depth_concat': depth_concat, - 'sum': sum, - 'reduce_sum': reduce_sum, - 'sub': sub, - 'arg_min': arg_min, - 'transpose': transpose, - 'iter': iter, - 'accuracy': accuracy, - 'conv': conv, - 'conv_nd': conv_nd, - 'conv_transpose': conv_transpose, - 'group_conv': group_conv, - 'group_conv_deprecated': group_conv_deprecated, - 'image_input': image_input, - 'video_input': video_input, - 'add_weight_decay': add_weight_decay, - 'elementwise_linear': elementwise_linear, - 'layer_norm': layer_norm, - 'mat_mul' : mat_mul, - 'batch_mat_mul' : batch_mat_mul, - 'cond' : cond, - 'loop' : loop, - 'db_input' : db_input, - 'fused_8bit_rowwise_quantized_to_float' : fused_8bit_rowwise_quantized_to_float, - 'sparse_lengths_sum_4bit_rowwise_sparse': sparse_lengths_sum_4bit_rowwise_sparse, - } - - def __init__(self, wrapped): - self.wrapped = wrapped - - def __getattr__(self, helper_name): - if helper_name not in self._registry: - raise AttributeError( - "Helper function {} not " - "registered.".format(helper_name) - ) - - def scope_wrapper(*args, **kwargs): - new_kwargs = {} - if helper_name != 'arg_scope': - if len(args) > 0 and isinstance(args[0], ModelHelper): - model = args[0] - elif 'model' in kwargs: - model = kwargs['model'] - else: - raise RuntimeError( - "The first input of helper function should be model. " \ - "Or you can provide it in kwargs as model=.") - new_kwargs = copy.deepcopy(model.arg_scope) - func = self._registry[helper_name] - var_names, _, varkw, _= inspect.getargspec(func) - if varkw is None: - # this helper function does not take in random **kwargs - new_kwargs = { - var_name: new_kwargs[var_name] - for var_name in var_names if var_name in new_kwargs - } - - cur_scope = get_current_scope() - new_kwargs.update(cur_scope.get(helper_name, {})) - new_kwargs.update(kwargs) - return func(*args, **new_kwargs) - - scope_wrapper.__name__ = helper_name - return scope_wrapper - - def Register(self, helper): - name = helper.__name__ - if name in self._registry: - raise AttributeError( - "Helper {} already exists. 
Please change your " - "helper name.".format(name) - ) - self._registry[name] = helper - - def has_helper(self, helper_or_helper_name): - helper_name = ( - helper_or_helper_name - if isinstance(helper_or_helper_name, basestring) else - helper_or_helper_name.__name__ - ) - return helper_name in self._registry - - -# pyre-fixme[6]: incompatible parameter type: expected ModuleType, got HelperWrapper -sys.modules[__name__] = HelperWrapper(sys.modules[__name__]) diff --git a/caffe2/python/brew_test.py b/caffe2/python/brew_test.py deleted file mode 100644 index 4973876a8008..000000000000 --- a/caffe2/python/brew_test.py +++ /dev/null @@ -1,328 +0,0 @@ - - - - - -from caffe2.python import brew, core, scope, workspace -from caffe2.python.modeling.parameter_info import ParameterTags -from caffe2.python.model_helper import ModelHelper -from caffe2.python.cnn import CNNModelHelper - -import unittest -import numpy as np - - -class BrewTest(unittest.TestCase): - def setUp(self): - - def myhelper(model, val=-1): - return val - - if not brew.has_helper(myhelper): - brew.Register(myhelper) - self.myhelper = myhelper - - def myhelper2(model, val=-1): - return val - - if not brew.has_helper(myhelper2): - brew.Register(myhelper2) - self.myhelper2 = myhelper2 - self.model = ModelHelper(name="test_model") - - def test_dropout(self): - p = 0.2 - X = np.ones((100, 100)).astype(np.float32) - p - workspace.FeedBlob("x", X) - model = ModelHelper(name="test_model") - brew.dropout(model, "x", "out", is_test=False) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - out = workspace.FetchBlob("out") - self.assertLess(abs(out.mean() - (1 - p)), 0.05) - - def test_fc(self): - m, n, k = (15, 15, 15) - X = np.random.rand(m, k).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - model = ModelHelper(name="test_model") - brew.fc(model, "x", "out_1", k, n) - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - def test_relu(self): - Xpos = np.ones((5, 5)).astype(np.float32) - 0.5 - Xneg = np.ones((5, 5)).astype(np.float32) - 1.5 - - workspace.FeedBlob("xpos", Xpos) - workspace.FeedBlob("xneg", Xneg) - model = ModelHelper(name="test_model") - brew.relu(model, "xpos", "out_xpos") - brew.relu(model, "xneg", "out_xneg") - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - pos = workspace.FetchBlob("out_xpos") - self.assertAlmostEqual(pos.mean(), 0.5) - neg = workspace.FetchBlob("out_xneg") - self.assertAlmostEqual(neg.mean(), 0) - - def test_tanh(self): - X = np.ones((5, 5)).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - model = ModelHelper(name="test_model") - brew.tanh(model, "x", "out_tanh") - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - out = workspace.FetchBlob("out_tanh") - self.assertAlmostEqual(out.mean(), np.tanh(0.5), places=5) - - def test_validate(self): - model = ModelHelper(name="test_model") - model.params.append("aaa") - model.params.append("bbb") - self.assertEqual(model._Validate(), []) - - model.params.append("xxx") - model.params.append("bbb") - self.assertEqual(model._Validate(), ["bbb"]) - - def test_arg_scope(self): - myhelper = self.myhelper - myhelper2 = self.myhelper2 - n = 15 - with brew.arg_scope([myhelper], val=n): - res = brew.myhelper(self.model) - self.assertEqual(n, res) - - with brew.arg_scope([myhelper, myhelper2], val=n): - res1 = brew.myhelper(self.model) - res2 = brew.myhelper2(self.model) - 
self.assertEqual([n, n], [res1, res2]) - - def test_arg_scope_single(self): - X = np.random.rand(64, 3, 32, 32).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - model = ModelHelper(name="test_model") - with brew.arg_scope( - brew.conv, - stride=2, - pad=2, - weight_init=('XavierFill', {}), - bias_init=('ConstantFill', {}) - ): - brew.conv( - model=model, - blob_in="x", - blob_out="out", - dim_in=3, - dim_out=64, - kernel=3, - ) - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - out = workspace.FetchBlob("out") - self.assertEqual(out.shape, (64, 64, 17, 17)) - - def test_arg_scope_nested(self): - myhelper = self.myhelper - n = 16 - with brew.arg_scope([myhelper], val=-3), \ - brew.arg_scope([myhelper], val=-2): - with brew.arg_scope([myhelper], val=n): - res = brew.myhelper(self.model) - self.assertEqual(n, res) - res = brew.myhelper(self.model) - self.assertEqual(res, -2) - - res = brew.myhelper(self.model, val=15) - self.model.Validate() - self.assertEqual(res, 15) - - def test_double_register(self): - myhelper = self.myhelper - with self.assertRaises(AttributeError): - brew.Register(myhelper) - - def test_has_helper(self): - self.assertTrue(brew.has_helper(brew.conv)) - self.assertTrue(brew.has_helper("conv")) - - def myhelper3(): - pass - - self.assertFalse(brew.has_helper(myhelper3)) - - def test_model_helper(self): - X = np.random.rand(64, 32, 32, 3).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - my_arg_scope = {'order': 'NHWC'} - model = ModelHelper(name="test_model", arg_scope=my_arg_scope) - with brew.arg_scope( - brew.conv, - stride=2, - pad=2, - weight_init=('XavierFill', {}), - bias_init=('ConstantFill', {}) - ): - brew.conv( - model=model, - blob_in="x", - blob_out="out", - dim_in=3, - dim_out=64, - kernel=[8, 3] - ) - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - out = workspace.FetchBlob("out") - self.assertEqual(out.shape, (64, 15, 17, 64)) - - def test_cnn_model_helper_deprecated(self): - X = np.random.rand(64, 32, 32, 3).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - # CNNModelHelper is going to be deprecated soon. 
This test is only - # covering some CNNModelHelper logic - model = CNNModelHelper(name="test_model", order='NHWC') - self.assertEqual(model.arg_scope['order'], 'NHWC') - - def test_get_params(self): - def param(x): - return core.ScopedBlobReference(x) - - def to_str_list(x): - return sorted([str(p) for p in x]) - - model = ModelHelper(name="test_model") - model.AddParameter(param("a")) - model.AddParameter(param("b"), tags=ParameterTags.COMPUTED_PARAM) - with scope.NameScope("c"): - model.AddParameter(param("a")) - model.AddParameter(param("d"), tags=ParameterTags.COMPUTED_PARAM) - self.assertEqual(to_str_list(model.GetParams()), ['c/a']) - self.assertEqual(to_str_list(model.GetComputedParams()), ['c/d']) - self.assertEqual(to_str_list(model.GetAllParams()), ['c/a', 'c/d']) - # Get AllParams from the global Scope - self.assertEqual(to_str_list(model.GetAllParams('')), [ - 'a', 'b', 'c/a', 'c/d']) - self.assertEqual(to_str_list(model.GetParams()), ['a', 'c/a']) - self.assertEqual(to_str_list(model.GetComputedParams()), ['b', 'c/d']) - self.assertEqual(to_str_list(model.GetAllParams()), - ['a', 'b', 'c/a', 'c/d']) - self.assertEqual(to_str_list(model.GetAllParams('')), - ['a', 'b', 'c/a', 'c/d']) - # Get AllParams from the scope 'c' - self.assertEqual(to_str_list(model.GetAllParams('c')), ['c/a', 'c/d']) - self.assertEqual(to_str_list(model.GetAllParams('c/')), ['c/a', 'c/d']) - - def test_param_consistence(self): - model = ModelHelper(name='test_mode') - cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4) - step_model = ModelHelper(name='step_model', param_model=model) - a = brew.fc(step_model, cnv, 'a', 100, 200) - brew.fc(model, a, 'b', 200, 5) - # test the _parameters_info is shared between model and step_model - self.assertEqual(model._parameters_info, step_model._parameters_info) - - def test_cond(self): - workspace.FeedBlob("cond", np.array(True)) - workspace.FeedBlob("then_value", np.array(1)) - workspace.FeedBlob("else_value", np.array(2)) - - then_model = ModelHelper(name="then_test_model") - then_model.net.Copy("then_value", "output_blob") - - else_model = ModelHelper(name="else_test_model") - else_model.net.Copy("else_value", "output_blob") - - model = ModelHelper(name="test_model") - brew.cond( - model=model, - cond_blob="cond", - external_blobs=["then_value", "else_value", "output_blob"], - then_model=then_model, - else_model=else_model) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - output_value = workspace.FetchBlob("output_blob") - self.assertEqual(output_value, 1) - workspace.FeedBlob("cond", np.array(False)) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - output_value = workspace.FetchBlob("output_blob") - self.assertEqual(output_value, 2) - - def test_loop(self): - workspace.FeedBlob("cond", np.array(True)) - workspace.FeedBlob("ONE", np.array(1)) - workspace.FeedBlob("TWO", np.array(2)) - workspace.FeedBlob("TEN", np.array(10)) - workspace.FeedBlob("counter", np.array(0)) - workspace.FeedBlob("output_blob", np.array(0)) - - loop_model = ModelHelper(name="loop_test_model") - loop_model.net.Add(["output_blob", "TWO"], "output_blob") - - cond_model = ModelHelper(name="cond_test_model") - cond_model.net.Add(["counter", "ONE"], "counter") - comp_res = cond_model.net.LT(["counter", "TEN"]) - cond_model.net.Copy(comp_res, "cond") - - model = ModelHelper(name="test_model") - brew.loop( - model=model, - cond_blob="cond", - external_blobs=["cond", "ONE", "TWO", "TEN", "counter", "output_blob"], - 
loop_model=loop_model, - cond_model=cond_model) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - output_value = workspace.FetchBlob("output_blob") - self.assertEqual(output_value, 18) - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -class BrewGPUTest(unittest.TestCase): - def test_relu(self): - Xpos = np.ones((5, 5)).astype(np.float32) - 0.5 - Xneg = np.ones((5, 5)).astype(np.float32) - 1.5 - - workspace.FeedBlob("xpos", Xpos) - workspace.FeedBlob("xneg", Xneg) - model = ModelHelper(name="test_model") - brew.relu(model, "xpos", "out_xpos", use_cudnn=True) - brew.relu(model, "xneg", "out_xneg", use_cudnn=True) - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - pos = workspace.FetchBlob("out_xpos") - self.assertAlmostEqual(pos.mean(), 0.5) - neg = workspace.FetchBlob("out_xneg") - self.assertAlmostEqual(neg.mean(), 0) - - def test_tanh(self): - X = np.ones((5, 5)).astype(np.float32) - 0.5 - - workspace.FeedBlob("x", X) - model = ModelHelper(name="test_model") - brew.tanh(model, "x", "out_tanh", use_cudnn=True) - model.Validate() - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - out = workspace.FetchBlob("out_tanh") - self.assertAlmostEqual(out.mean(), np.tanh(0.5), places=5) diff --git a/caffe2/python/build.py b/caffe2/python/build.py deleted file mode 100644 index 862c031004c5..000000000000 --- a/caffe2/python/build.py +++ /dev/null @@ -1,9 +0,0 @@ - - - - - -import caffe2.python._import_c_extension as C - -CAFFE2_NO_OPERATOR_SCHEMA = C.define_caffe2_no_operator_schema -build_options = C.get_build_options() diff --git a/caffe2/python/cached_reader.py b/caffe2/python/cached_reader.py deleted file mode 100644 index 22bf49ed4154..000000000000 --- a/caffe2/python/cached_reader.py +++ /dev/null @@ -1,133 +0,0 @@ -## @package cached_reader -# Module caffe2.python.cached_reader - - - - - -import os - -from caffe2.python import core -from caffe2.python.db_file_reader import DBFileReader -from caffe2.python.pipeline import pipe -from caffe2.python.task import Cluster, TaskGroup - - -class CachedReader(DBFileReader): - - default_name_suffix = 'cached_reader' - - """Reader with persistent in-file cache. - - Example usage: - cached_reader = CachedReader( - reader, - db_path='/tmp/cache.db', - db_type='LevelDB', - ) - build_cache_step = cached_reader.build_cache_step() - with LocalSession() as session: - session.run(build_cache_step) - - Every time a new CachedReader is created, it's expected that - db_path exists before calling .setup_ex(...) and .read(...). - - If db_path doesn't exist, build_cache_step is expected to be called - first to build a cache at db_path. - - build_cache_step will check the existence of the provided db_path and, - in case it's missing, will initialize it by reading data from the original reader. - All subsequent attempts to read will ignore the original reader - (i.e. no additional data will be read from it). - - Args: - original_reader: Reader. - If provided, it's the original reader used to build the cache file. - db_path: str. - - Optional Args: - db_type: str. DB type of file. A db_type is registered by - `REGISTER_CAFFE2_DB(, )`. - Defaults to 'LevelDB'. - name: str or None. Name of CachedReader. - Optional name to prepend to blobs that will store the data. - Defaults to '_'. - batch_size: int. - How many examples are read each time the read_net is run. - Defaults to 100. - loop_over: bool. 
- If True, will go through examples in random order endlessly. - Defaults to False. - """ - def __init__( - self, - original_reader, - db_path, - db_type='LevelDB', - name=None, - batch_size=100, - loop_over=False, - ): - assert original_reader is not None, "original_reader can't be None" - self.original_reader = original_reader - - super().__init__( - db_path, - db_type, - name, - batch_size, - loop_over, - ) - - def _init_reader_schema(self, *args, **kwargs): - """Prepare the reader schema. - - Since an original reader is given, - use its schema as ground truth. - - Returns: - schema: schema.Struct. Used in Reader.__init__(...). - """ - return self.original_reader._schema - - def build_cache_step(self, overwrite=False): - """Build a step for generating the cache DB file. - - If self.db_path exists and overwrite is False, build an empty step. - Otherwise, build a step as follows. - Pipe original reader to the _DatasetWriter, - so that dataset field blobs are populated. - Then save these blobs into a file. - - Args: - overwrite: bool. If true, ignore the existing file - and build a new one overwriting the existing one anyway. - - Returns: - build_cache_step: ExecutionStep. - The step to be run for building a cache DB file. - """ - if os.path.exists(self.db_path) and not overwrite: - # cache already exists, no need to rebuild it - return core.execution_step('build_step', []) - - init_net = core.Net('init') - self._init_field_blobs_as_empty(init_net) - with Cluster(), core.NameScope(self.name), TaskGroup() as copy_tg: - pipe(self.original_reader, self.ds.writer(), num_threads=16) - copy_step = copy_tg.to_task().get_step() - save_net = core.Net('save') - self._save_field_blobs_to_db_file(save_net) - - return core.execution_step('build_cache', [init_net, copy_step, save_net]) - - def _save_field_blobs_to_db_file(self, net): - """Save dataset field blobs to a DB file at db_path""" - net.Save( - self.ds.get_blobs(), - [], - db=self.db_path, - db_type=self.db_type, - blob_name_overrides=self.ds.field_names(), - absolute_path=True, - ) diff --git a/caffe2/python/caffe_translator.py b/caffe2/python/caffe_translator.py deleted file mode 100644 index 23987adf3532..000000000000 --- a/caffe2/python/caffe_translator.py +++ /dev/null @@ -1,937 +0,0 @@ -## @package caffe_translator -# Module caffe2.python.caffe_translator - -import argparse -import copy -import logging -import re -import numpy as np # noqa - -from caffe2.proto import caffe2_pb2, caffe2_legacy_pb2 -from caffe.proto import caffe_pb2 -from caffe2.python import core, utils, workspace -from google.protobuf import text_format - -logging.basicConfig() -log = logging.getLogger("caffe_translator") -log.setLevel(logging.INFO) - - -def _StateMeetsRule(state, rule): - """A function that reproduces Caffe's StateMeetsRule functionality.""" - if rule.HasField('phase') and rule.phase != state.phase: - return False - if rule.HasField('min_level') and state.level < rule.min_level: - return False - if rule.HasField('max_level') and state.level > rule.max_level: - return False - curr_stages = set(list(state.stage)) - # all stages in rule.stage should be present, otherwise it's not a match. - if len(rule.stage) and any([s not in curr_stages for s in rule.stage]): - return False - # none of the stages in rule.not_stage should be present, otherwise it's not a match. - if len(rule.not_stage) and any([s in curr_stages for s in rule.not_stage]): - return False - # If no mismatch happens, return True. 
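# The NetStateRule matching implemented by _StateMeetsRule, restated over
# plain dicts so the include/exclude semantics can be run standalone
# (illustrative only; the real code operates on caffe_pb2 messages):
def state_meets_rule(state, rule):
    if "phase" in rule and rule["phase"] != state["phase"]:
        return False
    if "min_level" in rule and state["level"] < rule["min_level"]:
        return False
    if "max_level" in rule and state["level"] > rule["max_level"]:
        return False
    stages = set(state.get("stage", []))
    if any(s not in stages for s in rule.get("stage", [])):
        return False  # every required stage must be present
    if any(s in stages for s in rule.get("not_stage", [])):
        return False  # no excluded stage may be present
    return True

state = {"phase": "TRAIN", "level": 0, "stage": ["multi_gpu"]}
assert state_meets_rule(state, {"phase": "TRAIN", "stage": ["multi_gpu"]})
assert not state_meets_rule(state, {"not_stage": ["multi_gpu"]})
assert not state_meets_rule(state, {"min_level": 1})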
- return True - - -def _ShouldInclude(net_state, layer): - """A function that reproduces Caffe's inclusion and exclusion rule.""" - ret = (len(layer.include) == 0) - # check exclude rules: if any exclusion is met, we shouldn't include. - ret &= not any([_StateMeetsRule(net_state, rule) for rule in layer.exclude]) - if len(layer.include): - # check include rules: if any inclusion is met, we should include. - ret |= any([_StateMeetsRule(net_state, rule) for rule in layer.include]) - return ret - - -def _GetLegacyDims(net, net_params, dummy_input, legacy_pad_ops): - dim_map = {} - ws = workspace.C.Workspace() - for param in net_params.protos: - ws.create_blob(param.name) \ - .feed(utils.Caffe2TensorToNumpyArray(param)) - external_input = net.op[0].input[0] - ws.create_blob(external_input).feed(dummy_input) - # Get dimensions with legacy pad - for i in range(len(net.op)): - op_def = net.op[i] - ws._run_operator(op_def.SerializeToString()) - if i in legacy_pad_ops: - output = op_def.output[0] - blob_legacy = ws.fetch_blob(output) - dim_map[i] = blob_legacy.shape - return dim_map - - -def _GetLegacyPadArgs(op_def, arg_map): - pads = {} - keys = ['pad_l', 'pad_t', 'pad_r', 'pad_b'] - is_pad = 'pad' in arg_map - if is_pad: - for k in keys: - pads[k] = arg_map['pad'].i - else: - pads = {x: arg_map[x].i for x in keys} - return pads - - -def _AdjustDims(op_def, arg_map, pads, dim1, dim2): - n1, c1, h1, w1 = dim1 - n2, c2, h2, w2 = dim2 - assert(n1 == n2) - assert(c1 == c2) - is_pad = 'pad' in arg_map - if h1 != h2 or w1 != w2: - if h1 == h2 + 1: - pads['pad_b'] += 1 - elif h1 != h2: - raise Exception("Unexpected dimensions for height:", h1, h2) - if w1 == w2 + 1: - pads['pad_r'] += 1 - elif w1 != w2: - raise Exception("Unexpected dimensions for width:", w1, w2) - if is_pad: - op_def.arg.remove(arg_map['pad']) - args = [] - for name in pads.keys(): - arg = caffe2_pb2.Argument() - arg.name = name - arg.i = pads[name] - args.append(arg) - op_def.arg.extend(args) - else: - for name in pads.keys(): - arg_map[name].i = pads[name] - - -def _RemoveLegacyPad(net, net_params, input_dims): - legacy_pad_ops = [] - for i in range(len(net.op)): - op_def = net.op[i] - if re.match(r'^(Conv|ConvTranspose|MaxPool|AveragePool)(\dD)?$', - op_def.type): - for arg in op_def.arg: - if arg.name == 'legacy_pad': - legacy_pad_ops.append(i) - break - if legacy_pad_ops: - n, c, h, w = input_dims - dummy_input = np.random.randn(n, c, h, w).astype(np.float32) - dim_map = _GetLegacyDims(net, net_params, dummy_input, legacy_pad_ops) - - # Running with the legacy pad argument removed - # compare the dimensions and adjust pad argument when necessary - ws = workspace.C.Workspace() - - external_input = net.op[0].input[0] - ws.create_blob(external_input).feed_blob(dummy_input) - for param in net_params.protos: - ws.create_blob(param.name) \ - .feed_blob(utils.Caffe2TensorToNumpyArray(param)) - - for i in range(len(net.op)): - op_def = net.op[i] - if i in legacy_pad_ops: - arg_map = {} - for arg in op_def.arg: - arg_map[arg.name] = arg - pads = _GetLegacyPadArgs(op_def, arg_map) - # remove legacy pad arg - for j in range(len(op_def.arg)): - arg = op_def.arg[j] - if arg.name == 'legacy_pad': - del op_def.arg[j] - break - output = op_def.output[0] - # use a new name to avoid the interference with inplace - nonlegacy_output = output + '_nonlegacy' - op_def.output[0] = nonlegacy_output - ws._run_operator(op_def.SerializeToString()) - blob_nonlegacy = ws.fetch_blob(nonlegacy_output) - # reset output name - op_def.output[0] = output - - dim1 = 
dim_map[i] - dim2 = blob_nonlegacy.shape - _AdjustDims(op_def, arg_map, pads, dim1, dim2) - - ws._run_operator(op_def.SerializeToString()) - return net - - -def _GetBlobDimMap(net, net_params, dummy_input): - dim_map = {} - ws = workspace.C.Workspace() - for param in net_params.protos: - ws.create_blob(param.name) \ - .feed(utils.Caffe2TensorToNumpyArray(param)) - external_input = net.op[0].input[0] - ws.create_blob(external_input).feed(dummy_input) - # Get dimensions with legacy pad - for i in range(len(net.op)): - op_def = net.op[i] - ws._run_operator(op_def.SerializeToString()) - for output in op_def.output: - blob = ws.fetch_blob(output) - dim_map[output] = blob.shape - return dim_map - - -def _GetInputDims(caffe_net): - input_dims = [] - if caffe_net.input_dim: - input_dims = caffe_net.input_dim - elif caffe_net.input_shape: - input_dims = caffe_net.input_shape[0].dim - elif caffe_net.layer[0].input_param.shape: - # getting input dimension from first layer - input_dims = caffe_net.layer[0].input_param.shape[0].dim - return input_dims - - -class TranslatorRegistry: - registry_ = {} - - @classmethod - def Register(cls, op_name): - """A decorator for registering gradient mappings.""" - - def Wrapper(func): - cls.registry_[op_name] = func - return func - - return Wrapper - - @classmethod - def TranslateLayer(cls, layer, pretrained_blobs, is_test, **kwargs): - try: - caffe_ops, params = cls.registry_[layer.type]( - layer, pretrained_blobs, is_test, **kwargs) - except KeyError as e: - raise KeyError('No translator registered for layer: %s yet.' % - str(layer)) from e - if caffe_ops is None: - caffe_ops = [] - if type(caffe_ops) is not list: - caffe_ops = [caffe_ops] - return caffe_ops, params - - @classmethod - def TranslateModel( - cls, - caffe_net, - pretrained_net, - is_test=False, - net_state=None, - remove_legacy_pad=False, - input_dims=None - ): - net_state = caffe_pb2.NetState() if net_state is None else net_state - net = caffe2_pb2.NetDef() - net.name = caffe_net.name - net_params = caffe2_pb2.TensorProtos() - if len(caffe_net.layers) > 0: - raise ValueError( - 'I think something is wrong. This translation script ' - 'only accepts new style layers that are stored in the ' - 'layer field.' - ) - if not input_dims: - input_dims = _GetInputDims(caffe_net) - for layer in caffe_net.layer: - if not _ShouldInclude(net_state, layer): - log.info('Current net state does not need layer {}' - .format(layer.name)) - continue - log.info('Translate layer {}'.format(layer.name)) - # Get pretrained one - pretrained_layers = ( - [l for l in pretrained_net.layer - if l.name == layer.name] + [l - for l in pretrained_net.layers - if l.name == layer.name] - ) - if len(pretrained_layers) > 1: - raise ValueError( - 'huh? more than one pretrained layer of one name?') - elif len(pretrained_layers) == 1: - pretrained_blobs = [ - utils.CaffeBlobToNumpyArray(blob) - for blob in pretrained_layers[0].blobs - ] - else: - # No pretrained layer for the given layer name. We'll just pass - # no parameter blobs. 
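# The TranslatorRegistry driving TranslateLayer above is a decorator-populated
# dispatch table. The bare pattern, separated from the Caffe specifics
# (illustrative sketch, not caffe2's class):
class TranslatorTable:
    _registry = {}

    @classmethod
    def register(cls, layer_type):
        def wrapper(func):
            cls._registry[layer_type] = func
            return func
        return wrapper

    @classmethod
    def translate(cls, layer_type, *args, **kwargs):
        try:
            return cls._registry[layer_type](*args, **kwargs)
        except KeyError as e:
            raise KeyError(
                "No translator registered for layer: %s yet." % layer_type
            ) from e

@TranslatorTable.register("ReLU")
def translate_relu(bottom, top):
    return ("Relu", bottom, top)

assert TranslatorTable.translate("ReLU", "in", "out") == ("Relu", "in", "out")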
- # print 'No pretrained layer for layer', layer.name - pretrained_blobs = [] - operators, params = cls.TranslateLayer( - layer, pretrained_blobs, is_test, net=net, - net_params=net_params, input_dims=input_dims) - net.op.extend(operators) - net_params.protos.extend(params) - if remove_legacy_pad: - assert input_dims, \ - 'Please specify input_dims to remove legacy_pad' - net = _RemoveLegacyPad(net, net_params, input_dims) - return net, net_params - - -def TranslateModel(*args, **kwargs): - return TranslatorRegistry.TranslateModel(*args, **kwargs) - - -def ConvertTensorProtosToInitNet(net_params, input_name): - """Takes the net_params returned from TranslateModel, and wrap it as an - init net that contain GivenTensorFill. - - This is a very simple feature that only works with float tensors, and is - only intended to be used in an environment where you want a single - initialization file - for more complex cases, use a db to store the - parameters. - """ - init_net = caffe2_pb2.NetDef() - for tensor in net_params.protos: - if len(tensor.float_data) == 0: - raise RuntimeError( - "Only float tensors are supported in this util.") - op = core.CreateOperator( - "GivenTensorFill", [], [tensor.name], - arg=[ - utils.MakeArgument("shape", list(tensor.dims)), - utils.MakeArgument("values", tensor.float_data)]) - init_net.op.extend([op]) - init_net.op.extend([core.CreateOperator("ConstantFill", [], [input_name], shape=[1])]) - return init_net - - -def BaseTranslate(layer, caffe2_type): - """A simple translate interface that maps the layer input and output.""" - caffe2_op = caffe2_pb2.OperatorDef() - caffe2_op.type = caffe2_type - caffe2_op.input.extend(layer.bottom) - caffe2_op.output.extend(layer.top) - return caffe2_op - - -def AddArgument(op, key, value): - """Makes an argument based on the value type.""" - op.arg.extend([utils.MakeArgument(key, value)]) - -################################################################################ -# Common translators for layers. -################################################################################ - - -@TranslatorRegistry.Register("Input") -def TranslateInput(layer, pretrained_blobs, is_test, **kwargs): - return [], [] - - -@TranslatorRegistry.Register("VideoData") -def TranslateVideoData(layer, pretrained_blobs, is_test, **kwargs): - return [], [] - - -@TranslatorRegistry.Register("Data") -def TranslateData(layer, pretrained_blobs, is_test, **kwargs): - return [], [] - - -# A function used in convolution, pooling and deconvolution to deal with -# conv pool specific parameters. -def _TranslateStridePadKernelHelper(param, caffe_op): - try: - if (len(param.stride) > 1 or len(param.kernel_size) > 1 or - len(param.pad) > 1): - raise NotImplementedError( - "Translator currently does not support non-conventional " - "pad/kernel/stride settings." - ) - stride = param.stride[0] if len(param.stride) else 1 - pad = param.pad[0] if len(param.pad) else 0 - kernel = param.kernel_size[0] if len(param.kernel_size) else 0 - except TypeError: - # This catches the case of a PoolingParameter, in which case we are - # having non-repeating pad, stride and kernel. 
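# The try/except above normalizes fields that are repeated in
# ConvolutionParameter but plain scalars in PoolingParameter. The same
# scalar-or-sequence normalization as a standalone helper (illustrative):
def first_or_default(value, default):
    """Return a scalar unchanged, the first element of a non-empty
    sequence, or the default for an empty sequence."""
    try:
        return value[0] if len(value) else default
    except TypeError:  # scalars have no len(): the PoolingParameter case
        return value

assert first_or_default([3], 1) == 3  # repeated field, one entry
assert first_or_default([], 1) == 1   # empty repeated field -> default
assert first_or_default(2, 1) == 2    # scalar field passes through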
- stride = param.stride - pad = param.pad - kernel = param.kernel_size - # Get stride - if param.HasField("stride_h") or param.HasField("stride_w"): - AddArgument(caffe_op, "stride_h", param.stride_h) - AddArgument(caffe_op, "stride_w", param.stride_w) - else: - AddArgument(caffe_op, "stride", stride) - # Get pad - if param.HasField("pad_h") or param.HasField("pad_w"): - if param.pad_h == param.pad_w: - AddArgument(caffe_op, "pad", param.pad_h) - else: - AddArgument(caffe_op, "pad_t", param.pad_h) - AddArgument(caffe_op, "pad_b", param.pad_h) - AddArgument(caffe_op, "pad_l", param.pad_w) - AddArgument(caffe_op, "pad_r", param.pad_w) - else: - AddArgument(caffe_op, "pad", pad) - # Get kernel - if param.HasField("kernel_h") or param.HasField("kernel_w"): - AddArgument(caffe_op, "kernel_h", param.kernel_h) - AddArgument(caffe_op, "kernel_w", param.kernel_w) - else: - AddArgument(caffe_op, "kernel", kernel) - - -@TranslatorRegistry.Register("Convolution3D") -def TranslateConvNd(layer, pretrained_blobs, is_test, **kwargs): - param = layer.convolution3d_param - caffe_op = BaseTranslate(layer, "Conv") - output = caffe_op.output[0] - caffe_op.input.append(output + '_w') - - AddArgument( - caffe_op, - "kernels", - [param.kernel_depth, param.kernel_size, param.kernel_size]) - AddArgument( - caffe_op, - "strides", - [param.temporal_stride, param.stride, param.stride]) - temporal_pad = 0 - spatial_pad = 0 - if hasattr(param, 'temporal_pad'): - temporal_pad = param.temporal_pad - if hasattr(param, 'pad'): - spatial_pad = param.pad - AddArgument(caffe_op, "pads", [temporal_pad, spatial_pad, spatial_pad] * 2) - - # weight - params = [ - utils.NumpyArrayToCaffe2Tensor(pretrained_blobs[0], output + '_w')] - # bias - if len(pretrained_blobs) == 2: - caffe_op.input.append(output + '_b') - params.append( - utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), output + '_b')) - return caffe_op, params - - -@TranslatorRegistry.Register("Convolution") -def TranslateConv(layer, pretrained_blobs, is_test, **kwargs): - param = layer.convolution_param - caffe_op = BaseTranslate(layer, "Conv") - output = caffe_op.output[0] - caffe_op.input.append(output + '_w') - _TranslateStridePadKernelHelper(param, caffe_op) - # weight - params = [ - utils.NumpyArrayToCaffe2Tensor(pretrained_blobs[0], output + '_w')] - # bias - if len(pretrained_blobs) == 2: - caffe_op.input.append(output + '_b') - params.append( - utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), output + '_b')) - # Group convolution option - if param.group != 1: - AddArgument(caffe_op, "group", param.group) - # Get dilation - not tested. If you have a model and this checks out, - # please provide a test and uncomment this. - if len(param.dilation) > 0: - if len(param.dilation) == 1: - AddArgument(caffe_op, "dilation", param.dilation[0]) - elif len(param.dilation) == 2: - AddArgument(caffe_op, "dilation_h", param.dilation[0]) - AddArgument(caffe_op, "dilation_w", param.dilation[1]) - return caffe_op, params - - -@TranslatorRegistry.Register("Deconvolution") -def TranslateDeconv(layer, pretrained_blobs, is_test, **kwargs): - param = layer.convolution_param - if param.group > 1: - raise NotImplementedError( - "Translator currently does not support group deconvolution." 
- ) - caffe_op = BaseTranslate(layer, "ConvTranspose") - output = caffe_op.output[0] - _TranslateStridePadKernelHelper(param, caffe_op) - caffe_op.input.extend([output + '_w']) - AddArgument(caffe_op, "order", "NCHW") - weight = utils.NumpyArrayToCaffe2Tensor(pretrained_blobs[0], output + '_w') - if param.bias_term: - bias = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), output + '_b' - ) - caffe_op.input.extend([output + '_b']) - return caffe_op, [weight, bias] - else: - return caffe_op, [weight] - - -@TranslatorRegistry.Register("Crop") -def TranslateCrop(layer, pretrained_blobs, is_test, **kwargs): - net, net_params, input_dims = kwargs['net'], kwargs['net_params'], kwargs['input_dims'] - n, c, h, w = input_dims - dummy_input = np.random.randn(n, c, h, w).astype(np.float32) - dim_map = _GetBlobDimMap(net, net_params, dummy_input) - param = layer.crop_param - axis, offsets = param.axis, param.offset - caffe_op = BaseTranslate(layer, "Slice") - input_1 = caffe_op.input[1] - input_1_dim = dim_map[input_1] - starts, ends = [], [] - dims = len(dim_map[input_1]) - assert len(offsets) == 1, 'Caffe Translator for Crop only works for offset \ - of 1 for now' - for _ in range(axis): - starts.append(0) - ends.append(-1) - end_offset = [int(offsets[0] + input_1_dim[i]) for i in range(axis, dims)] - ends.extend(end_offset) - starts.extend([offsets[0]] * len(end_offset)) - op = caffe2_pb2.OperatorDef() - op.input.extend([caffe_op.input[0]]) - op.output.extend(caffe_op.output) - op.arg.extend(caffe_op.arg) - op.type = caffe_op.type - AddArgument(op, "starts", starts) - AddArgument(op, "ends", ends) - return op, [] - -@TranslatorRegistry.Register("ReLU") -def TranslateRelu(layer, pretrained_blobs, is_test, **kwargs): - return BaseTranslate(layer, "Relu"), [] - - -@TranslatorRegistry.Register("Pooling") -def TranslatePool(layer, pretrained_blobs, is_test, **kwargs): - param = layer.pooling_param - if param.pool == caffe_pb2.PoolingParameter.MAX: - caffe_op = BaseTranslate(layer, "MaxPool") - elif param.pool == caffe_pb2.PoolingParameter.AVE: - caffe_op = BaseTranslate(layer, "AveragePool") - _TranslateStridePadKernelHelper(param, caffe_op) - AddArgument(caffe_op, "order", "NCHW") - try: - # In the Facebook port of Caffe, a torch_pooling field was added to - # map the pooling computation of Torch. Essentially, it uses - # floor((height + 2 * padding - kernel) / stride) + 1 - # instead of - # ceil((height + 2 * padding - kernel) / stride) + 1 - # which is Caffe's version. - # Torch pooling is actually the same as Caffe2 pooling, so we don't - # need to do anything. 
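# The rounding difference described above, made concrete: Caffe computes the
# pooled output size with ceil, Torch/Caffe2 with floor, so legacy_pad only
# matters when (size + 2*pad - kernel) is not divisible by the stride.
import math

def caffe_pool_out(size, pad, kernel, stride):
    return int(math.ceil((size + 2 * pad - kernel) / stride)) + 1

def caffe2_pool_out(size, pad, kernel, stride):
    return int(math.floor((size + 2 * pad - kernel) / stride)) + 1

assert caffe_pool_out(7, 0, 3, 2) == caffe2_pool_out(7, 0, 3, 2) == 3
assert caffe_pool_out(6, 0, 3, 2) == 3   # ceil(1.5) + 1
assert caffe2_pool_out(6, 0, 3, 2) == 2  # floor(1.5) + 1: one row shorter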
- is_torch_pooling = param.torch_pooling - except AttributeError: - is_torch_pooling = False - if not is_torch_pooling: - AddArgument(caffe_op, "legacy_pad", - caffe2_legacy_pb2.CAFFE_LEGACY_POOLING) - if param.global_pooling: - AddArgument(caffe_op, "global_pooling", 1) - return caffe_op, [] - - -@TranslatorRegistry.Register("Pooling3D") -def TranslatePool3D(layer, pretrained_blobs, is_test, **kwargs): - param = layer.pooling3d_param - if param.pool == caffe_pb2.Pooling3DParameter.MAX: - caffe_op = BaseTranslate(layer, "MaxPool") - - elif param.pool == caffe_pb2.Pooling3DParameter.AVE: - caffe_op = BaseTranslate(layer, "AveragePool") - AddArgument(caffe_op, "order", "NCHW") - AddArgument( - caffe_op, - "kernels", - [param.kernel_depth, param.kernel_size, param.kernel_size]) - - AddArgument( - caffe_op, - "strides", - [param.temporal_stride, param.stride, param.stride]) - temporal_pad = 0 - spatial_pad = 0 - if hasattr(param, 'temporal_pad'): - temporal_pad = param.temporal_pad - if hasattr(param, 'pad'): - spatial_pad = param.pad - AddArgument(caffe_op, "pads", [temporal_pad, spatial_pad, spatial_pad] * 2) - return caffe_op, [] - - -@TranslatorRegistry.Register("LRN") -def TranslateLRN(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "LRN") - caffe_op.output.extend(['_' + caffe_op.output[0] + '_scale']) - param = layer.lrn_param - if param.norm_region != caffe_pb2.LRNParameter.ACROSS_CHANNELS: - raise ValueError( - "Does not support norm region other than across channels.") - AddArgument(caffe_op, "size", int(param.local_size)) - AddArgument(caffe_op, "alpha", float(param.alpha)) - AddArgument(caffe_op, "beta", float(param.beta)) - AddArgument(caffe_op, "bias", float(param.k)) - AddArgument(caffe_op, "order", "NCHW") - return caffe_op, [] - - -@TranslatorRegistry.Register("InnerProduct") -def TranslateInnerProduct(layer, pretrained_blobs, is_test, **kwargs): - param = layer.inner_product_param - try: - if param.axis != 1 or param.transpose: - raise ValueError( - "We don't have testing case for non-default axis and transpose " - "cases yet so we are disabling it for now. If you have a model " - "with this, please do send us your model for us to update this " - "support, and you are more than welcome to send a PR for this.") - except AttributeError: - # We might be using an historic Caffe protobuf that does not have axis - # and transpose arguments, so we will silently pass. - pass - caffe_op = BaseTranslate(layer, "FC") - output = caffe_op.output[0] - caffe_op.input.extend([output + '_w', output + '_b']) - # To provide the old-style 4-dimensional blob (1, 1, dim_output, dim_input) - # case, we always explicitly reshape the pretrained blob. 
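# The reshape mentioned above: old-style Caffe stored FC weights as a 4-D
# (1, 1, dim_output, dim_input) blob; the translator flattens it to the 2-D
# matrix that Caffe2's FC expects (NumPy illustration):
import numpy as np

w4 = np.random.rand(1, 1, 10, 20).astype(np.float32)  # old-style 4-D blob
w2 = w4.reshape(-1, w4.shape[-1])                      # -> (10, 20)
assert w2.shape == (10, 20)
assert np.array_equal(w2, w4[0, 0])  # same data, now a plain weight matrix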
- if pretrained_blobs[0].ndim not in [2, 4]: - raise ValueError("Unexpected weight ndim.") - if (pretrained_blobs[0].ndim == 4 and - list(pretrained_blobs[0].shape[:2]) != [1, 1]): - raise ValueError( - "If pretrained blob has 4 dims (old-style Caffe), the first two " - "should be of value 1, but I got " + str(pretrained_blobs[0].shape)) - weight = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[0].reshape(-1, pretrained_blobs[0].shape[-1]), - output + '_w' - ) - bias = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), output + '_b' - ) - return caffe_op, [weight, bias] - - -@TranslatorRegistry.Register("Dropout") -def TranslateDropout(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Dropout") - caffe_op.output.extend(['_' + caffe_op.output[0] + '_mask']) - param = layer.dropout_param - AddArgument(caffe_op, "ratio", param.dropout_ratio) - if (is_test): - AddArgument(caffe_op, "is_test", 1) - return caffe_op, [] - - -@TranslatorRegistry.Register("Softmax") -def TranslateSoftmax(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Softmax") - return caffe_op, [] - - -@TranslatorRegistry.Register("SoftmaxWithLoss") -def TranslateSoftmaxWithLoss(layer, pretrained_blobs, is_test, **kwargs): - softmax_op = core.CreateOperator( - "Softmax", [layer.bottom[0]], - layer.bottom[0] + "_translator_autogen_softmax") - xent_op = core.CreateOperator( - "LabelCrossEntropy", - [softmax_op.output[0], layer.bottom[1]], - layer.bottom[0] + "_translator_autogen_xent") - loss_op = core.CreateOperator( - "AveragedLoss", - xent_op.output[0], - layer.top[0]) - return [softmax_op, xent_op, loss_op], [] - - -@TranslatorRegistry.Register("Accuracy") -def TranslateAccuracy(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Accuracy") - if layer.accuracy_param.top_k != 1: - AddArgument(caffe_op, "top_k", layer.accuracy_param.top_k) - return caffe_op, [] - - -@TranslatorRegistry.Register("Concat") -def TranslateConcat(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Concat") - caffe_op.output.extend(['_' + caffe_op.output[0] + '_dims']) - AddArgument(caffe_op, "order", "NCHW") - return caffe_op, [] - - -@TranslatorRegistry.Register("TanH") -def TranslateTanH(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Tanh") - return caffe_op, [] - - -@TranslatorRegistry.Register("InstanceNorm") -def TranslateInstanceNorm(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "InstanceNorm") - output = caffe_op.output[0] - weight = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[0].flatten(), output + '_w') - bias = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), output + '_b') - caffe_op.input.extend([output + '_w', output + '_b']) - AddArgument(caffe_op, "order", "NCHW") - return caffe_op, [weight, bias] - - -@TranslatorRegistry.Register("BatchNorm") -def TranslateBatchNorm(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "SpatialBN") - output = caffe_op.output[0] - param = layer.batch_norm_param - AddArgument(caffe_op, "is_test", is_test) - AddArgument(caffe_op, "epsilon", param.eps) - AddArgument(caffe_op, "order", "NCHW") - - caffe_op.input.extend( - [output + "_scale", - output + "_bias", - output + "_mean", - output + "_var"]) - if not is_test: - caffe_op.output.extend( - [output + "_mean", - output + "_var", - output + "_saved_mean", - output + "_saved_var"]) - - n_channels = 
pretrained_blobs[0].shape[0] - if pretrained_blobs[2][0] != 0: - mean = utils.NumpyArrayToCaffe2Tensor( - (1. / pretrained_blobs[2][0]) * pretrained_blobs[0], - output + '_mean') - var = utils.NumpyArrayToCaffe2Tensor( - (1. / pretrained_blobs[2][0]) * pretrained_blobs[1], - output + '_var') - else: - raise RuntimeError("scalar is zero.") - if len(pretrained_blobs) > 3: - # IntelCaffe and NVCaffe use fused BN+Scale, - # three blobs for BN and two blobs for Scale, - # so that the total number of blobs becomes five (including scale and bias). - scale = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[3].flatten(), - output + '_scale') - bias = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[4].flatten(), - output + '_bias') - else: - pretrained_blobs[2][0] = 1 - pretrained_blobs[2] = np.tile(pretrained_blobs[2], (n_channels, )) - scale = utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[2], - output + '_scale') - bias = utils.NumpyArrayToCaffe2Tensor( - np.zeros_like(pretrained_blobs[2]), - output + '_bias') - - return caffe_op, [scale, bias, mean, var] - - -@TranslatorRegistry.Register("Eltwise") -def TranslateElementWise(layer, pretrained_blobs, is_test, **kwargs): - param = layer.eltwise_param - # TODO(jiayq): if we have a protobuf that uses this, lift this constraint - # and verify that we can correctly translate. - if len(param.coeff) or param.operation != 1: - raise RuntimeError("This eltwise layer is not yet supported.") - caffe_op = BaseTranslate(layer, "Sum") - return caffe_op, [] - - -@TranslatorRegistry.Register("Scale") -def TranslateScale(layer, pretrained_blobs, is_test, **kwargs): - mul_op = BaseTranslate(layer, "Mul") - scale_param = layer.scale_param - AddArgument(mul_op, "axis", scale_param.axis) - AddArgument(mul_op, "broadcast", True) - if len(mul_op.input) == 1: - # the scale parameter is in pretrained blobs - if scale_param.num_axes != 1: - raise RuntimeError("This path has not been verified yet.") - - output = mul_op.output[0] - mul_op_param = output + 'scale_w' - mul_op.input.append(mul_op_param) - weights = [] - weights.append(utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[0].flatten(), mul_op_param)) - - add_op = None - if len(pretrained_blobs) == 1: - # No bias-term in Scale layer - pass - elif len(pretrained_blobs) == 2: - # Caffe Scale layer supports a bias term such that it computes - # (scale_param * X + bias), whereas Caffe2 Mul op doesn't. - # Include a separate Add op for the bias, applied after the Mul. - add_op = copy.deepcopy(mul_op) - add_op.type = "Add" - add_op_param = output + 'scale_b' - internal_blob = output + "_internal" - del mul_op.output[:] - mul_op.output.append(internal_blob) - del add_op.input[:] - add_op.input.append(internal_blob) - add_op.input.append(add_op_param) - weights.append(utils.NumpyArrayToCaffe2Tensor( - pretrained_blobs[1].flatten(), add_op_param)) - else: - raise RuntimeError("Unexpected number of pretrained blobs in Scale") - - caffe_ops = [mul_op] - if add_op: - caffe_ops.append(add_op) - assert len(caffe_ops) == len(weights) - return caffe_ops, weights - elif len(mul_op.input) == 2: - # TODO(jiayq): find a protobuf that uses this and verify.
- raise RuntimeError("This path has not been verified yet.") - else: - raise RuntimeError("Unexpected number of inputs.") - - -@TranslatorRegistry.Register("Reshape") -def TranslateReshape(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Reshape") - caffe_op.output.append("_" + caffe_op.input[0] + "_dims") - reshape_param = layer.reshape_param - AddArgument(caffe_op, 'shape', reshape_param.shape.dim) - return caffe_op, [] - - -@TranslatorRegistry.Register("Flatten") -def TranslateFlatten(layer, pretrained_blobs, is_test, **kwargs): - param = layer.flatten_param - if param.end_axis != -1: - raise NotImplementedError("flatten_param.end_axis not supported yet.") - - if param.axis == 0: - caffe_op = BaseTranslate(layer, "FlattenToVec") - elif param.axis == 1: - caffe_op = BaseTranslate(layer, "Flatten") - else: - # This could be a Reshape op, but dim size is not known here. - raise NotImplementedError( - "Not supported yet for flatten_param.axis {}.".format(param.axis)) - - return caffe_op, [] - - -@TranslatorRegistry.Register("Sigmoid") -def TranslateSigmoid(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "Sigmoid") - return caffe_op, [] - - -@TranslatorRegistry.Register("ROIPooling") -def TranslateROIPooling(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "RoIPool") - AddArgument(caffe_op, "order", "NCHW") - - if is_test: - AddArgument(caffe_op, "is_test", is_test) - else: - # Only used for gradient computation - caffe_op.output.append(caffe_op.output[0] + '_argmaxes') - - param = layer.roi_pooling_param - if param.HasField('pooled_h'): - AddArgument(caffe_op, 'pooled_h', param.pooled_h) - if param.HasField('pooled_w'): - AddArgument(caffe_op, 'pooled_w', param.pooled_w) - if param.HasField('spatial_scale'): - AddArgument(caffe_op, 'spatial_scale', param.spatial_scale) - - return caffe_op, [] - - -@TranslatorRegistry.Register("PReLU") -def TranslatePRelu(layer, pretrained_blobs, is_test, **kwargs): - caffe_op = BaseTranslate(layer, "PRelu") - output = caffe_op.output[0] - caffe_op.input.extend([output + '_Slope']) - slope = utils.NumpyArrayToCaffe2Tensor(pretrained_blobs[0], output + '_Slope') - - return caffe_op, [slope] - - -@TranslatorRegistry.Register("Reduction") -def TranslateReduction(layer, pretrained_blobs, is_test, **kwargs): - param = layer.reduction_param - if param.operation == caffe_pb2.ReductionParameter.SUM: - caffe_op = BaseTranslate(layer, "ReduceBackSum") - elif param.operation == caffe_pb2.ReductionParameter.MEAN: - caffe_op = BaseTranslate(layer, "ReduceBackMean") - else: - raise NotImplementedError("Not yet supported") - - if param.axis > 0: - # We can't figure out the number of dims to reduce from positive axis - # for back reduction since the shape info is not known here. 
- raise NotImplementedError("Not yet supported") - num_reduce_dim = -param.axis - AddArgument(caffe_op, "num_reduce_dim", num_reduce_dim) - - return caffe_op, [] - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="Utility to convert pretrained caffe models to Caffe2 models.") - parser.add_argument("prototext", help="Caffe prototxt.") - parser.add_argument("caffemodel", help="Caffe trained model.") - parser.add_argument("--init_net", help="Caffe2 initialization net.", - default="init_net.pb") - parser.add_argument("--predict_net", help="Caffe2 prediction net.", - default="predict_net.pb") - parser.add_argument("--remove_legacy_pad", help="Remove legacy pad \ - (Only works for nets with one input blob)", - action="store_true", - default=False) - parser.add_argument("--input_dims", help="Dimension of input blob", nargs='+', - type=int, default=[]) - args = parser.parse_args() - - caffenet = caffe_pb2.NetParameter() - caffenet_pretrained = caffe_pb2.NetParameter() - input_proto = args.prototext - input_caffemodel = args.caffemodel - output_init_net = args.init_net - output_predict_net = args.predict_net - - with open(input_proto) as f: - text_format.Merge(f.read(), caffenet) - with open(input_caffemodel, 'rb') as f: - caffenet_pretrained.ParseFromString(f.read()) - net, pretrained_params = TranslateModel( - caffenet, caffenet_pretrained, is_test=True, - remove_legacy_pad=args.remove_legacy_pad, - input_dims=args.input_dims - ) - - # Assume there is one input and one output - external_input = net.op[0].input[0] - external_output = net.op[-1].output[0] - - net.external_input.extend([external_input]) - net.external_input.extend([param.name for param in pretrained_params.protos]) - net.external_output.extend([external_output]) - init_net = ConvertTensorProtosToInitNet(pretrained_params, external_input) - - with open(output_predict_net, 'wb') as f: - f.write(net.SerializeToString()) - with open(output_predict_net + 'txt', 'w') as f: - f.write(str(net)) - with open(output_init_net, 'wb') as f: - f.write(init_net.SerializeToString()) diff --git a/caffe2/python/caffe_translator_test.py b/caffe2/python/caffe_translator_test.py deleted file mode 100644 index ff24afe975f2..000000000000 --- a/caffe2/python/caffe_translator_test.py +++ /dev/null @@ -1,90 +0,0 @@ -# This is a large test that goes through the translation of the bvlc caffenet -model, runs an example through the whole model, and verifies numerically -that all the results look right. By default, it is disabled unless you -explicitly want to run it. - -from google.protobuf import text_format -import numpy as np -import os -import sys -
-CAFFE_FOUND = False -try: - from caffe.proto import caffe_pb2 - from caffe2.python import caffe_translator - CAFFE_FOUND = True -except Exception as e: - # Safeguard so that we only catch the caffe module not found exception. - if ("'caffe'" in str(e)): - print( - "PyTorch/Caffe2 now requires a separate installation of caffe. " - "Right now, this is not found, so we will skip the caffe " - "translator test.") - -from caffe2.python import utils, workspace, test_util -import unittest - -def setUpModule(): - # Do nothing if caffe and test data are not found - if not (CAFFE_FOUND and os.path.exists('data/testdata/caffe_translator')): - return - # We will do all the computation stuff in the global space.
- caffenet = caffe_pb2.NetParameter() - caffenet_pretrained = caffe_pb2.NetParameter() - with open('data/testdata/caffe_translator/deploy.prototxt') as f: - text_format.Merge(f.read(), caffenet) - with open('data/testdata/caffe_translator/' - 'bvlc_reference_caffenet.caffemodel', 'rb') as f: - caffenet_pretrained.ParseFromString(f.read()) - for remove_legacy_pad in [True, False]: - net, pretrained_params = caffe_translator.TranslateModel( - caffenet, caffenet_pretrained, is_test=True, - remove_legacy_pad=remove_legacy_pad - ) - with open('data/testdata/caffe_translator/' - 'bvlc_reference_caffenet.translatedmodel', - 'w') as fid: - fid.write(str(net)) - for param in pretrained_params.protos: - workspace.FeedBlob(param.name, utils.Caffe2TensorToNumpyArray(param)) - # Let's also feed in the data from the Caffe test code. - data = np.load('data/testdata/caffe_translator/data_dump.npy').astype( - np.float32) - workspace.FeedBlob('data', data) - # Actually running the test. - workspace.RunNetOnce(net.SerializeToString()) - - -@unittest.skipIf(not CAFFE_FOUND, - 'No Caffe installation found.') -@unittest.skipIf(not os.path.exists('data/testdata/caffe_translator'), - 'No testdata existing for the caffe translator test. Exiting.') -class TestNumericalEquivalence(test_util.TestCase): - def testBlobs(self): - names = [ - "conv1", "pool1", "norm1", "conv2", "pool2", "norm2", "conv3", - "conv4", "conv5", "pool5", "fc6", "fc7", "fc8", "prob" - ] - for name in names: - print('Verifying {}'.format(name)) - caffe2_result = workspace.FetchBlob(name) - reference = np.load( - 'data/testdata/caffe_translator/' + name + '_dump.npy' - ) - self.assertEqual(caffe2_result.shape, reference.shape) - scale = np.max(caffe2_result) - np.testing.assert_almost_equal( - caffe2_result / scale, - reference / scale, - decimal=5 - ) - - -if __name__ == '__main__': - if len(sys.argv) == 1: - print( - 'If you do not explicitly ask to run this test, I will not run it. ' - 'Pass in any argument to have the test run for you.' - ) - sys.exit(0) - unittest.main() diff --git a/caffe2/python/checkpoint.py b/caffe2/python/checkpoint.py deleted file mode 100644 index 0b6baea95265..000000000000 --- a/caffe2/python/checkpoint.py +++ /dev/null @@ -1,833 +0,0 @@ -## @package checkpoint -# Module caffe2.python.checkpoint - - - - - -import os -import logging -from caffe2.python import core, context -from caffe2.python.net_builder import ops -from caffe2.python.task import ( - final_output, - Node, - Task, - TaskGroup, - TaskOutput, - WorkspaceType, -) - -logger = logging.getLogger(__name__) - - - -class Job(context.Managed): - """ - A Job defines four TaskGroups: the `init_group`, the `epoch_group`, the - `download_group` and the `exit_group`, which will be run by a JobRunner. - - The `init_group` will be run only once at startup. Its role is to - initialize globally persistent blobs such as model weights, accumulators - and data file lists. - - The `epoch_group` will be run in a loop after init_group. The loop will - exit when any of the stop signals added with `add_stop_condition` is True - at the end of an epoch. - - The `download_group` will be run only once, after all executions of the - `epoch_group` finish. Its role is to collect the scattered parameters - back from the distributed nodes after training. - - The `exit_group` will be run only once at the very end of the job; its - role is to save the results of training at the end of the job.
- - Jobs are context-driven, so that Tasks can be added to the active Job - without having to explicitly pass the job object around. - - Example of usage: - - def build_reader(partitions): - with Job.current().init_group: - reader = HiveReader(init_reader, ..., partitions) - Task(step=init_reader) - with Job.current().epoch_group: - limited_reader = ReaderWithLimit(reader, num_iter=10000) - data_queue = pipe(limited_reader, num_threads=8) - Job.current().add_stop_condition(limited_reader.data_finished()) - return data_queue - - def build_hogwild_trainer(reader, model): - with Job.current().init_group: - Task(step=model.param_init_net) - with Job.current().epoch_group: - pipe(reader, processor=model, num_threads=8) - with Job.current().exit_group: - Task(step=model.save_model_net) - - with Job() as job: - reader = build_reader(partitions) - model = build_model(params) - build_hogwild_trainer(reader, model) - """ - def __init__(self, - init_group=None, epoch_group=None, - download_group=None, exit_group=None, - stop_conditions=None, nodes_to_checkpoint=None): - self.init_group = init_group or TaskGroup( - workspace_type=WorkspaceType.GLOBAL) - self.epoch_group = epoch_group or TaskGroup() - self.download_group = download_group or TaskGroup() - self.exit_group = exit_group or TaskGroup() - self.stop_conditions = stop_conditions or [] - self._nodes_to_checkpoint = nodes_to_checkpoint - - def nodes_to_checkpoint(self): - if self._nodes_to_checkpoint: - return self._nodes_to_checkpoint - else: - return self.init_group.used_nodes() - - def compile(self, session_class): - self._nodes_to_checkpoint = self.nodes_to_checkpoint() - self.init_group = session_class.compile(self.init_group) - self.epoch_group = session_class.compile(self.epoch_group) - self.download_group = session_class.compile(self.download_group) - self.exit_group = session_class.compile(self.exit_group) - - def __enter__(self): - super().__enter__() - self.epoch_group.__enter__() - return self - - def __exit__(self, *args): - self.epoch_group.__exit__() - super().__exit__(*args) - - def add_stop_condition(self, output): - if isinstance(output, core.BlobReference): - t = Task(outputs=[output], group=self.epoch_group) - output = t.outputs()[0] - assert isinstance(output, TaskOutput) - self.stop_conditions.append(output) - - -def get_ckpt_filename(node_name, epoch): - """Returns the checkpoint filename. - - Args: - node_name: A string. The name of the node. - epoch: An integer. The checkpoint epoch. - - Returns: - ckpt_filename: A string. The filename of the checkpoint. - """ - return node_name + '.' + str(epoch) - - -def db_name(epoch, node_name, db_prefix, path_prefix=None): - """Returns the full db name where checkpoint files are saved. - - Args: - epoch: An integer. The checkpoint epoch. - node_name: A string. The name of the node. - db_prefix: A string. The prefix used to construct full db name. - path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are stored. - Returns: - db_name: A string. The absolute path of full_db_name where checkpoint - files are saved - """ - if path_prefix: - db_name = path_prefix + get_ckpt_filename(node_name, epoch) - else: - ckpt_filename = get_ckpt_filename(node_name, epoch) - db_name = os.path.join(db_prefix, ckpt_filename) - return db_name - - -class CheckpointManager: - """ - Controls saving and loading of workspaces on every epoch boundary of a job. 
- If a CheckpointManager instance is passed to JobRunner, then JobRunner will - call `init`, `read` and `save` at different moments in between epoch runs. - - Args: - db_prefix: The prefix used to construct full db name. Since `absolute_path` - is set to True, this will be used as db_name in SaveOp. - node_name: Name of the node where this checkpoint_manager is used. - db_type: Type of database to use for storing checkpoint. - metadata_handler: An optional object capable of reading/writing - checkpoint info in storage of choice. - """ - - BLOB_NAMES = "blob_names" - - def __init__(self, db_prefix, node_name, db_type, metadata_handler=None): - self._db_prefix = db_prefix - self._node_name = node_name - self._db_type = db_type - self._metadata_handler = metadata_handler - # make sure these blobs are the first in the checkpoint file. - self._net = core.Net('!!checkpoint_mngr') - self._blob_names = self._net.AddExternalInput(self.BLOB_NAMES) - self._names_output = None - self._path_prefix = None - self._path_type = None - self._current_db_name = None - self._current_checkpoint_duration = None - - """ - Initialize the checkpoint manager. Determines all blobs that need to be saved - or loads from a checkpoint. - - Args: - nodes: An array of nodes where this checkpoint manager is running. Should - only contain a single node. - retrieve_from_epoch: Set to a number to load blobs from this epoch. - path_prefix: Used to construct db name or path where checkpoint files are - stored. - path_type: Indicate the type of path where checkpoint files are stored. - """ - def init( - self, - nodes=None, - retrieve_from_epoch=None, - path_prefix=None, - path_type=None - ): - """ - Build a Task that will be run once after the job's `init_group` is run. - This task will determine which blobs need to be checkpointed. - If retrieve_from_epoch is not None, then the checkpoint metadata is - retrieved from a previously saved checkpoint. - """ - assert nodes is None or len(nodes) == 1, ( - 'CheckpointManager only supports single node.') - - with Task(outputs=[self._blob_names]) as task: - if retrieve_from_epoch is None: - ops.GetAllBlobNames( - [], - self._blob_names, - include_shared=False) - else: - full_db_name = db_name(retrieve_from_epoch, - self._node_name, self._db_prefix, path_prefix) - db_type = path_type or self._db_type - logger.info("Initializing checkpoints from = %s" - % full_db_name) - ops.Load( - [], self._blob_names, - db=full_db_name, - db_type=db_type, - absolute_path=True, - keep_device=True, - ) - self._names_output = task.outputs()[0] - return task - - def blob_list(self): - assert self._names_output - return self._names_output.fetch().tolist() - - def _timed_task(self, cp_op_name, add_op): - """ - Build a Task that will measure the time span of checkpoint operations, - once operation is done, time can be read from _current_checkpoint_duration. - - Args: - cp_op_name: A string name of the checkpoint operation. - add_op: A functor to add the checkpoint operation. - - Returns: - A task with timer. - """ - with Task(name=cp_op_name) as task: - with ops.task_init(): - timer = ops.TimerBegin([], counter_name=self._node_name) - add_op() - with ops.task_exit(): - time_span_blob = ops.TimerGetAndEnd(timer) - self._current_checkpoint_duration = final_output(time_span_blob) - return task - - def collect_checkpoint_stats(self, stats): - """ - Add one checkpoint stats into the stats. - - Args: - stats: A dict of checkpoint stats that will be reported. 
- """ - if self._current_db_name and self._current_checkpoint_duration: - stats[self._current_db_name] = self._current_checkpoint_duration.fetch()[0] - else: - logger.info( - "Failed to collect checkpoint stats: {}".format( - self._current_db_name - ) - ) - - def load(self, epoch, path_prefix=None, path_type=None): - """ - Build a Task that will be run by JobRunner when the job is to be - resumed from a given epoch. This task will run a Load op that will - load and deserialize all relevant blobs from a persistent storage. - """ - self._current_db_name = db_name( - epoch, self._node_name, self._db_prefix, path_prefix - ) - db_type = path_type or self._db_type - logger.info("Loading checkpoints from = %s" % self._current_db_name) - - def add_op(): - ops.Load( - [], - self.blob_list(), - db=self._current_db_name, - db_type=db_type, - absolute_path=True, - keep_device=True, - ) - - return self._timed_task('checkpoint_load', add_op) - - def load_blobs_from_checkpoint(self, blob_names, epoch): - """ - Builds a Task that loads only the necessary blobs from a checkpoint of - the given epoch. The necessary blobs are given in the blob_names - argument. - - Args: - blob_names: A list of strings. Each string is the name of a - blob. - epoch: The checkpoint epoch to load from. - - Returns: - A Task which loads the specified blobs from the checkpoint of the - given epoch. - """ - self._current_db_name = db_name(epoch, self._node_name, self._db_prefix) - logger.info('Load from %s' % self._current_db_name) - - def add_op(): - ops.Load( - [], - blob_names, - db=self._current_db_name, - db_type=self._db_type, - absolute_path=True, - allow_incomplete=True) - - return self._timed_task('checkpoint_partial_load', add_op) - - def check_db_exists(self, epoch): - logger.info('Check existence of %s' % - db_name(epoch, self._node_name, self._db_prefix)) - with Task() as task: - existence = ops.Const(False) - ops.DBExists( - [], - [existence], - db_name=db_name(epoch, self._node_name, self._db_prefix), - db_type=self._db_type, - absolute_path=True) - task.add_output(existence) - return task - - def report_checkpoint_stats(self, action_name): - """ - Report checkpoint operation stats for current node. - - Args: - action_name: A string of the name of checkpoint operation. - """ - all_stats = {} - self.collect_checkpoint_stats(all_stats) - if self._metadata_handler: - self._metadata_handler.report(action_name, all_stats) - - def save(self, epoch): - """ - Build a Task that is run once after `init_group` and after each - epoch is run. This will execute a Save ops to serialize and persist - blobs present in the global workspace. - """ - self._current_db_name = db_name(epoch, self._node_name, self._db_prefix) - logger.info('Saving to %s' % self._current_db_name) - - def add_op(): - ops.Save( - self.blob_list(), [], - db=self._current_db_name, - db_type=self._db_type, - absolute_path=True) - - return self._timed_task('checkpoint_save', add_op) - - def write_checkpoint_metadata(self, epoch): - """ - Write metadata for checkpoint - - Args: - epoch: An integer. The epoch-id for which checkpoint metadata is - written - """ - if self._metadata_handler is not None: - self._metadata_handler.write(epoch=epoch) - - def get_resume_from_epoch_id(self, user_epoch=None): - """ - Identify the epoch-id from which Job must resume - - Args: - user_epoch: An integer. 
Optional parameter for user to explicitly - identify the epoch-id to load checkpoint from - Returns: - epoch: the epoch-id to load checkpoints from - or None if no checkpoints were written - """ - last_epoch = user_epoch - if self._metadata_handler is not None: - last_epoch = self._metadata_handler.last_epoch(user_epoch=user_epoch) - return last_epoch - - def set_params(self, nodes, path_prefix=None, path_type=None): - """Set parameters associated with CP manager - - Args: - nodes: An array of nodes where this checkpoint manager is running. - path_prefix: Used to construct db name or path where checkpoint files are - stored. - path_type: Indicate the type of path where checkpoint files are stored. - """ - if path_prefix: - self._path_prefix = path_prefix - if path_type: - self._path_type = path_type - if self._metadata_handler: - self._metadata_handler.set_params( - db_prefix=self._db_prefix, - db_type=self._db_type, - node_names=[str(self._node_name)], - path_prefix=self._path_prefix, - path_type=self._path_type) - - def cp_accessible(self, epoch=None): - """Returns True if Checkpoint data is accessible - - Args: - epoch: An integer. The epoch of the checkpoint. If None, - it implies we need to check if checkpoint directory is accessible - - Returns: - is_cp_accessible: A boolean. Returns True if Checkpoint data is accessible - """ - if self._metadata_handler is not None: - return self._metadata_handler.cp_accessible(epoch) - else: - return True - - -class MultiNodeCheckpointManager: - """ - Coordinates checkpoint saving and loading across multiple nodes. - Each of `init`, `load` and `save` will build TaskGroups which will - trigger checkpointing on each of the nodes involved in a distributed job. - - Args: - db_prefix: The prefix used to construct full db name. Since `absolute_path` - is set to True, this will be used as db_name in SaveOp. - db_type: Type of database to use for storing checkpoint. - metadata_handler: An optional object capable of reading/writing - checkpoint info in storage of choice. - """ - def __init__(self, db_prefix, db_type, metadata_handler=None): - self._node_managers = None - self._db_prefix = db_prefix - self._db_type = db_type - self._metadata_handler = metadata_handler - self._path_prefix = None - self._path_type = None - - def _task_group(self, func, *args, **kw): - assert self._node_managers is not None, 'init must be called first.' - with TaskGroup(WorkspaceType.GLOBAL) as task_group: - for node, manager in self._node_managers: - with Node(node): - func(manager, *args, **kw) - return task_group - - """ - Args: - nodes: An array of nodes where this checkpoint manager is running. - retrieve_from_epoch: Set to a number to load blobs from this epoch. - path_prefix: Used to construct db name or path where checkpoint files are - stored. - path_type: Indicate the type of path where checkpoint files are stored.
- """ - def init( - self, nodes, retrieve_from_epoch=None, path_prefix=None, path_type=None - ): - if self._node_managers is not None: - assert [node for node, _ in self._node_managers] == nodes - return TaskGroup(WorkspaceType.GLOBAL) - self._node_managers = [] - for node in nodes: - with Node(node): - manager = CheckpointManager( - db_prefix=self._db_prefix, - node_name=str(node), - db_type=self._db_type) - self._node_managers.append((node, manager)) - return self._task_group( - CheckpointManager.init, - nodes=[node], - retrieve_from_epoch=retrieve_from_epoch, - path_prefix=path_prefix, - path_type=path_type) - - def load(self, epoch, path_prefix=None, path_type=None): - return self._task_group( - CheckpointManager.load, - epoch, - path_prefix=path_prefix, - path_type=path_type) - - def load_blobs_locally(self, nodes, blob_names, epoch, session): - """Loads the necessary blobs from the checkpoints to the current node. - - Args: - blob_names: A list of strings. Each string is the name of a - blob. - epoch: An integer. The checkpoint epoch to load from. - session: A Session object to execute the Load ops. - """ - if self._node_managers is not None: - assert [node for node, _ in self._node_managers] == nodes - else: - self._node_managers = [] - for node in nodes: - with Node(node): - manager = CheckpointManager( - db_prefix=self._db_prefix, - node_name=str(node), - db_type=self._db_type) - self._node_managers.append((node, manager)) - assert self._node_managers is not None, 'must initialize node managers' - for _, manager in self._node_managers: - existence_task = manager.check_db_exists(epoch) - session.run(existence_task) - existence = existence_task.outputs()[0].fetch() - if not existence: - logger.info('DB %s does not exist!' % - db_name(epoch, manager._node_name, manager._db_prefix)) - return False - load_task = manager.load_blobs_from_checkpoint(blob_names, epoch) - session.run(load_task) - logger.info('Successfully loaded from checkpoints.') - return True - - def get_ckpt_db_name(self, node_name, epoch): - """Returns the DB name of the given node and the given epoch. - - The DB name is effectively the checkpoint path of the given node and - the given epoch. - - Args: - node_name: A string. The node name of interest. - epoch: An integer. The epoch of the checkpoint. - - Returns: - checkpoint_db_name: A string. The checkpoint path of the given - node and the given epoch. - """ - for node, manager in self._node_managers: - if str(node) == node_name: - return db_name(epoch, manager._node_name, manager._db_prefix) - - def report_checkpoint_stats(self, action_name): - """ - Report the checkpoint stats for all the nodes, we need to aggregate all - the node's stats together so that we know which node's checkpoint - operation dominates. - - Args: - action_name: A string of the name of checkpoint operation. - """ - all_stats = {} - for _, manager in self._node_managers: - manager.collect_checkpoint_stats(all_stats) - logger.debug("checkpoint stats: {}".format(all_stats)) - if self._metadata_handler: - self._metadata_handler.report(action_name, all_stats) - - def save(self, epoch): - """ - Build a Task that will execute a Save ops to serialize and persist - blobs present in the global workspace. - """ - return self._task_group(CheckpointManager.save, epoch) - - def write_checkpoint_metadata(self, epoch): - """ - Write metadata for checkpoint - - Args: - epoch: An integer. 
The epoch-id for which checkpoint metadata is - written - """ - if self._metadata_handler is not None: - self._metadata_handler.write(epoch=epoch) - - def get_resume_from_epoch_id(self, user_epoch=None): - """ - Identify the epoch-id from which Job must resume - - Args: - user_epoch: An integer. Optional parameter for user to explicitly - identify the epoch-id to load checkpoint from - Returns: - epoch: the epoch-id to load checkpoints from - or None if no checkpoints were written - """ - last_epoch = user_epoch - if self._metadata_handler is not None: - last_epoch = self._metadata_handler.last_epoch(user_epoch=user_epoch) - return last_epoch - - def set_params(self, nodes, path_prefix=None, path_type=None): - """Set parameters associated with CP manager - - Args: - nodes: An array of nodes where this checkpoint manager is running. - path_prefix: Used to construct db name or path where checkpoint files are - stored. - path_type: Indicate the type of path where checkpoint files are stored. - """ - self._node_names = [str(node) for node in nodes] - if path_prefix: - self._path_prefix = path_prefix - if path_type: - self._path_type = path_type - if self._metadata_handler: - self._metadata_handler.set_params( - db_prefix=self._db_prefix, - db_type=self._db_type, - node_names=self._node_names, - path_prefix=self._path_prefix, - path_type=self._path_type) - - def cp_accessible(self, epoch=None): - """Returns True if Checkpoint data is accessible - - Args: - epoch: An integer. The epoch of the checkpoint. If None, - it implies we need to check if checkpoint directory is accessible - - Returns: - is_cp_accessible: A boolean. Returns True if Checkpoint data is accessible - """ - if self._metadata_handler is not None: - return self._metadata_handler.cp_accessible(epoch) - else: - return True - - -class UploadTaskGroupBuilder: - """A simple class to upload checkpoints.""" - def build(self, epoch, checkpoint_manager): - """Builds the task group to upload checkpoints. - - Args: - epoch: An integer. The checkpoint epoch to be uploaded. - checkpoint_manager: Can be a CheckpointManager for single machine - or a MultiNodeCheckpointManager for multi-machine. The manager - that initializes/saves/loads checkpoints. - - Raises: - NotImplementedError: This base class only has the interface, - the implementation will be in the subclasses. - """ - raise NotImplementedError() - - -class JobRunner: - """ - Implement the runtime logic for jobs with checkpointing at the level of - epoch. Can be used to run either single-host or distributed jobs. Job - runner is a callable to be called once from the master, passing a session - as an argument. This call will block until the Job execution is complete. - - If a checkpoint_manager is passed, checkpoints will be taken after - initialization and after each epoch execution. If, in addition, - `resume_from_epoch` is an epoch number, the corresponding checkpoint will - be loaded and job execution will continue from the given epoch. In - this case, the job's init_group will not be run. - - Refer to checkpoint_test.py for an example. - """ - def __init__(self, job, checkpoint_manager=None, resume_from_epoch=None, - upload_task_group_builder=None): - """Initializes the JobRunner. - - Args: - job: A Job object. The job to be executed. - checkpoint_manager: Can be a CheckpointManager for single machine - or a MultiNodeCheckpointManager for multi-machine. The manager - that initializes/saves/loads checkpoints. - resume_from_epoch: An integer. The epoch to resume from. 
- upload_task_group_builder: A subclass of the - UploadTaskGroupBuilder. Creates a task group to upload - checkpoints. - """ - self.resume_from_epoch = resume_from_epoch - self.checkpoint_manager = checkpoint_manager - self.job = job - self.upload_task_group_builder = upload_task_group_builder - - def train(self, session): - """Runs the training flow. - - Args: - session: A Session object. Valid choices are: LocalSession, - LocalHostScheduler, and DistributedSession. It is used to - execute one TaskGroup at a time. - """ - # identify the epoch we must resume from - if self.checkpoint_manager: - self.checkpoint_manager.set_params(nodes=self.job.nodes_to_checkpoint()) - self.resume_from_epoch = self.checkpoint_manager.\ - get_resume_from_epoch_id(self.resume_from_epoch) - if self.resume_from_epoch is not None: - logger.info('Resuming from epoch {}'.format(self.resume_from_epoch)) - - # Initialize all the nodes. - from_scratch = self.resume_from_epoch is None - if from_scratch: - session.run(self.job.init_group) - - if self.checkpoint_manager: - logger.info('Preparing checkpoints ...') - session.run(self.checkpoint_manager.init( - self.job.nodes_to_checkpoint(), - retrieve_from_epoch=self.resume_from_epoch)) - # Save the first checkpoint before training starts, or resume from - # a previously saved checkpoint. - if from_scratch: - self.save_checkpoints(0, session) - else: - logger.info('Loading checkpoints for epoch {} ...'.format( - self.resume_from_epoch)) - session.run( - self.checkpoint_manager.load(self.resume_from_epoch)) - self.checkpoint_manager.report_checkpoint_stats('checkpoint_load') - logger.info('Checkpoint loaded') - - logger.info("Finished initializing") - - # Start training. - epoch = 1 if from_scratch else self.resume_from_epoch + 1 - while True: - logger.info('Starting epoch %d' % epoch) - session.run(self.job.epoch_group) - logger.info('Finished epoch %d' % epoch) - stop_conditions = [o.fetch() for o in self.job.stop_conditions] - - if self.checkpoint_manager: - self.save_checkpoints(epoch, session) - - if any(stop_conditions): - logger.info('Stopping') - break - epoch += 1 - logger.info('Finished training') - # Upload the checkpoints. - if (self.upload_task_group_builder): - upload_task_group = self.upload_task_group_builder.build( - epoch, self.checkpoint_manager) - session.run(upload_task_group) - logger.info('Finished uploading the checkpoints') - - # Download the parameters to save - session.run(self.job.download_group) - logger.info('Finished downloading the parameters') - - # Finally run the exit step to save nets - session.run(self.job.exit_group) - logger.info('Finished running the exit group') - return epoch - - def load_blobs_from_checkpoints(self, blob_names, epoch, session): - """Loads the necessary blobs from the checkpoints. - - Checkpoints store the snapshots of the workspace in each node. - Sometimes we only need to load a subset of the blobs from the - checkpoints. One common scenario is to load only the model blobs from - the checkpoints for evaluation purposes. Given the names of the - necessary blobs, this function goes over all the checkpoints of all the - nodes, but only loads the blobs specified in the blob_names to the - current workspace. - - Args: - blob_names: A list of strings. Each string is the name of a - blob. - epoch: An integer. The checkpoint epoch to load from. - session: A Session object to execute the load ops. - - Raises: - ValueError: When the checkpoint manager is invalid.
- """ - if not self.checkpoint_manager: - raise ValueError('Checkpoint manager is None') - logger.info('Loading checkpoint for epoch {} ...'.format(epoch)) - result = self.checkpoint_manager.load_blobs_locally( - self.job.nodes_to_checkpoint(), blob_names, epoch, session) - self.checkpoint_manager.report_checkpoint_stats('checkpoint_partial_load') - return result - - def save_checkpoints(self, epoch, session): - """Triggers operation to save checkpoints - - This method will trigger the Save ops to serialize and persist the - blobs present in the global workspaace. - - Args: - epoch: An integer. The checkpoint epoch-id that we are saving. - session: A Session object to execute the save ops. - - Raises: - ValueError: When the checkpoint manager is invalid. - """ - if not self.checkpoint_manager: - raise ValueError('Checkpoint manager is None') - try: - is_accessible = self.checkpoint_manager.cp_accessible(epoch=None) - if is_accessible: - logger.info('Saving checkpoints for epoch {}'.format(epoch)) - session.run(self.checkpoint_manager.save(epoch)) - self.checkpoint_manager.write_checkpoint_metadata(epoch) - logger.info('Checkpoints saved') - self.checkpoint_manager.report_checkpoint_stats('checkpoint_save') - else: - logger.warning("Checkpoint files cannot be accessed!") - except Exception as ex: - logger.warning("Unable to write checkpoint for epoch {}. Error={}". - format(epoch, ex)) - - -def epoch_limiter(job, num_epochs): - """ - Creates a task that will output True when a given - number of epochs has finished. - """ - with job.init_group: - init_net = core.Net('epoch_counter_init') - counter = init_net.CreateCounter([], init_count=num_epochs - 1) - Task(step=init_net) - - with job.epoch_group: - epoch_net = core.Net('epoch_countdown') - finished = epoch_net.CountDown(counter) - output = Task(step=epoch_net, outputs=finished).outputs()[0] - job.add_stop_condition(output) diff --git a/caffe2/python/checkpoint_test.py b/caffe2/python/checkpoint_test.py deleted file mode 100644 index b97e0f6c5bcd..000000000000 --- a/caffe2/python/checkpoint_test.py +++ /dev/null @@ -1,338 +0,0 @@ - - - - - -from caffe2.python.schema import Struct, ConstRecord -from caffe2.python import core, workspace, model_helper -from caffe2.python.session import LocalSession -from caffe2.python.dataset import Dataset -from caffe2.python.pipeline import pipe -from caffe2.python.checkpoint import ( - CheckpointManager, MultiNodeCheckpointManager, Job, JobRunner, epoch_limiter, - UploadTaskGroupBuilder, db_name) -from caffe2.python.net_builder import ops -from caffe2.python.task import Node, Task, TaskGroup, WorkspaceType, Cluster -from caffe2.python.test_util import TestCase -from caffe2.python.dataio import ReaderWithLimit - -import numpy as np -import os -import shutil -import tempfile - - -def build_pipeline(node_id): - with Node('trainer_%d' % node_id): - with Job.current().init_group, Task(): - data_arr = Struct(('val', np.array(list(range(10))))) - data = ConstRecord(ops, data_arr) - ds = Dataset(data, name='dataset:%d' % node_id) - full_reader = ds.reader(ops) - total = ops.Const([100]) - - def inc_total(rec): - ops.Add([total, rec.val()], [total]) - - epoch_reader = ReaderWithLimit(full_reader, num_iter=3) - pipe(epoch_reader, processor=inc_total) - Job.current().add_stop_condition(epoch_reader.data_finished()) - return [total] - - -EXPECTED_TOTALS = [103, 115, 136, 145] - - -def local_copy_op(src, dest): - def copy_op(inputs, outputs): - shutil.copyfile(src, dest) - return copy_op - - -class 
UploadToLocalFile(UploadTaskGroupBuilder): - def __init__(self, dest_dir): - self.dest_dir = dest_dir - - def build(self, epoch, checkpoint_manager): - with TaskGroup(WorkspaceType.GLOBAL) as upload_task_group: - for node, manager in checkpoint_manager._node_managers: - with Node(str(node)), Task(): - src_path = db_name(epoch, manager._node_name, manager._db_prefix) - dest_path = os.path.join(self.dest_dir, str(node)) - ops.Python((local_copy_op, - [src_path, dest_path], {}))([], []) - return upload_task_group - - -class TestCheckpoint(TestCase): - def run_with(self, builder): - with Cluster(): - with Job() as job: - outputs = build_pipeline(node_id=0) - output_fetcher = Task(step=core.Net('empty'), outputs=outputs) - - def fetch_total(session): - session.run(output_fetcher) - return output_fetcher.outputs()[0].fetch() - - session, checkpoint = builder() - job.compile(LocalSession) - num_epochs = JobRunner(job, checkpoint).train(session) - self.assertEqual(num_epochs, len(EXPECTED_TOTALS)) - self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1]) - - for initial_epoch in range(1, num_epochs + 1): - session, checkpoint = builder() - JobRunner( - job, - checkpoint, resume_from_epoch=initial_epoch - ).train(session) - self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1]) - - for epoch in range(1, num_epochs + 1): - session.run(checkpoint.load(epoch)) - self.assertEqual(fetch_total(session), - EXPECTED_TOTALS[epoch - 1]) - - def test_single_checkpoint(self): - # test single node - try: - tmpdir = tempfile.mkdtemp() - - def builder(): - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = CheckpointManager(tmpdir, 'temp_node', 'minidb') - return session, checkpoint - - self.run_with(builder) - finally: - shutil.rmtree(tmpdir) - - # test multi-node - try: - tmpdir = tempfile.mkdtemp() - - def builder(): - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - return session, checkpoint - - self.run_with(builder) - finally: - shutil.rmtree(tmpdir) - - def test_ckpt_name_and_load_model_from_ckpts(self): - try: - num_nodes = 3 - tmpdir = tempfile.mkdtemp() - # First, check if the checkpoint name generation mechanism is - # correct. - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - with Cluster(): - with Job() as job: - for node_id in range(num_nodes): - build_pipeline(node_id) - job.compile(LocalSession) - checkpoint.init(job.nodes_to_checkpoint()) - - for node_id in range(num_nodes): - epoch = 5 - node_name = 'trainer_%d' % node_id - expected_db_name = tmpdir + '/' + node_name + '.5' - self.assertEqual( - checkpoint.get_ckpt_db_name(node_name, epoch), - expected_db_name) - shutil.rmtree(tmpdir) - - # Next, check mechanism to load model from checkpoints. - tmpdir = tempfile.mkdtemp() - workspace.ResetWorkspace() - for node_id in range(num_nodes): - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - with Cluster(): - with Job() as job: - build_pipeline(node_id) - job.compile(LocalSession) - job_runner = JobRunner(job, checkpoint) - num_epochs = job_runner.train(session) - self.assertEqual(num_epochs, len(EXPECTED_TOTALS)) - - # There are 17 global blobs after finishing up the job runner. 
- # (only blobs on init_group are checkpointed) - self.assertEqual(len(ws.blobs), 17) - - ws = workspace.C.Workspace() - session = LocalSession(ws) - self.assertEqual(len(ws.blobs), 0) - model_blob_names = ['trainer_1/task_2/GivenTensorInt64Fill:0', - 'trainer_2/task_2/GivenTensorInt64Fill:0'] - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - with Cluster(): - with Job() as job: - for node_id in range(num_nodes): - build_pipeline(node_id) - job.compile(LocalSession) - job_runner = JobRunner(job, checkpoint) - job_runner.load_blobs_from_checkpoints( - blob_names=model_blob_names, epoch=1, session=session) - - # Check that we can successfully load from checkpoints of epochs - # 1 to 4, but not epoch 5. - for epoch in range(1, 5): - self.assertTrue( - job_runner.load_blobs_from_checkpoints( - blob_names=model_blob_names, epoch=epoch, - session=session)) - # Check that all the model blobs are loaded. - for blob_name in model_blob_names: - self.assertTrue(ws.has_blob(blob_name)) - self.assertEqual( - ws.fetch_blob(blob_name), - np.array([EXPECTED_TOTALS[epoch - 1]])) - self.assertFalse( - job_runner.load_blobs_from_checkpoints( - blob_names=model_blob_names, epoch=5, session=session)) - - finally: - shutil.rmtree(tmpdir) - - def test_upload_checkpoint(self): - try: - tmpdir = tempfile.mkdtemp() - upload_dir = os.path.join(tmpdir, "upload") - os.mkdir(upload_dir) - num_nodes = 3 - - # The uploaded files do not exist yet. - for node_id in range(num_nodes): - node_name = 'trainer_%d' % node_id - upload_path = os.path.join(upload_dir, node_name) - self.assertFalse(os.path.exists(upload_path)) - - # Create and run the job runner. - for node_id in range(3): - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - with Cluster(): - with Job() as job: - build_pipeline(node_id) - job.compile(LocalSession) - local_upload_builder = UploadToLocalFile(upload_dir) - job_runner = JobRunner( - job, checkpoint, - upload_task_group_builder=local_upload_builder) - num_epochs = job_runner.train(session) - self.assertEqual(num_epochs, len(EXPECTED_TOTALS)) - - # The uploaded files should exist now. - for node_id in range(num_nodes): - node_name = 'trainer_%d' % node_id - upload_path = os.path.join(upload_dir, node_name) - self.assertTrue(os.path.exists(upload_path)) - - finally: - shutil.rmtree(tmpdir) - - def test_ckpt_save_failure(self): - num_nodes = 3 - # The goal of this test is to ensure that the job runs - # successfully even if saving a checkpoint fails. - # Hence tmpdir is a non existent directory to emulate a failure - # while saving checkpoints - tmpdir = "/tmp/path_does_not_exist/" - - # Check the saving checkpoint failure does not cause job failure - workspace.ResetWorkspace() - for node_id in range(num_nodes): - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - with Cluster(): - with Job() as job: - build_pipeline(node_id) - job.compile(LocalSession) - job_runner = JobRunner(job, checkpoint) - num_epochs = job_runner.train(session) - # make sure all epochs are executed even though saving the checkpoint failed - # Saving checkpoint failure should not cause job failure - self.assertEqual(num_epochs, len(EXPECTED_TOTALS)) - - def test_download_group_simple(self): - """ - A simple test that ensures we have download task group - executed between epoch_group and exit_group. 
- """ - model = model_helper.ModelHelper(name="test_model") - download_net = core.Net("download_net") - - for name in ["input1", "input2", "output", "download_result"]: - model.param_init_net.ConstantFill([], - [name], - shape=[8, ], - value=1.0, - run_once=0) - model.net.Add(["input1", "input2"], ["output"]) - download_net.Copy(["output"], ["download_result"]) - - # All blob values are initialized as 1.0, after download_net executed - # we expect to see download result is the same as training result. - with Job() as job: - with Node("trainer:0"): - with job.init_group: - Task(step=model.param_init_net) - with job.epoch_group: - with Task(): - with ops.loop(1): - ops.net(model.net) - with job.download_group: - Task(step=download_net) - - epoch_limiter(job, 1) - - ws = workspace.C.Workspace() - session = LocalSession(ws) - job_runner = JobRunner(job) - job_runner.train(session) - - expected_result = np.full(8, 2.0).astype(np.float32) - self.assertTrue(np.array_equal(expected_result, - ws.fetch_blob("output"))) - self.assertTrue(np.array_equal(expected_result, - ws.fetch_blob("download_result"))) - - def test_reuse_checkpoint_manager(self): - """ - A simple test that ensures we can reuse a MultiNodeCheckpointManager - object. - """ - try: - tmpdir = tempfile.mkdtemp() - ws = workspace.C.Workspace() - session = LocalSession(ws) - checkpoint = MultiNodeCheckpointManager(tmpdir, 'minidb') - - with Job() as job: - outputs = build_pipeline(node_id=0) - output_fetcher = Task(step=core.Net('empty'), outputs=outputs) - job.compile(LocalSession) - - def fetch_total(session): - session.run(output_fetcher) - return output_fetcher.outputs()[0].fetch() - - num_epochs = JobRunner(job, checkpoint).train(session) - for initial_epoch in range(1, num_epochs + 1): - JobRunner( - job, - checkpoint, - resume_from_epoch=initial_epoch - ).train(session) - self.assertEqual(fetch_total(session), EXPECTED_TOTALS[-1]) - - finally: - shutil.rmtree(tmpdir) diff --git a/caffe2/python/clean_workspace_test.py b/caffe2/python/clean_workspace_test.py deleted file mode 100644 index c8285f4a1c5b..000000000000 --- a/caffe2/python/clean_workspace_test.py +++ /dev/null @@ -1,15 +0,0 @@ -import unittest - -from caffe2.python import workspace - - -# This test is extracted out from workspace_test.py because it relies on the pristine -# state of the initial workspace. When tests are run in different orders, this test may -# become flaky because of global state modifications impacting what the root folder is -# after a reset. -class TestWorkspace(unittest.TestCase): - def testRootFolder(self): - self.assertEqual(workspace.ResetWorkspace(), True) - self.assertEqual(workspace.RootFolder(), ".") - self.assertEqual(workspace.ResetWorkspace("/tmp/caffe-workspace-test"), True) - self.assertEqual(workspace.RootFolder(), "/tmp/caffe-workspace-test") diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py deleted file mode 100644 index 45a676b09c7b..000000000000 --- a/caffe2/python/cnn.py +++ /dev/null @@ -1,240 +0,0 @@ -## @package cnn -# Module caffe2.python.cnn - - - - - -from caffe2.python import brew, workspace -from caffe2.python.model_helper import ModelHelper -from caffe2.proto import caffe2_pb2 -import logging - - -class CNNModelHelper(ModelHelper): - """A helper model so we can write CNN models more easily, without having to - manually define parameter initializations and operators separately. 
- """ - - def __init__(self, order="NCHW", name=None, - use_cudnn=True, cudnn_exhaustive_search=False, - ws_nbytes_limit=None, init_params=True, - skip_sparse_optim=False, - param_model=None): - logging.warning( - "[====DEPRECATE WARNING====]: you are creating an " - "object from CNNModelHelper class which will be deprecated soon. " - "Please use ModelHelper object with brew module. For more " - "information, please refer to caffe2.ai and python/brew.py, " - "python/brew_test.py for more information." - ) - - cnn_arg_scope = { - 'order': order, - 'use_cudnn': use_cudnn, - 'cudnn_exhaustive_search': cudnn_exhaustive_search, - } - if ws_nbytes_limit: - cnn_arg_scope['ws_nbytes_limit'] = ws_nbytes_limit - super().__init__( - skip_sparse_optim=skip_sparse_optim, - name="CNN" if name is None else name, - init_params=init_params, - param_model=param_model, - arg_scope=cnn_arg_scope, - ) - - self.order = order - self.use_cudnn = use_cudnn - self.cudnn_exhaustive_search = cudnn_exhaustive_search - self.ws_nbytes_limit = ws_nbytes_limit - if self.order != "NHWC" and self.order != "NCHW": - raise ValueError( - "Cannot understand the CNN storage order %s." % self.order - ) - - def ImageInput(self, blob_in, blob_out, use_gpu_transform=False, **kwargs): - return brew.image_input( - self, - blob_in, - blob_out, - order=self.order, - use_gpu_transform=use_gpu_transform, - **kwargs - ) - - def VideoInput(self, blob_in, blob_out, **kwargs): - return brew.video_input( - self, - blob_in, - blob_out, - **kwargs - ) - - def PadImage(self, blob_in, blob_out, **kwargs): - # TODO(wyiming): remove this dummy helper later - self.net.PadImage(blob_in, blob_out, **kwargs) - - def ConvNd(self, *args, **kwargs): - return brew.conv_nd( - self, - *args, - use_cudnn=self.use_cudnn, - order=self.order, - cudnn_exhaustive_search=self.cudnn_exhaustive_search, - ws_nbytes_limit=self.ws_nbytes_limit, - **kwargs - ) - - def Conv(self, *args, **kwargs): - return brew.conv( - self, - *args, - use_cudnn=self.use_cudnn, - order=self.order, - cudnn_exhaustive_search=self.cudnn_exhaustive_search, - ws_nbytes_limit=self.ws_nbytes_limit, - **kwargs - ) - - def ConvTranspose(self, *args, **kwargs): - return brew.conv_transpose( - self, - *args, - use_cudnn=self.use_cudnn, - order=self.order, - cudnn_exhaustive_search=self.cudnn_exhaustive_search, - ws_nbytes_limit=self.ws_nbytes_limit, - **kwargs - ) - - def GroupConv(self, *args, **kwargs): - return brew.group_conv( - self, - *args, - use_cudnn=self.use_cudnn, - order=self.order, - cudnn_exhaustive_search=self.cudnn_exhaustive_search, - ws_nbytes_limit=self.ws_nbytes_limit, - **kwargs - ) - - def GroupConv_Deprecated(self, *args, **kwargs): - return brew.group_conv_deprecated( - self, - *args, - use_cudnn=self.use_cudnn, - order=self.order, - cudnn_exhaustive_search=self.cudnn_exhaustive_search, - ws_nbytes_limit=self.ws_nbytes_limit, - **kwargs - ) - - def FC(self, *args, **kwargs): - return brew.fc(self, *args, **kwargs) - - def PackedFC(self, *args, **kwargs): - return brew.packed_fc(self, *args, **kwargs) - - def FC_Prune(self, *args, **kwargs): - return brew.fc_prune(self, *args, **kwargs) - - def FC_Decomp(self, *args, **kwargs): - return brew.fc_decomp(self, *args, **kwargs) - - def FC_Sparse(self, *args, **kwargs): - return brew.fc_sparse(self, *args, **kwargs) - - def Dropout(self, *args, **kwargs): - return brew.dropout( - self, *args, order=self.order, use_cudnn=self.use_cudnn, **kwargs - ) - - def LRN(self, *args, **kwargs): - return brew.lrn( - self, *args, 
order=self.order, use_cudnn=self.use_cudnn, **kwargs - ) - - def Softmax(self, *args, **kwargs): - return brew.softmax(self, *args, use_cudnn=self.use_cudnn, **kwargs) - - def SpatialBN(self, *args, **kwargs): - return brew.spatial_bn(self, *args, order=self.order, **kwargs) - - def SpatialGN(self, *args, **kwargs): - return brew.spatial_gn(self, *args, order=self.order, **kwargs) - - def InstanceNorm(self, *args, **kwargs): - return brew.instance_norm(self, *args, order=self.order, **kwargs) - - def Relu(self, *args, **kwargs): - return brew.relu( - self, *args, order=self.order, use_cudnn=self.use_cudnn, **kwargs - ) - - def PRelu(self, *args, **kwargs): - return brew.prelu(self, *args, **kwargs) - - def Concat(self, *args, **kwargs): - return brew.concat(self, *args, order=self.order, **kwargs) - - def DepthConcat(self, *args, **kwargs): - """The old depth concat function - we should move to use concat.""" - print("DepthConcat is deprecated. use Concat instead.") - return self.Concat(*args, **kwargs) - - def Sum(self, *args, **kwargs): - return brew.sum(self, *args, **kwargs) - - def Transpose(self, *args, **kwargs): - return brew.transpose(self, *args, use_cudnn=self.use_cudnn, **kwargs) - - def Iter(self, *args, **kwargs): - return brew.iter(self, *args, **kwargs) - - def Accuracy(self, *args, **kwargs): - return brew.accuracy(self, *args, **kwargs) - - def MaxPool(self, *args, **kwargs): - return brew.max_pool( - self, *args, use_cudnn=self.use_cudnn, order=self.order, **kwargs - ) - - def MaxPoolWithIndex(self, *args, **kwargs): - return brew.max_pool_with_index(self, *args, order=self.order, **kwargs) - - def AveragePool(self, *args, **kwargs): - return brew.average_pool( - self, *args, use_cudnn=self.use_cudnn, order=self.order, **kwargs - ) - - @property - def XavierInit(self): - return ('XavierFill', {}) - - def ConstantInit(self, value): - return ('ConstantFill', dict(value=value)) - - @property - def MSRAInit(self): - return ('MSRAFill', {}) - - @property - def ZeroInit(self): - return ('ConstantFill', {}) - - def AddWeightDecay(self, weight_decay): - return brew.add_weight_decay(self, weight_decay) - - @property - def CPU(self): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CPU - return device_option - - @property - def GPU(self, gpu_id=0): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = gpu_id - return device_option diff --git a/caffe2/python/context.py b/caffe2/python/context.py deleted file mode 100644 index f04b3b692d87..000000000000 --- a/caffe2/python/context.py +++ /dev/null @@ -1,106 +0,0 @@ -## @package context -# Module caffe2.python.context - -import inspect -import threading -import functools - - -class _ContextInfo: - def __init__(self, cls, allow_default): - self.cls = cls - self.allow_default = allow_default - self._local_stack = threading.local() - - @property - def _stack(self): - if not hasattr(self._local_stack, 'obj'): - self._local_stack.obj = [] - return self._local_stack.obj - - def enter(self, value): - self._stack.append(value) - - def exit(self, value): - assert len(self._stack) > 0, 'Context %s is empty.' % self.cls - assert self._stack.pop() == value - - def get_active(self, required=True): - if len(self._stack) == 0: - if not required: - return None - assert self.allow_default, ( - 'Context %s is required but none is active.' 
% self.cls)
-            self.enter(self.cls())
-        return self._stack[-1]
-
-
-class _ContextRegistry:
-    def __init__(self):
-        self._ctxs = {}
-
-    def get(self, cls):
-        if cls not in self._ctxs:
-            assert issubclass(cls, Managed), "must be a context managed class, got {}".format(cls)
-            self._ctxs[cls] = _ContextInfo(cls, allow_default=issubclass(cls, DefaultManaged))
-        return self._ctxs[cls]
-
-
-_CONTEXT_REGISTRY = _ContextRegistry()
-
-
-def _context_registry():
-    global _CONTEXT_REGISTRY
-    return _CONTEXT_REGISTRY
-
-
-def _get_managed_classes(obj):
-    return [
-        cls for cls in inspect.getmro(obj.__class__)
-        if issubclass(cls, Managed) and cls != Managed and cls != DefaultManaged
-    ]
-
-
-
-class Managed:
-    """
-    Managed makes the inherited class a context-managed class.
-
-    class Foo(Managed): ...
-
-    with Foo() as f:
-        assert f == Foo.current()
-    """
-
-    @classmethod
-    def current(cls, value=None, required=True):
-        ctx_info = _context_registry().get(cls)
-        if value is not None:
-            assert isinstance(value, cls), (
-                'Wrong context type. Expected: %s, got %s.' % (cls, type(value)))
-            return value
-        return ctx_info.get_active(required=required)
-
-    def __enter__(self):
-        for cls in _get_managed_classes(self):
-            _context_registry().get(cls).enter(self)
-        return self
-
-    def __exit__(self, *args):
-        for cls in _get_managed_classes(self):
-            _context_registry().get(cls).exit(self)
-
-    def __call__(self, func):
-        @functools.wraps(func)
-        def wrapper(*args, **kwargs):
-            with self:
-                return func(*args, **kwargs)
-        return wrapper
-
-
-class DefaultManaged(Managed):
-    """
-    DefaultManaged is similar to Managed, but if there is no active context
-    when current() is called, it creates a new one.
-    """
-    pass
diff --git a/caffe2/python/context.pyi b/caffe2/python/context.pyi
deleted file mode 100644
index 3b2dd49ad877..000000000000
--- a/caffe2/python/context.pyi
+++ /dev/null
@@ -1,13 +0,0 @@
-from typing import Optional, TypeVar, Type
-
-_T = TypeVar('_T')
-
-class Managed:
-    @classmethod
-    def current(cls: Type[_T], value: Optional[_T] = None, required: bool = True) -> _T: ...
-
-    def __call__(self, func: _T) -> _T: ...
-
-    def __enter__(self: _T) -> _T: ...
-
-class DefaultManaged(Managed): ...
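The deleted context module reduces to one reusable pattern: every Managed subclass keeps a per-thread stack of active instances, pushed and popped by the with-statement, and current() returns the top of that stack. The following is a minimal standalone sketch of the pattern in plain Python; the class names are illustrative, and it deliberately omits the MRO walk that let an active subclass context also satisfy current() on its Managed base classes.

import threading

class Managed:
    # One stack per (thread, class); stacks are created lazily on first use.
    _tls = threading.local()

    @classmethod
    def _stack(cls):
        stacks = getattr(Managed._tls, 'stacks', None)
        if stacks is None:
            stacks = Managed._tls.stacks = {}
        return stacks.setdefault(cls, [])

    @classmethod
    def current(cls, required=True):
        stack = cls._stack()
        if stack:
            return stack[-1]
        if required:
            raise RuntimeError('no active %s' % cls.__name__)
        return None

    def __enter__(self):
        type(self)._stack().append(self)
        return self

    def __exit__(self, *exc):
        popped = type(self)._stack().pop()
        assert popped is self, 'unbalanced context exit'

class NameScope(Managed):
    def __init__(self, name):
        self.name = name

with NameScope('outer') as scope:
    assert NameScope.current() is scope
assert NameScope.current(required=False) is None

The thread-local storage is the load-bearing detail: each thread sees only its own stack, which is what the testMultiThreaded case in the deleted test file below exercised.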
diff --git a/caffe2/python/context_test.py b/caffe2/python/context_test.py deleted file mode 100644 index 0ca36e49ac80..000000000000 --- a/caffe2/python/context_test.py +++ /dev/null @@ -1,67 +0,0 @@ - - - - - -from caffe2.python import context, test_util -from threading import Thread - - -class MyContext(context.Managed): - pass - -class DefaultMyContext(context.DefaultManaged): - pass - -class ChildMyContext(MyContext): - pass - - -class TestContext(test_util.TestCase): - def use_my_context(self): - try: - for _ in range(100): - with MyContext() as a: - for _ in range(100): - self.assertTrue(MyContext.current() == a) - except Exception as e: - self._exceptions.append(e) - - def testMultiThreaded(self): - threads = [] - self._exceptions = [] - for _ in range(8): - thread = Thread(target=self.use_my_context) - thread.start() - threads.append(thread) - for t in threads: - t.join() - for e in self._exceptions: - raise e - - @MyContext() - def testDecorator(self): - self.assertIsNotNone(MyContext.current()) - - def testNonDefaultCurrent(self): - with self.assertRaises(AssertionError): - MyContext.current() - - ctx = MyContext() - self.assertEqual(MyContext.current(value=ctx), ctx) - - self.assertIsNone(MyContext.current(required=False)) - - def testDefaultCurrent(self): - self.assertIsInstance(DefaultMyContext.current(), DefaultMyContext) - - def testNestedContexts(self): - with MyContext() as ctx1: - with DefaultMyContext() as ctx2: - self.assertEqual(DefaultMyContext.current(), ctx2) - self.assertEqual(MyContext.current(), ctx1) - - def testChildClasses(self): - with ChildMyContext() as ctx: - self.assertEqual(ChildMyContext.current(), ctx) - self.assertEqual(MyContext.current(), ctx) diff --git a/caffe2/python/control.py b/caffe2/python/control.py deleted file mode 100644 index 4a6fc066f693..000000000000 --- a/caffe2/python/control.py +++ /dev/null @@ -1,574 +0,0 @@ -## @package control -# Module caffe2.python.control -""" -Implement functions for controlling execution of nets and steps, including - Do - DoParallel - For-loop - While-loop - Do-While-loop - Switch - If -""" - - - - - - -from caffe2.python import core - - -# Used to generate names of the steps created by the control functions. -# It is actually the internal index of these steps. -_current_idx = 1 -_used_step_names = set() - - -def _get_next_step_name(control_name, base_name): - global _current_idx, _used_step_names - concat_name = '%s/%s' % (base_name, control_name) - next_name = concat_name - while next_name in _used_step_names: - next_name = '%s_%d' % (concat_name, _current_idx) - _current_idx += 1 - _used_step_names.add(next_name) - return next_name - - -def _MakeList(input): - """ input is a tuple. 
-    Example:
-    (a, b, c) --> [a, b, c]
-    (a) --> [a]
-    ([a, b, c]) --> [a, b, c]
-    """
-    if len(input) == 0:
-        raise ValueError(
-            'input cannot be empty.')
-    elif len(input) == 1:
-        output = input[0]
-        if not isinstance(output, list):
-            output = [output]
-    else:
-        output = list(input)
-    return output
-
-
-def _IsNets(nets_or_steps):
-    if isinstance(nets_or_steps, list):
-        return all(isinstance(n, core.Net) for n in nets_or_steps)
-    else:
-        return isinstance(nets_or_steps, core.Net)
-
-
-def _PrependNets(nets_or_steps, *nets):
-    nets_or_steps = _MakeList((nets_or_steps,))
-    nets = _MakeList(nets)
-    if _IsNets(nets_or_steps):
-        return nets + nets_or_steps
-    else:
-        return [Do('prepend', nets)] + nets_or_steps
-
-
-def _AppendNets(nets_or_steps, *nets):
-    nets_or_steps = _MakeList((nets_or_steps,))
-    nets = _MakeList(nets)
-    if _IsNets(nets_or_steps):
-        return nets_or_steps + nets
-    else:
-        return nets_or_steps + [Do('append', nets)]
-
-
-def GetConditionBlobFromNet(condition_net):
-    """
-    The condition blob is the last external_output, which must
-    be a single bool.
-    """
-    assert len(condition_net.Proto().external_output) > 0, (
-        "Condition net %s must have at least one external output" %
-        condition_net.Proto().name)
-    # we need to use a blob reference here instead of a string
-    # otherwise, it will add another name_scope to the input later
-    # when we create new ops (such as OR of two inputs)
-    return core.BlobReference(condition_net.Proto().external_output[-1])
-
-
-def BoolNet(*blobs_with_bool_value):
-    """A net assigning constant bool values to blobs. It is mainly used for
-    initializing condition blobs. For example, in multi-task learning we
-    need to access reader_done blobs before reader_net runs, so the
-    reader_done blobs must be initialized.
-
-    Args:
-    blobs_with_bool_value: one or more (blob, bool_value) pairs. The net will
-    assign each bool_value to the corresponding blob.
-
-    returns
-    bool_net: A net assigning constant bool values to blobs.
-
-    Examples:
-    - BoolNet((blob_1, bool_value_1), ..., (blob_n, bool_value_n))
-    - BoolNet([(blob_1, bool_value_1), ..., (blob_n, bool_value_n)])
-    - BoolNet((cond_1, bool_value_1))
-    """
-    blobs_with_bool_value = _MakeList(blobs_with_bool_value)
-    bool_net = core.Net('bool_net')
-    for blob, bool_value in blobs_with_bool_value:
-        out_blob = bool_net.ConstantFill(
-            [],
-            [blob],
-            shape=[],
-            value=bool_value,
-            dtype=core.DataType.BOOL)
-        bool_net.AddExternalOutput(out_blob)
-
-    return bool_net
-
-
-def NotNet(condition_blob_or_net):
-    """Not of a condition blob or net
-
-    Args:
-    condition_blob_or_net can be either a blob or a net. If condition_blob_or_net
-    is a Net, the condition is its last external_output,
-    which must be a single bool.
-
-    returns
-    not_net: a net computing the NOT of the input condition
-    out_blob: the output blob of the not_net
-    """
-    if isinstance(condition_blob_or_net, core.Net):
-        condition_blob = GetConditionBlobFromNet(condition_blob_or_net)
-    else:
-        condition_blob = condition_blob_or_net
-
-    not_net = core.Net('not_net')
-    out_blob = not_net.Not(condition_blob)
-    not_net.AddExternalOutput(out_blob)
-
-    return not_net, out_blob
-
-
-def _CopyConditionBlobNet(condition_blob):
-    """Make a condition net that copies the condition_blob
-
-    Args:
-    condition_blob is a single bool.
- - returns - not_net: the net NOT the input - out_blob: the output blob of the not_net - """ - condition_net = core.Net('copy_condition_blob_net') - out_blob = condition_net.Copy(condition_blob) - condition_net.AddExternalOutput(out_blob) - - return condition_net, out_blob - - -def MergeConditionNets(name, condition_nets, relation): - """ - Merge multi condition nets into a single condition nets. - - Args: - name: name of the new condition net. - condition_nets: a list of condition nets. The last external_output - of each condition net must be single bool value. - relation: can be 'And' or 'Or'. - - Returns: - - A new condition net. Its last external output is relation of all - condition_nets. - """ - if not isinstance(condition_nets, list): - return condition_nets - if len(condition_nets) <= 1: - return condition_nets[0] if condition_nets else None - - merged_net = core.Net(name) - for i in range(len(condition_nets)): - net_proto = condition_nets[i].Proto() - assert net_proto.device_option == merged_net.Proto().device_option - assert net_proto.type == merged_net.Proto().type - merged_net.Proto().op.extend(net_proto.op) - merged_net.Proto().external_input.extend(net_proto.external_input) - # discard external outputs as we're combining them together - curr_cond = GetConditionBlobFromNet(condition_nets[i]) - if i == 0: - last_cond = curr_cond - else: - last_cond = merged_net.__getattr__(relation)([last_cond, curr_cond]) - # merge attributes - for k, v in condition_nets[i]._attr_dict.items(): - merged_net._attr_dict[k] += v - - merged_net.AddExternalOutput(last_cond) - - return merged_net - - -def CombineConditions(name, condition_nets, relation): - """ - Combine conditions of multi nets into a single condition nets. Unlike - MergeConditionNets, the actual body of condition_nets is not copied into - the combine condition net. - - One example is about multi readers. Each reader net has a reader_done - condition. When we want to check whether all readers are done, we can - use this function to build a new net. - - Args: - name: name of the new condition net. - condition_nets: a list of condition nets. The last external_output - of each condition net must be single bool value. - relation: can be 'And' or 'Or'. - - Returns: - - A new condition net. Its last external output is relation of all - condition_nets. - """ - if not condition_nets: - return None - if not isinstance(condition_nets, list): - raise ValueError('condition_nets must be a list of nets.') - - if len(condition_nets) == 1: - condition_blob = GetConditionBlobFromNet(condition_nets[0]) - condition_net, _ = _CopyConditionBlobNet(condition_blob) - return condition_net - - combined_net = core.Net(name) - for i in range(len(condition_nets)): - curr_cond = GetConditionBlobFromNet(condition_nets[i]) - if i == 0: - last_cond = curr_cond - else: - last_cond = combined_net.__getattr__(relation)( - [last_cond, curr_cond]) - - combined_net.AddExternalOutput(last_cond) - - return combined_net - - -def Do(name, *nets_or_steps): - """ - Execute the sequence of nets or steps once. 
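Both MergeConditionNets and CombineConditions above reduce a list of boolean condition outputs with a left fold of the chosen relation; they differ only in whether the bodies of the condition nets are copied into the resulting net. A plain-Python model of the fold itself, with illustrative values standing in for condition blobs:

from functools import reduce

def combine(relation, flags):
    # Left fold, as in CombineConditions: ((f1 op f2) op f3) op ...
    op = {'And': lambda a, b: a and b,
          'Or': lambda a, b: a or b}[relation]
    return reduce(op, flags)

reader_done = [True, True, False]
assert combine('And', reader_done) is False   # not every reader is done
assert combine('Or', reader_done) is True     # at least one reader is done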
- - Examples: - - Do('myDo', net1, net2, ..., net_n) - - Do('myDo', list_of_nets) - - Do('myDo', step1, step2, ..., step_n) - - Do('myDo', list_of_steps) - """ - nets_or_steps = _MakeList(nets_or_steps) - if (len(nets_or_steps) == 1 and isinstance( - nets_or_steps[0], core.ExecutionStep)): - return nets_or_steps[0] - else: - return core.scoped_execution_step( - _get_next_step_name('Do', name), nets_or_steps) - - -def DoParallel(name, *nets_or_steps): - """ - Execute the nets or steps in parallel, waiting for all of them to finish - - Examples: - - DoParallel('pDo', net1, net2, ..., net_n) - - DoParallel('pDo', list_of_nets) - - DoParallel('pDo', step1, step2, ..., step_n) - - DoParallel('pDo', list_of_steps) - """ - nets_or_steps = _MakeList(nets_or_steps) - if (len(nets_or_steps) == 1 and isinstance( - nets_or_steps[0], core.ExecutionStep)): - return nets_or_steps[0] - else: - return core.scoped_execution_step( - _get_next_step_name('DoParallel', name), - nets_or_steps, - concurrent_substeps=True) - - -def _RunOnceIf(name, condition_blob_or_net, nets_or_steps): - """ - Execute nets_or_steps once if condition_blob_or_net evaluates as true. - - If condition_blob_or_net is Net, the condition is its last external_output - that must be a single bool. And this net will be executed before - nets_or_steps so as to get the condition. - """ - condition_not_net, stop_blob = NotNet(condition_blob_or_net) - if isinstance(condition_blob_or_net, core.Net): - nets_or_steps = _PrependNets( - nets_or_steps, condition_blob_or_net, condition_not_net) - else: - nets_or_steps = _PrependNets(nets_or_steps, condition_not_net) - - def if_step(control_name): - return core.scoped_execution_step( - _get_next_step_name(control_name, name), - nets_or_steps, - should_stop_blob=stop_blob, - only_once=True, - ) - - if _IsNets(nets_or_steps): - bool_net = BoolNet((stop_blob, False)) - return Do(name + '/_RunOnceIf', - bool_net, if_step('_RunOnceIf-inner')) - else: - return if_step('_RunOnceIf') - - -def _RunOnceIfNot(name, condition_blob_or_net, nets_or_steps): - """ - Similar to _RunOnceIf() but Execute nets_or_steps once if - condition_blob_or_net evaluates as false. - """ - if isinstance(condition_blob_or_net, core.Net): - condition_blob = GetConditionBlobFromNet(condition_blob_or_net) - nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net) - else: - copy_net, condition_blob = _CopyConditionBlobNet(condition_blob_or_net) - nets_or_steps = _PrependNets(nets_or_steps, copy_net) - - return core.scoped_execution_step( - _get_next_step_name('_RunOnceIfNot', name), - nets_or_steps, - should_stop_blob=condition_blob, - only_once=True, - ) - - -def For(name, nets_or_steps, iter_num): - """ - Execute nets_or_steps iter_num times. - - Args: - nets_or_steps: a ExecutionStep or a Net or a list of ExecutionSteps or - a list nets. - iter_num: the number times to execute the nets_or_steps. - - Returns: - A ExecutionStep instance. - """ - init_net = core.Net('init-net') - iter_cnt = init_net.CreateCounter([], init_count=iter_num) - iter_net = core.Net('For-iter') - iter_done = iter_net.CountDown([iter_cnt]) - - for_step = core.scoped_execution_step( - _get_next_step_name('For-inner', name), - _PrependNets(nets_or_steps, iter_net), - should_stop_blob=iter_done) - return Do(name + '/For', - Do(name + '/For-init-net', init_net), - for_step) - - -def While(name, condition_blob_or_net, nets_or_steps): - """ - Execute nets_or_steps when condition_blob_or_net returns true. 
-
-    Args:
-        condition_blob_or_net: If it is an instance of Net, its last
-            external_output must be a single bool.
-        nets_or_steps: an ExecutionStep, a Net, a list of ExecutionSteps, or
-            a list of nets.
-
-    Returns:
-        An ExecutionStep instance.
-    """
-    condition_not_net, stop_blob = NotNet(condition_blob_or_net)
-    if isinstance(condition_blob_or_net, core.Net):
-        nets_or_steps = _PrependNets(
-            nets_or_steps, condition_blob_or_net, condition_not_net)
-    else:
-        nets_or_steps = _PrependNets(nets_or_steps, condition_not_net)
-
-    def while_step(control_name):
-        return core.scoped_execution_step(
-            _get_next_step_name(control_name, name),
-            nets_or_steps,
-            should_stop_blob=stop_blob,
-        )
-
-    if _IsNets(nets_or_steps):
-        # In this case, while_step has sub-nets:
-        # [condition_blob_or_net, condition_not_net, nets_or_steps]
-        # If stop_blob is pre-set to True (this may happen when While() is
-        # called twice), the loop will exit after executing
-        # condition_blob_or_net. So we use BoolNet to set stop_blob to
-        # False.
-        bool_net = BoolNet((stop_blob, False))
-        return Do(name + '/While', bool_net, while_step('While-inner'))
-    else:
-        return while_step('While')
-
-
-def Until(name, condition_blob_or_net, nets_or_steps):
-    """
-    Similar to While() but executes nets_or_steps while
-    condition_blob_or_net returns false.
-    """
-    if isinstance(condition_blob_or_net, core.Net):
-        stop_blob = GetConditionBlobFromNet(condition_blob_or_net)
-        nets_or_steps = _PrependNets(nets_or_steps, condition_blob_or_net)
-    else:
-        stop_blob = core.BlobReference(str(condition_blob_or_net))
-
-    return core.scoped_execution_step(
-        _get_next_step_name('Until', name),
-        nets_or_steps,
-        should_stop_blob=stop_blob)
-
-
-def DoWhile(name, condition_blob_or_net, nets_or_steps):
-    """
-    Execute nets_or_steps while condition_blob_or_net returns true. It will
-    execute nets_or_steps before evaluating condition_blob_or_net.
-
-    Args:
-        condition_blob_or_net: if it is an instance of Net, its last
-            external_output must be a single bool.
-        nets_or_steps: an ExecutionStep, a Net, a list of ExecutionSteps, or
-            a list of nets.
-
-    Returns:
-        An ExecutionStep instance.
-    """
-    condition_not_net, stop_blob = NotNet(condition_blob_or_net)
-    if isinstance(condition_blob_or_net, core.Net):
-        nets_or_steps = _AppendNets(
-            nets_or_steps, condition_blob_or_net, condition_not_net)
-    else:
-        nets_or_steps = _AppendNets(nets_or_steps, condition_not_net)
-
-    # If stop_blob is pre-set to True (this may happen when DoWhile() is
-    # called twice), the loop will exit after executing the first net/step
-    # in nets_or_steps. This is not what we want. So we use BoolNet to
-    # set stop_blob to False.
-    bool_net = BoolNet((stop_blob, False))
-    return Do(name + '/DoWhile', bool_net, core.scoped_execution_step(
-        _get_next_step_name('DoWhile-inner', name),
-        nets_or_steps,
-        should_stop_blob=stop_blob,
-    ))
-
-
-def DoUntil(name, condition_blob_or_net, nets_or_steps):
-    """
-    Similar to DoWhile() but executes nets_or_steps while
-    condition_blob_or_net returns false. It will execute
-    nets_or_steps before evaluating condition_blob_or_net.
-
-    Special case: if condition_blob_or_net is a blob and is pre-set to
-    true, then only the first net/step of nets_or_steps will be executed and
-    the loop is exited. So you need to be careful about the initial value of
-    the condition blob when using DoUntil(), especially when DoUntil() is
-    called twice.
-    """
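Every loop helper here desugars to the same runtime contract: an execution step keeps re-running its substeps and exits once a designated stop blob reads true, with the blob checked after each substep (which is why a pre-set stop blob exits the loop early, per the comments above, and why the helpers reset it with BoolNet first). A simplified pure-Python model of that contract, not caffe2's actual scheduler:

def run_step(substeps, ws, should_stop_blob):
    # Re-run the substeps until the stop blob reads True; the blob is
    # checked after every substep, so a freshly refreshed condition can
    # stop the loop before the body runs again.
    while True:
        for substep in substeps:
            substep(ws)
            if ws[should_stop_blob]:
                return

# While(cond, body) becomes: refresh the stop blob (cond + Not), then body.
ws = {'i': 0, 'stop': False}

def refresh_stop(ws):            # stands in for [condition_net, NotNet(...)]
    ws['stop'] = not (ws['i'] < 3)

def body(ws):
    ws['i'] += 1

run_step([refresh_stop, body], ws, 'stop')
assert ws['i'] == 3

DoWhile and DoUntil are the same model with the condition refresh appended after the body instead of prepended before it.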
- """ - if not isinstance(condition_blob_or_net, core.Net): - stop_blob = core.BlobReference(condition_blob_or_net) - return core.scoped_execution_step( - _get_next_step_name('DoUntil', name), - nets_or_steps, - should_stop_blob=stop_blob) - - nets_or_steps = _AppendNets(nets_or_steps, condition_blob_or_net) - stop_blob = GetConditionBlobFromNet(condition_blob_or_net) - - # If stop_blob is pre-set to True (this may happen when DoWhile() is - # called twice), the loop will exit after executing the first net/step - # in nets_or_steps. This is not what we want. So we use BootNet to - # set stop_blob to False. - bool_net = BoolNet((stop_blob, False)) - return Do(name + '/DoUntil', bool_net, core.scoped_execution_step( - _get_next_step_name('DoUntil-inner', name), - nets_or_steps, - should_stop_blob=stop_blob, - )) - - -def Switch(name, *conditions): - """ - Execute the steps for which the condition is true. - Each condition is a tuple (condition_blob_or_net, nets_or_steps). - Note: - 1. Multi steps can be executed if their conditions are true. - 2. The conditions_blob_or_net (if it is Net) of all steps will be - executed once. - - Examples: - - Switch('name', (cond_1, net_1), (cond_2, net_2), ..., (cond_n, net_n)) - - Switch('name', [(cond_1, net1), (cond_2, net_2), ..., (cond_n, net_n)]) - - Switch('name', (cond_1, net_1)) - """ - conditions = _MakeList(conditions) - return core.scoped_execution_step( - _get_next_step_name('Switch', name), - [_RunOnceIf(name + '/Switch', cond, step) for cond, step in conditions]) - - -def SwitchNot(name, *conditions): - """ - Similar to Switch() but execute the steps for which the condition is False. - """ - conditions = _MakeList(conditions) - return core.scoped_execution_step( - _get_next_step_name('SwitchNot', name), - [_RunOnceIfNot(name + '/SwitchNot', cond, step) - for cond, step in conditions]) - - -def If(name, condition_blob_or_net, - true_nets_or_steps, false_nets_or_steps=None): - """ - condition_blob_or_net is first evaluated or executed. If the condition is - true, true_nets_or_steps is then executed, otherwise, false_nets_or_steps - is executed. - - If condition_blob_or_net is Net, the condition is its last external_output - that must be a single bool. And this Net will be executred before both - true/false_nets_or_steps so as to get the condition. 
- """ - if not false_nets_or_steps: - return _RunOnceIf(name + '/If', - condition_blob_or_net, true_nets_or_steps) - - if isinstance(condition_blob_or_net, core.Net): - condition_blob = GetConditionBlobFromNet(condition_blob_or_net) - else: - condition_blob = condition_blob_or_net - - return Do( - name + '/If', - _RunOnceIf(name + '/If-true', - condition_blob_or_net, true_nets_or_steps), - _RunOnceIfNot(name + '/If-false', condition_blob, false_nets_or_steps) - ) - - -def IfNot(name, condition_blob_or_net, - true_nets_or_steps, false_nets_or_steps=None): - """ - If condition_blob_or_net returns false, executes true_nets_or_steps, - otherwise executes false_nets_or_steps - """ - if not false_nets_or_steps: - return _RunOnceIfNot(name + '/IfNot', - condition_blob_or_net, true_nets_or_steps) - - if isinstance(condition_blob_or_net, core.Net): - condition_blob = GetConditionBlobFromNet(condition_blob_or_net) - else: - condition_blob = condition_blob_or_net - - return Do( - name + '/IfNot', - _RunOnceIfNot(name + '/IfNot-true', - condition_blob_or_net, true_nets_or_steps), - _RunOnceIf(name + '/IfNot-false', condition_blob, false_nets_or_steps) - ) diff --git a/caffe2/python/control_ops_grad.py b/caffe2/python/control_ops_grad.py deleted file mode 100644 index a0e85f4d0bc1..000000000000 --- a/caffe2/python/control_ops_grad.py +++ /dev/null @@ -1,706 +0,0 @@ -## @package control_ops_grad -# Module caffe2.python.control_ops_grad - - - - - -from caffe2.proto import caffe2_pb2 - - -def gen_do_gradient(op, g_output): - """ - Generates gradient Do operator, given forward Do op and a list - of gradient blobs corresponding to forward op's outputs - Returns a gradient op and a list of blobs corresponding to input gradients - """ - from caffe2.python.core import BlobReference - subnet, outer_to_inner_map, inner_to_outer_map, workspace_blob_name = \ - _do_op_sanity_check_and_process(op) - - assert len(g_output) == len(op.output), \ - "Different number of gradient blobs and Do op outputs" - - grad_ops, deduped_g_output = dedupe_g_output(op, g_output) - g_output = deduped_g_output - - # From the outer net point of view: - # Do is an operator that has some number of inputs and outputs; - # we have to generate a gradient operator that writes into - # corresponding input gradient blobs and has access to inputs, outputs - # and gradient output blobs - # From the inner net point of view: - # Do is an operator with a subnet and blob bindings, - # we need to forward Do's output blob gradients into inner workspace, - # use them to run backward pass generation and forward Do's input blob - # gradients back into outer workspace - - op_output = [str(o) for o in op.output] - op_output = op_output[:-1] # remove workspace pointer blob - op_input = [str(i) for i in op.input] - op_input = op_input[:-1] # remove workspace pointer blob - - ordered_inner_output_blob_names = [outer_to_inner_map[o] for o in op_output] - - backward_pass_initial_grad_map = {} - initial_grad_map = {} - for inner_output_name, outer_grad_output_name in \ - zip(ordered_inner_output_blob_names, g_output): - # link inner_output_name to corresponding inner_grad_output_name for - # backward pass generation; - if outer_grad_output_name: - inner_grad_output_name = inner_output_name + "/_DO_OPERATOR_INNER_GRAD_" - backward_pass_initial_grad_map[BlobReference(inner_output_name)] = \ - BlobReference(inner_grad_output_name) - initial_grad_map[inner_grad_output_name] = str(outer_grad_output_name) - assert len(initial_grad_map) > 0, "Empty initial gradient map 
for Do op" - - inner_grad_ops, inner_grad_names_map = _gen_subgradient_pass( - subnet, backward_pass_initial_grad_map) - - if len(inner_grad_ops) == 0: - return [], [] - - grad_copy_ops = [] - g_input = [] - new_op_outputs = [] - new_blob_bindings = {} - for outer_input_name in op_input: - inner_input_name = outer_to_inner_map[outer_input_name] - if inner_input_name in inner_grad_names_map: - inner_grad_input_name = inner_grad_names_map[inner_input_name] - outer_grad_input_name = outer_input_name + "_grad" - - # It is possible that inner_grad_input_name will need to be - # linked to another outer blob. For example: - # - # // y - param initialized in init_net - # x = ... - # z = ... - # with ops.IfNet(...): - # ops.Add([z, x], y) # inner Do block - # loss = f(..., y, ...) - # - # In this case x, y and z are external for the inner Do block, - # the inputs of the Do block are z and x and the output is y. - # When computing the gradient of input x given the gradient - # of output y it's easy to see that they are equal. - # During the generation of gradient Do operator, we link - # external gradient y (y_grad) to the internal name - # (y/_DO_OPERATOR_INNER_GRAD_) and generate the backward pass - # for the internal Do net. As a result we get gradient operators - # for the gradient Do and gradient map that maps internal Do - # blobs to their computed gradients. - # In this example, gradient map may have blob x linked to - # gradient blob y/_DO_OPERATOR_INNER_GRAD_. - # We should export gradient for x outside of Do, so - # we add a blob mapping from inner gradient blob - # (y/_DO_OPERATOR_INNER_GRAD_) to a new outer name (x_grad). - # - # (Note: since we use transparent blob mapping between outer and - # inner (Do's) workspace, these operations do not involve copying - # but are merely using blobs in outer workspace in the Do's operator - # workspace under (possibly) different names) - # - # At the same time, we need to add a blob mapping from inner name - # y/_DO_OPERATOR_INNER_GRAD_ to the outer blob y_grad - # Hence in this case, we cannot use existing blob mapping scheme - # that requires a bijection between subset of inner blob names and - # a set of all (Do's input and output) outer blob names - - # TODO(iliacher): Remove unnecessary blob copying - - new_inner_grad_input_name = \ - inner_input_name + "/_DO_OPERATOR_INNER_GRAD_COPY_" - grad_copy_ops.append(_prepare_blob_copy_op( - inner_grad_input_name, new_inner_grad_input_name)) - - new_blob_bindings[new_inner_grad_input_name] = outer_grad_input_name - new_op_outputs.append(outer_grad_input_name) - g_input.append(outer_grad_input_name) - else: - g_input.append(None) - - new_op_inputs = [] - overwritten_names = set() - saved_local_blob_names = set() - for grad_op in inner_grad_ops: - grad_op_input = [str(i) for i in grad_op.input] - grad_op_output = [str(o) for o in grad_op.output] - for grad_op_input_name in grad_op_input: - if grad_op_input_name in overwritten_names: - continue - # check if this is an external blob - outer_name = inner_to_outer_map.get(grad_op_input_name, None) - if not outer_name: - # check if this is an external gradient blob - outer_name = initial_grad_map.get(grad_op_input_name, None) - if outer_name: - outer_name = str(outer_name) - if outer_name not in new_op_inputs: - new_op_inputs.append(outer_name) - - new_blob_bindings[grad_op_input_name] = outer_name - else: - # this is a local blob, we'll get it's value from - # a saved forward op workspace - saved_local_blob_names.add(grad_op_input_name) - 
overwritten_names.update(grad_op_output) - - # add inner gradient copy ops - inner_grad_ops += grad_copy_ops - - gradient_do_def = _prepare_gradient_do_op( - fwd_op=op, - fwd_net=subnet, - grad_ops=inner_grad_ops, - inputs=new_op_inputs, - outputs=new_op_outputs, - blob_bindings=new_blob_bindings, - saved_fwd_blobs=saved_local_blob_names, - workspace_blob_name=workspace_blob_name) - grad_ops.append(gradient_do_def) - - _do_op_sanity_check_and_process(gradient_do_def) - - return grad_ops, g_input - - -def dedupe_g_output(op, g_output): - # When generation a gradient op it's possible to receive the same gradient - # blob corresponding to different forward op output blobs, Do operator - # requires a bijection between inner and outer names, make sure we do - # deduplication - grad_ops = [] - deduped_g_output = [] - init_grad_map = {} - for output_name, grad_name in zip(op.output, g_output): - if not grad_name: - deduped_g_output.append(grad_name) - continue - - if output_name in init_grad_map: - deduped_g_output.append(init_grad_map[output_name]) - else: - if grad_name not in init_grad_map.values(): - init_grad_map[output_name] = grad_name - deduped_g_output.append(grad_name) - else: - deduped_grad_name = output_name + "_" + grad_name + "_DEDUP" - assert deduped_grad_name not in init_grad_map.values() - grad_copy_op = caffe2_pb2.OperatorDef() - grad_copy_op.type = "Copy" - grad_copy_op.input.extend([grad_name]) - grad_copy_op.output.extend([deduped_grad_name]) - grad_ops.append(grad_copy_op) - deduped_g_output.append(deduped_grad_name) - init_grad_map[output_name] = deduped_grad_name - return grad_ops, deduped_g_output - - -def gen_while_gradient(op, g_output): - """ - Generates gradient While operator - """ - from caffe2.python.core import BlobReference - assert op.type == "While", "Expected While op" - assert len(op.input) > 0, "Expected at least one input in While op" - - assert len(op.output) == len(g_output), \ - "Different number of gradient blobs and While op outputs" - - grad_ops, deduped_g_output = dedupe_g_output(op, g_output) - g_output = deduped_g_output - - init_grad_map = {} - op_output = [str(o) for o in op.output] - for output_name, grad_output_name in zip(op_output, g_output): - if grad_output_name: - init_grad_map[BlobReference(output_name)] = \ - BlobReference(grad_output_name) - assert len(init_grad_map) > 0, "Empty initial gradient map for While op" - - loop_net = _get_net_argument(op, "loop_net") - assert loop_net, "Expected loop subnet in While op" - assert len(loop_net.op) == 1 and loop_net.op[0].type == "Do", \ - "Gradient While op requires single Do op as a loop body" - do_op = loop_net.op[0] - do_args = _get_do_arguments(do_op) - assert "reuse_workspace" not in do_args or not do_args["reuse_workspace"], \ - "Gradient While op requires Do loop body op without reuse_workspace set" - - assert len(do_op.output) > 0, "Expected Do op with at least one output" - workspace_blob = do_op.output[-1] - - loop_grad_net, loop_grad_map, loop_input_names, loop_output_names = \ - _gen_subnet_gradient(loop_net, init_grad_map) - assert loop_grad_net, "Failed to get gradient net for loop body in While op" - - grad_ops += _prepare_gradient_while_ops( - fwd_op=op, - input_names=loop_input_names, - output_names=loop_output_names, - loop_grad_net=loop_grad_net, - workspace_blob=workspace_blob, - init_grad_map=init_grad_map, - loop_grad_map=loop_grad_map) - - op_input = [str(i) for i in op.input] - g_input = [loop_grad_map.get(i, None) for i in op_input] - return grad_ops, g_input - - -# 
Constructs gradient While op, arguments: -# fwd_op - forward While op -# input_names - input blob names for a gradient op -# output_names - output blob names for a gradient op -# loop_grad_net - gradient loop body net -# workspace_blob - blob that holds forward workspaces stack -# init_grad_map - initial gradient to forward blob map -# loop_grad_map - gradient blob map for loop's body -def _prepare_gradient_while_ops( - fwd_op, input_names, output_names, loop_grad_net, workspace_blob, - init_grad_map, loop_grad_map): - gradient_while_def = caffe2_pb2.OperatorDef() - gradient_while_def.CopyFrom(fwd_op) - if gradient_while_def.name: - gradient_while_def.name += "_grad" - - loop_net_arg = caffe2_pb2.Argument() - loop_net_arg.name = "loop_net" - loop_net_arg.n.CopyFrom(loop_grad_net) - - cond_net_arg = caffe2_pb2.Argument() - cond_net_arg.name = "cond_net" - from caffe2.python.core import Net, BlobReference - # Construct condition net - check that there're still forward workspaces - # left using HasScope op - cond_net = Net('gradient_loop_cond_net') - cond_init_net = Net('gradient_loop_cond_net_init') - cond_blob = cond_net.NextScopedBlob(cond_net.Name() + '/cond') - cond_init_net.HasScope(workspace_blob, cond_blob) - cond_net.HasScope(workspace_blob, cond_blob) - for blob, init_grad_blob in init_grad_map.items(): - blob_name = str(blob) - init_grad_blob_name = str(init_grad_blob) - if blob_name in loop_grad_map and \ - loop_grad_map[blob_name] != init_grad_blob_name: - cond_net.Copy( - BlobReference(loop_grad_map[blob_name]), init_grad_blob) - cond_init_net.Copy( - init_grad_blob, BlobReference(loop_grad_map[blob_name])) - cond_net_arg.n.CopyFrom(cond_net.Proto()) - - del gradient_while_def.arg[:] - gradient_while_def.arg.extend([loop_net_arg, cond_net_arg]) - - del gradient_while_def.control_input[:] - del gradient_while_def.input[:] - gradient_while_def.input.extend( - [str(cond_blob).encode('utf-8')] + list(input_names)) - del gradient_while_def.output[:] - gradient_while_def.output.extend(output_names) - gradient_while_def.is_gradient_op = True - return [o for o in cond_init_net.Proto().op] + [gradient_while_def] - - -def _get_do_arguments(do_op): - assert do_op.type == "Do", "Expected Do op" - args = {} - for arg in do_op.arg: - if not arg.name: - continue - if arg.name == "net": - assert arg.n, "Expected non empty net argument" - args["net"] = arg.n - elif arg.name == "reuse_workspace": - assert arg.i, "Expected non empty reuse_workspace argument" - args["reuse_workspace"] = bool(arg.i) - elif arg.name == "inner_blobs": - assert arg.strings, "Expected non empty inner_blobs argument" - args["inner_blobs"] = arg.strings - elif arg.name == "outer_blobs_idx": - assert arg.ints, "Expected non empty outer_blobs_idx argument" - args["outer_blobs_idx"] = arg.ints - return args - - -def gen_if_gradient(op, g_output): - """ - Generates gradient If operator, given forward If op and a list - of gradient blobs corresponding to forward op's outputs - Returns a gradient op and a list of blobs corresponding to input gradients - """ - from caffe2.python.core import BlobReference - assert op.type == "If", "Expected If op" - # first input is the condition blob - assert len(op.input) > 0, "Expected at least one input in If op" - - assert len(op.output) == len(g_output), \ - "Different number of gradient blobs and If op outputs" - - grad_ops, deduped_g_output = dedupe_g_output(op, g_output) - g_output = deduped_g_output - - init_grad_map = {} # map from if's output blob to output gradient blob - op_input = 
[str(i) for i in op.input] - op_output = [str(o) for o in op.output] - for output_name, grad_output_name in zip(op_output, g_output): - if grad_output_name: - init_grad_map[BlobReference(output_name)] = \ - BlobReference(grad_output_name) - # shouldn't call without at least one output gradient available - assert len(init_grad_map) > 0, "Empty initial gradient map for If op" - - grad_map = {} # map from blob to gradient blob - then_net = _get_net_argument(op, "then_net") - assert then_net, "Expected then subnet in If op" - then_grad_net, then_grad_map, then_input_names, then_output_names = \ - _gen_subnet_gradient(then_net, init_grad_map) - assert then_grad_net, "Failed to get gradient net for then in If op" - grad_map.update(then_grad_map) - - else_input_names = set() - else_output_names = set() - else_grad_map = {} - else_grad_net = None - else_net = _get_net_argument(op, "else_net") - if else_net: - else_grad_net, else_grad_map, else_input_names, else_output_names = \ - _gen_subnet_gradient(else_net, init_grad_map) - assert else_grad_net, "Failed to get gradient net for else in If op" - # consider case: else doesn't update blob's gradient and keeps original - # from init_grad_map, but then updates the gradient - for else_blob, else_grad_blob in else_grad_map.items(): - if else_blob in then_grad_map: - then_grad_blob = then_grad_map[else_blob] - # if both then and else branches have grad blob name for the same - # blob and grad names are different, then one of the branches - # doesn't use blob and has original grad blob name in it's grad map, - # and another branch uses blob and has _grad name - # in it's grad map (might be different from original grad blob) - if then_grad_blob != else_grad_blob: - init_grad_name = init_grad_map[else_blob] \ - if else_blob in init_grad_map else None - - if then_grad_blob == init_grad_name: - grad_map[else_blob] = else_grad_blob - elif else_grad_blob == init_grad_name: - grad_map[else_blob] = then_grad_blob - else: - raise "Unexpected grad blob name " + else_blob + ", " + \ - else_grad_blob + ", " + then_grad_blob - else: - grad_map[else_blob] = else_grad_blob - - # make sure gradients of blobs that were not computed - # by the selected if's branch are initialized with zeros - then_other_output_names = \ - then_output_names - (then_output_names & else_output_names) - then_other_grad_output_names = set( - [o for o in then_other_output_names if o in then_grad_map.values()]) - zero_then = _gen_grad_zero_init_ops( - init_grad_map, then_grad_map, then_other_grad_output_names) - if else_grad_net: - else_grad_net.op.extend(zero_then) - elif len(zero_then) > 0: - else_grad_net = caffe2_pb2.NetDef() - else_grad_net.CopyFrom(then_grad_net) - if else_grad_net.name: - else_grad_net.name += "_auto_else_zero_blobs_" - del else_grad_net.op[:] - else_grad_net.op.extend(zero_then) - del else_grad_net.external_input[:] - del else_grad_net.external_output[:] - - else_other_output_names = \ - else_output_names - (then_output_names & else_output_names) - else_other_grad_output_names = set( - [o for o in else_other_output_names if o in else_grad_map.values()]) - zero_else = _gen_grad_zero_init_ops( - init_grad_map, else_grad_map, else_other_grad_output_names) - then_grad_net.op.extend(zero_else) - - output_names = list(then_output_names | else_output_names) - input_names = then_input_names | else_input_names - # make sure condition blob is the first in the list - input_names = [op_input[0]] + list(input_names - set(op_input[0])) - gradient_if_def = _prepare_gradient_if_op( - 
fwd_op=op, - input_names=input_names, - output_names=output_names, - then_grad_net=then_grad_net, - else_grad_net=else_grad_net) - g_input = [grad_map.get(i, None) for i in op_input] - return grad_ops + [gradient_if_def], g_input - - -def _gen_subnet_gradient(subnet, init_grad): - grad_ops, grad_names_map = _gen_subgradient_pass( - subnet, init_grad) - - output_names = set() - input_names = set() - for grad_op in grad_ops: - for grad_op_input in grad_op.input: - if str(grad_op_input) not in output_names: - input_names.add(str(grad_op_input)) - for grad_op_output in grad_op.output: - output_names.add(str(grad_op_output)) - - gradient_net_def = caffe2_pb2.NetDef() - gradient_net_def.CopyFrom(subnet) - if gradient_net_def.name: - gradient_net_def.name += "_grad" - del gradient_net_def.op[:] - gradient_net_def.op.extend(grad_ops) - del gradient_net_def.external_input[:] - del gradient_net_def.external_output[:] - - return gradient_net_def, grad_names_map, input_names, output_names - - -def _get_net_argument(op, net_name): - for arg in op.arg: - if arg.name and arg.name == net_name: - assert arg.n, "Expected non empty net argument " + net_name - return arg.n - return None - - -def getNetArgument(op, net_name): - """A wrapper for external call""" - return _get_net_argument(op, net_name) - - -def _gen_subgradient_pass(subnet, init_grad): - from caffe2.python.core import IR - subnet_ir = IR(subnet.op) - grad_ops, grad_blob_map = \ - subnet_ir.GetBackwardPass(init_grad) - grad_names_map = {} - for b, g in grad_blob_map.items(): - grad_names_map[str(b)] = str(g) - return grad_ops, grad_names_map - - -def _do_op_sanity_check_and_process(op): - assert op.type == "Do", "Expected Do op" - - subnet = _get_net_argument(op, "net") - assert subnet, "No net argument found in Do op" - - inner_blobs = None - outer_blobs_idx = None - for arg in op.arg: - if arg.name and arg.name == "inner_blobs": - assert not inner_blobs, "inner_blobs redefinition" - assert arg.strings and len(arg.strings) > 0, \ - "Empty inner_blobs argument in Do op" - inner_blobs = [s.decode('utf-8') for s in arg.strings] - if arg.name and arg.name == "outer_blobs_idx": - assert not outer_blobs_idx, "outer_blobs_idx redefinition" - assert arg.ints and len(arg.ints) > 0, \ - "Empty outer_blobs_idx argument in Do op" - outer_blobs_idx = arg.ints - if inner_blobs and outer_blobs_idx: - break - - assert inner_blobs, "No inner_blobs argument found in Do op" - assert outer_blobs_idx, "No outer_blobs_idx argument found in Do op" - - assert len(inner_blobs) == len(outer_blobs_idx), \ - "Arguments inner_blobs and outer_blobs_idx of different length in Do op" - - all_inner_blobs = set(inner_blobs) - assert len(all_inner_blobs) == len(inner_blobs), \ - "Found duplicates in inner_blobs in Do op" - - op_input = [str(i) for i in op.input] - assert len(op_input) > 0, "Expected at least one input blob" - # remove last input blob that holds pointer to workspace - input_workspace_blob_name = op_input[-1] - op_input = op_input[:-1] - - op_output = [str(o) for o in op.output] - assert len(op_output) > 0, "Expected at least one output blob" - # remove last output blob that holds pointer to workspace - workspace_blob_name = op_output[-1] - assert input_workspace_blob_name == workspace_blob_name, \ - "Expected same input/output workspace blob" - op_output = op_output[:-1] - - all_op_input_blob_names = set(op_input) - assert len(all_op_input_blob_names) == len(op_input), \ - "Found duplicates in Do op inputs" - all_op_output_blob_names = set(op_output) - assert 
len(all_op_output_blob_names) == len(op_output), \ - "Found duplicates in Do op outputs" - - ordered_outer_blob_names = op_input + op_output - all_outer_blob_names = set(ordered_outer_blob_names) - used_outer_blob_names = set() - outer_to_inner_map = {} - inner_to_outer_map = {} - for inner_name, outer_blob_idx in zip(inner_blobs, outer_blobs_idx): - assert outer_blob_idx >= 0 and \ - outer_blob_idx < len(ordered_outer_blob_names), \ - "Outer blob index is out of bounds in Do op" - outer_name = ordered_outer_blob_names[outer_blob_idx] - assert outer_name not in used_outer_blob_names, \ - "Reusage of outer blob name " + outer_name + " in Do op" - used_outer_blob_names.add(outer_name) - outer_to_inner_map[outer_name] = inner_name - inner_to_outer_map[inner_name] = outer_name - - assert len(used_outer_blob_names) == len(all_outer_blob_names), \ - "Not all outer blob names are used in blob bindings in Do op" - - return subnet, outer_to_inner_map, inner_to_outer_map, workspace_blob_name - - -def _prepare_blob_copy_op(from_name, to_name): - copy_op_def = caffe2_pb2.OperatorDef() - copy_op_def.type = "Copy" - copy_op_def.input.extend([from_name]) - copy_op_def.output.extend([to_name]) - return copy_op_def - - -def _prepare_gradient_do_op( - fwd_op, fwd_net, grad_ops, inputs, outputs, blob_bindings, saved_fwd_blobs, - workspace_blob_name): - gradient_net_def = caffe2_pb2.NetDef() - gradient_net_def.CopyFrom(fwd_net) - if gradient_net_def.name: - gradient_net_def.name += "_grad" - del gradient_net_def.op[:] - gradient_net_def.op.extend(grad_ops) - del gradient_net_def.external_input[:] - del gradient_net_def.external_output[:] - - gradient_do_def = caffe2_pb2.OperatorDef() - gradient_do_def.CopyFrom(fwd_op) - if gradient_do_def.name and len(gradient_do_def.name) > 0: - gradient_do_def.name += "_grad" - - del gradient_do_def.input[:] - gradient_do_def.input.extend(inputs) - # workspace pointer blob - gradient_do_def.input.append(workspace_blob_name) - del gradient_do_def.output[:] - gradient_do_def.output.extend(outputs) - # workspace pointer blob - gradient_do_def.output.append(workspace_blob_name) - - net_arg = caffe2_pb2.Argument() - net_arg.name = "net" - net_arg.n.CopyFrom(gradient_net_def) - - ordered_new_outer_names = inputs + outputs - inner_blobs = blob_bindings.keys() - new_outer_blobs_idx = [ordered_new_outer_names.index(blob_bindings[b]) - for b in inner_blobs] - - inner_blobs_arg = caffe2_pb2.Argument() - inner_blobs_arg.name = "inner_blobs" - inner_blobs_arg.strings.extend([b.encode('utf-8') for b in inner_blobs]) - - outer_blobs_idx_arg = caffe2_pb2.Argument() - outer_blobs_idx_arg.name = "outer_blobs_idx" - outer_blobs_idx_arg.ints.extend(new_outer_blobs_idx) - - saved_blobs_arg = caffe2_pb2.Argument() - saved_blobs_arg.name = "saved_fwd_blobs" - saved_blobs_arg.strings.extend( - [b.encode('utf-8') for b in saved_fwd_blobs]) - - del gradient_do_def.arg[:] - gradient_do_def.arg.extend([ - net_arg, inner_blobs_arg, outer_blobs_idx_arg, saved_blobs_arg]) - del gradient_do_def.control_input[:] - - gradient_do_def.is_gradient_op = True - - return gradient_do_def - - -def _gen_grad_zero_init_ops(init_grad_map, grad_map, grad_output_names): - grad_init_ops = [] - for grad_output in grad_output_names: - # get the corresponding output name blob and use it in ConstantFill - # so that grad_output has the same shape - output_name = None - for o, g in grad_map.items(): - if g == grad_output: - output_name = o - break - assert output_name, "Unknown gradient output " + grad_output - - grad_init_op 
= None - # make sure that we do not overwrite existing gradients with zeros - if output_name in init_grad_map: - init_grad_name = init_grad_map[output_name] - # in case we use a different gradient blob name, copy gradient - if init_grad_name != grad_output: - grad_init_op = caffe2_pb2.OperatorDef() - grad_init_op.type = "Copy" - grad_init_op.input.extend([str(init_grad_name)]) - grad_init_op.output.extend([str(grad_output)]) - else: - grad_init_op = caffe2_pb2.OperatorDef() - grad_init_op.type = "ConstantFill" - grad_init_op.input.extend([output_name]) - grad_init_op.output.extend([grad_output]) - value_arg = caffe2_pb2.Argument() - value_arg.name = "value" - value_arg.f = 0.0 - grad_init_op.arg.extend([value_arg]) - - if grad_init_op: - grad_init_ops.append(grad_init_op) - return grad_init_ops - - -def _prepare_gradient_if_op( - fwd_op, input_names, output_names, then_grad_net, else_grad_net): - gradient_if_def = caffe2_pb2.OperatorDef() - gradient_if_def.CopyFrom(fwd_op) - del gradient_if_def.input[:] - gradient_if_def.input.extend(input_names) - del gradient_if_def.output[:] - gradient_if_def.output.extend(output_names) - - then_net_arg = caffe2_pb2.Argument() - then_net_arg.name = "then_net" - then_net_arg.n.CopyFrom(then_grad_net) - gradient_args = [then_net_arg] - if else_grad_net: - else_net_arg = caffe2_pb2.Argument() - else_net_arg.name = "else_net" - else_net_arg.n.CopyFrom(else_grad_net) - gradient_args.append(else_net_arg) - - del gradient_if_def.arg[:] - gradient_if_def.arg.extend(gradient_args) - if gradient_if_def.name: - gradient_if_def.name += "_grad" - del gradient_if_def.control_input[:] - gradient_if_def.is_gradient_op = True - return gradient_if_def - - -def disambiguate_grad_if_op_output(grad_op, idx, new_grad_output): - then_net = _get_net_argument(grad_op, "then_net") - old_grad_out_match = grad_op.output[idx] - for op in then_net.op: - for i, out in enumerate(op.output): - if out == old_grad_out_match: - op.output[i] = new_grad_output - else_net = _get_net_argument(grad_op, "else_net") - if else_net: - for op in else_net.op: - for i, out in enumerate(op.output): - if out == old_grad_out_match: - op.output[i] = new_grad_output - grad_op.output[idx] = new_grad_output diff --git a/caffe2/python/control_ops_grad_test.py b/caffe2/python/control_ops_grad_test.py deleted file mode 100644 index f637e38a5e33..000000000000 --- a/caffe2/python/control_ops_grad_test.py +++ /dev/null @@ -1,49 +0,0 @@ - - - - - -import unittest -from caffe2.python import core, test_util, workspace -from caffe2.python.control_ops_grad import disambiguate_grad_if_op_output -from caffe2.python.model_helper import ModelHelper -import numpy as np - - -class TestControl(test_util.TestCase): - def test_disambiguate_grad_if_op_output(self): - workspace.FeedBlob("cond", np.array(True)) - workspace.FeedBlob("then_grad", np.array(1)) - workspace.FeedBlob("else_grad", np.array(2)) - - then_model = ModelHelper(name="then_test_model") - then_model.net.Copy("then_grad", "input_grad") - - else_model = ModelHelper(name="else_test_model") - else_model.net.Copy("else_grad", "else_temp_grad") - else_model.net.Copy("else_temp", "input_grad") - - # to BuildGradientGenerators, in forward pass, we need else temp - # as one of the output. 
Which later on results in a grad op like this: - grad_op = core.CreateOperator( - "If", - ["cond", "then_grad", "else_grad"], - ["input_grad", "else_temp_grad"], - then_net=then_model.net.Proto(), - else_net=else_model.net.Proto(), - ) - - # in certain cases, another branch of the net also generates input_grad - # and we call _DisambiguateGradOpOutput in core.py - new_grad_output = "input_grad" + "_autosplit_" + "0" - disambiguate_grad_if_op_output(grad_op, 0, new_grad_output) - self.assertEqual(grad_op.output[0], new_grad_output) - for arg in grad_op.arg: - if arg.name == "else_net": - self.assertEqual(arg.n.op[1].output[0], new_grad_output) - else: - self.assertEqual(arg.name, "then_net") - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/control_ops_util.py b/caffe2/python/control_ops_util.py deleted file mode 100644 index cfff82de318b..000000000000 --- a/caffe2/python/control_ops_util.py +++ /dev/null @@ -1,263 +0,0 @@ -## @package control_ops_util -# Module caffe2.python.control_ops_util - - - - - -from caffe2.python import core - - -def get_external_blob_names(net, lexical_scope): - """ - Returns a set of blobs a given net depends on and a set of - output blobs that are written by the net - Inputs: - net - net to return input/output blobs for; - lexical_scope - all external blob names visible to the net - """ - # Use the blobs that are actually read/written to as external inputs/outputs - net_proto = net.Proto() - net_ssa, _ = core.get_ssa(net_proto) - input_names = core.get_undefined_blobs(net_ssa) - for input_name in input_names: - assert str(input_name) in lexical_scope, \ - "Input blob " + input_name + " is undefined" - - output_names = set() - for op in net_proto.op: - for output in op.output: - if output in lexical_scope: - output_names.add(output) - - return input_names, output_names - - -def add_if_op(if_net, cond_blob, lexical_scope, then_net, else_net=None): - """ - A helper function to add an If op to the net. - Automatically determines whether blobs in the then/else subnets are external - (from the outer workspace) or local (visible only inside subnet's workspace) - based on lexical scope - set of all outer blob names visible to the 'If' - operator. All the blobs in then/else subnets with names matching a name in lexical - scope and all the blobs that are first used as the operators' inputs are - considered outer blobs - these blobs must exist in the outer workspace, - then/else subnets can read their values and new values written into these blobs - will be visible outside of the 'If' operator. All other blobs are local - exist - only within inner workspaces for then/else. 
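The outer/local split described here comes from get_external_blob_names above: a blob that some operator reads before any operator in the subnet has written it is an external input, and a written blob whose name also appears in the lexical scope is an externally visible output. A compact plain-Python model of that rule, standing in for core.get_ssa and core.get_undefined_blobs:

def external_blobs(ops, lexical_scope):
    # ops is a list of (inputs, outputs) name tuples, in execution order.
    defined, inputs, outputs = set(), set(), set()
    for op_inputs, op_outputs in ops:
        inputs |= {b for b in op_inputs if b not in defined}
        defined |= set(op_outputs)
        outputs |= {b for b in op_outputs if b in lexical_scope}
    return inputs, outputs

ops = [(('x',), ('tmp',)),        # tmp = f(x)
       (('tmp', 'y'), ('y',))]    # y = g(tmp, y)
assert external_blobs(ops, {'x', 'y', 'z'}) == ({'x', 'y'}, {'y'})

The real helper additionally asserts that every such input exists in the lexical scope; a purely local name like tmp never escapes the branch workspace.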
- Inputs: - if_net - net to add an If op to; - cond_blob - scalar bool blob reference, used as If condition; - lexical_scope - a set of outer blob names visible to then/else branches; - then_net/else_net - nets (core.Net) for then/else branches - """ - then_input_blob_names, then_output_blob_names = get_external_blob_names( - then_net, lexical_scope) - - else_input_blob_names = set() - else_output_blob_names = set() - if else_net: - else_input_blob_names, else_output_blob_names = get_external_blob_names( - else_net, lexical_scope) - - input_blob_names = then_input_blob_names | else_input_blob_names - output_blob_names = then_output_blob_names | else_output_blob_names - - if_inputs = [cond_blob] - if_inputs += [core.BlobReference(name=b, net=None) for b in input_blob_names] - if_outputs = [core.BlobReference(name=b, net=None) for b in output_blob_names] - - do_then_net = core.Net('do_then_net') - - then_input_blobs = \ - [core.BlobReference(name=b, net=None) for b in then_input_blob_names] - then_output_blobs = \ - [core.BlobReference(name=b, net=None) for b in then_output_blob_names] - then_input_output_names_ordered = [ - str(b) for b in (then_input_blobs + then_output_blobs)] - - then_outer_blob_names = list(then_input_blob_names | then_output_blob_names) - then_outer_blob_names_idx = [ - then_input_output_names_ordered.index(b) for b in then_outer_blob_names] - - # make sure to use net's name to have unique blob name across multiple subnets - do_then_workspace_blob = if_net.NextScopedBlob(if_net.Name() + '/workspace_if_then') - then_input_blobs.append(do_then_workspace_blob) - then_output_blobs.append(do_then_workspace_blob) - # make sure that added workspace pointer blobs are in if inputs/outputs - if_inputs.append(do_then_workspace_blob) - if_outputs.append(do_then_workspace_blob) - - do_then_net.Do( - then_input_blobs, - then_output_blobs, - net=then_net.Proto(), - inner_blobs=then_outer_blob_names, - outer_blobs_idx=then_outer_blob_names_idx) - do_then_net.AddExternalOutput(*then_output_blobs) - - if_args = {} - if_args['then_net'] = do_then_net.Proto() - - do_else_workspace_blob = None - if else_net: - do_else_net = core.Net('do_else_net') - - else_input_blobs = \ - [core.BlobReference(name=b, net=None) for b in else_input_blob_names] - else_output_blobs = \ - [core.BlobReference(name=b, net=None) for b in else_output_blob_names] - else_input_output_names_ordered = [ - str(b) for b in (else_input_blobs + else_output_blobs)] - - else_outer_blob_names = list(else_input_blob_names | else_output_blob_names) - else_outer_blob_names_idx = [ - else_input_output_names_ordered.index(b) for b in else_outer_blob_names] - - do_else_workspace_blob = \ - if_net.NextScopedBlob(if_net.Name() + '/workspace_if_else') - else_input_blobs.append(do_else_workspace_blob) - else_output_blobs.append(do_else_workspace_blob) - # make sure that added workspace pointer blobs are in if inputs/outputs - if_inputs.append(do_else_workspace_blob) - if_outputs.append(do_else_workspace_blob) - - do_else_net.Do( - else_input_blobs, - else_output_blobs, - net=else_net.Proto(), - inner_blobs=else_outer_blob_names, - outer_blobs_idx=else_outer_blob_names_idx) - do_else_net.AddExternalOutput(*else_output_blobs) - if_args['else_net'] = do_else_net.Proto() - - if_net.CreateScope([], [do_then_workspace_blob]) - if do_else_workspace_blob: - if_net.CreateScope([], [do_else_workspace_blob]) - if_net.If(if_inputs, if_outputs, **if_args) - if_net.AddExternalOutput(*if_outputs) - - -def add_while_op( - while_net, cond_blob, 
lexical_scope, loop_body_net, condition_body_net=None): - """ - A helper function to add a While op to the net. Same rules for determining - outer and inner blobs as for the 'If' operator apply for the 'While' operator - loop and condition subnets. If specified, condition net is executed in a separate - workspace before the first and after each iteration, the last operator must have - a single scalar boolean output that is written into the condition blob. - Inputs: - while_net - net to add a While op to; - cond_blob - scalar bool blob reference, used as a stop condition; - lexical_scope - a set of outer blob names visible to the loop's body; - loop_body_net - net to execute on each iteration; - condition_body_net - net to compute condition value - """ - input_blob_names, output_blob_names = get_external_blob_names( - loop_body_net, lexical_scope) - - # Since it's possible that loop is not going to run even once - # we have to add loop's external outputs into inputs - input_blob_names |= output_blob_names - - loop_inputs = [core.BlobReference(name=b, net=None) for b in input_blob_names] - loop_outputs = [core.BlobReference(name=b, net=None) for b in output_blob_names] - - while_inputs = [cond_blob] + loop_inputs - while_outputs = [] + loop_outputs - - do_loop_body_net = core.Net('do_loop_body_net') - - loop_input_output_names_ordered = [ - str(b) for b in (loop_inputs + loop_outputs)] - loop_body_outer_blob_names = list(input_blob_names | output_blob_names) - loop_body_outer_blob_names_idx = [ - loop_input_output_names_ordered.index(b) for b in loop_body_outer_blob_names] - - do_loop_body_workspace_blob = \ - while_net.NextScopedBlob(while_net.Name() + '/workspace_loop_body') - - loop_inputs.append(do_loop_body_workspace_blob) - loop_outputs.append(do_loop_body_workspace_blob) - # make sure that added workspace pointer blobs are in While inputs/outputs - while_inputs.append(do_loop_body_workspace_blob) - while_outputs.append(do_loop_body_workspace_blob) - - do_loop_body_net.Do( - loop_inputs, - loop_outputs, - net=loop_body_net.Proto(), - inner_blobs=loop_body_outer_blob_names, - outer_blobs_idx=loop_body_outer_blob_names_idx, - copy_external_blobs=True) - do_loop_body_net.AddExternalOutput(*loop_outputs) - - while_args = {} - while_args['loop_net'] = do_loop_body_net.Proto() - - cond_workspace_blob = None - if condition_body_net: - cond_input_blob_names, cond_output_blob_names = get_external_blob_names( - condition_body_net, lexical_scope) - - # make sure condition blob is written by condition net and is - # visible outside of it - found_condition_output = False - for op in condition_body_net.Proto().op: - if str(cond_blob) in op.output: - found_condition_output = True - break - assert found_condition_output, \ - "Condition net does not write into condition blob" - if str(cond_blob) not in cond_output_blob_names: - cond_output_blob_names.add(str(cond_blob)) - - cond_inputs = [core.BlobReference(name=b, net=None) - for b in cond_input_blob_names] - assert str(cond_blob) in cond_output_blob_names, \ - 'Condition blob expected in condition net output' - cond_outputs = [core.BlobReference(name=b, net=None) - for b in cond_output_blob_names] - - condition_net = core.Net('do_loop_condition_net') - - cond_input_output_names_ordered = [ - str(b) for b in (cond_inputs + cond_outputs)] - cond_body_outer_blob_names = \ - list(cond_input_blob_names | cond_output_blob_names) - cond_body_outer_blob_names_idx = [ - cond_input_output_names_ordered.index(b) - for b in cond_body_outer_blob_names] - - 
cond_workspace_blob = \ - while_net.NextScopedBlob(while_net.Name() + '/workspace_loop_cond') - cond_inputs.append(cond_workspace_blob) - cond_outputs.append(cond_workspace_blob) - - condition_net.Do( - cond_inputs, - cond_outputs, - net=condition_body_net.Proto(), - inner_blobs=cond_body_outer_blob_names, - outer_blobs_idx=cond_body_outer_blob_names_idx) - condition_net.AddExternalOutput(*cond_outputs) - - while_args['cond_net'] = condition_net.Proto() - - while_inputs += [b for b in cond_inputs - if str(b) not in input_blob_names] - while_outputs += [b for b in cond_outputs - if str(b) not in output_blob_names] - - if str(cond_blob) not in lexical_scope: - while_net.ConstantFill( - [], - cond_blob, - dtype=core.DataType.BOOL, - value=False) - - while_net.CreateScope([], [do_loop_body_workspace_blob]) - if cond_workspace_blob: - while_net.CreateScope([], [cond_workspace_blob]) - while_net.While(while_inputs, while_outputs, **while_args) - while_net.AddExternalOutput(*while_outputs) diff --git a/caffe2/python/control_test.py b/caffe2/python/control_test.py deleted file mode 100644 index ee47ccb4bd08..000000000000 --- a/caffe2/python/control_test.py +++ /dev/null @@ -1,331 +0,0 @@ - - - - - -from caffe2.python import control, core, test_util, workspace - -import logging -logger = logging.getLogger(__name__) - - -class TestControl(test_util.TestCase): - def setUp(self): - super().setUp() - self.N_ = 10 - - self.init_net_ = core.Net("init-net") - cnt = self.init_net_.CreateCounter([], init_count=0) - const_n = self.init_net_.ConstantFill( - [], shape=[], value=self.N_, dtype=core.DataType.INT64) - const_0 = self.init_net_.ConstantFill( - [], shape=[], value=0, dtype=core.DataType.INT64) - - self.cnt_net_ = core.Net("cnt-net") - self.cnt_net_.CountUp([cnt]) - curr_cnt = self.cnt_net_.RetrieveCount([cnt]) - self.init_net_.ConstantFill( - [], [curr_cnt], shape=[], value=0, dtype=core.DataType.INT64) - self.cnt_net_.AddExternalOutput(curr_cnt) - - self.cnt_2_net_ = core.Net("cnt-2-net") - self.cnt_2_net_.CountUp([cnt]) - self.cnt_2_net_.CountUp([cnt]) - curr_cnt_2 = self.cnt_2_net_.RetrieveCount([cnt]) - self.init_net_.ConstantFill( - [], [curr_cnt_2], shape=[], value=0, dtype=core.DataType.INT64) - self.cnt_2_net_.AddExternalOutput(curr_cnt_2) - - self.cond_net_ = core.Net("cond-net") - cond_blob = self.cond_net_.LT([curr_cnt, const_n]) - self.cond_net_.AddExternalOutput(cond_blob) - - self.not_cond_net_ = core.Net("not-cond-net") - cond_blob = self.not_cond_net_.GE([curr_cnt, const_n]) - self.not_cond_net_.AddExternalOutput(cond_blob) - - self.true_cond_net_ = core.Net("true-cond-net") - true_blob = self.true_cond_net_.LT([const_0, const_n]) - self.true_cond_net_.AddExternalOutput(true_blob) - - self.false_cond_net_ = core.Net("false-cond-net") - false_blob = self.false_cond_net_.GT([const_0, const_n]) - self.false_cond_net_.AddExternalOutput(false_blob) - - self.idle_net_ = core.Net("idle-net") - self.idle_net_.ConstantFill( - [], shape=[], value=0, dtype=core.DataType.INT64) - - def CheckNetOutput(self, nets_and_expects): - """ - Check the net output is expected - nets_and_expects is a list of tuples (net, expect) - """ - for net, expect in nets_and_expects: - output = workspace.FetchBlob( - net.Proto().external_output[-1]) - self.assertEqual(output, expect) - - def CheckNetAllOutput(self, net, expects): - """ - Check the net output is expected - expects is a list of bools. 
- """ - self.assertEqual(len(net.Proto().external_output), len(expects)) - for i in range(len(expects)): - output = workspace.FetchBlob( - net.Proto().external_output[i]) - self.assertEqual(output, expects[i]) - - def BuildAndRunPlan(self, step): - plan = core.Plan("test") - plan.AddStep(control.Do('init', self.init_net_)) - plan.AddStep(step) - self.assertEqual(workspace.RunPlan(plan), True) - - def ForLoopTest(self, nets_or_steps): - step = control.For('myFor', nets_or_steps, self.N_) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, self.N_)]) - - def testForLoopWithNets(self): - self.ForLoopTest(self.cnt_net_) - self.ForLoopTest([self.cnt_net_, self.idle_net_]) - - def testForLoopWithStep(self): - step = control.Do('count', self.cnt_net_) - self.ForLoopTest(step) - self.ForLoopTest([step, self.idle_net_]) - - def WhileLoopTest(self, nets_or_steps): - step = control.While('myWhile', self.cond_net_, nets_or_steps) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, self.N_)]) - - def testWhileLoopWithNet(self): - self.WhileLoopTest(self.cnt_net_) - self.WhileLoopTest([self.cnt_net_, self.idle_net_]) - - def testWhileLoopWithStep(self): - step = control.Do('count', self.cnt_net_) - self.WhileLoopTest(step) - self.WhileLoopTest([step, self.idle_net_]) - - def UntilLoopTest(self, nets_or_steps): - step = control.Until('myUntil', self.not_cond_net_, nets_or_steps) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, self.N_)]) - - def testUntilLoopWithNet(self): - self.UntilLoopTest(self.cnt_net_) - self.UntilLoopTest([self.cnt_net_, self.idle_net_]) - - def testUntilLoopWithStep(self): - step = control.Do('count', self.cnt_net_) - self.UntilLoopTest(step) - self.UntilLoopTest([step, self.idle_net_]) - - def DoWhileLoopTest(self, nets_or_steps): - step = control.DoWhile('myDoWhile', self.cond_net_, nets_or_steps) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, self.N_)]) - - def testDoWhileLoopWithNet(self): - self.DoWhileLoopTest(self.cnt_net_) - self.DoWhileLoopTest([self.idle_net_, self.cnt_net_]) - - def testDoWhileLoopWithStep(self): - step = control.Do('count', self.cnt_net_) - self.DoWhileLoopTest(step) - self.DoWhileLoopTest([self.idle_net_, step]) - - def DoUntilLoopTest(self, nets_or_steps): - step = control.DoUntil('myDoUntil', self.not_cond_net_, nets_or_steps) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, self.N_)]) - - def testDoUntilLoopWithNet(self): - self.DoUntilLoopTest(self.cnt_net_) - self.DoUntilLoopTest([self.cnt_net_, self.idle_net_]) - - def testDoUntilLoopWithStep(self): - step = control.Do('count', self.cnt_net_) - self.DoUntilLoopTest(step) - self.DoUntilLoopTest([self.idle_net_, step]) - - def IfCondTest(self, cond_net, expect, cond_on_blob): - if cond_on_blob: - step = control.Do( - 'if-all', - control.Do('count', cond_net), - control.If('myIf', cond_net.Proto().external_output[-1], - self.cnt_net_)) - else: - step = control.If('myIf', cond_net, self.cnt_net_) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, expect)]) - - def testIfCondTrueOnNet(self): - self.IfCondTest(self.true_cond_net_, 1, False) - - def testIfCondTrueOnBlob(self): - self.IfCondTest(self.true_cond_net_, 1, True) - - def testIfCondFalseOnNet(self): - self.IfCondTest(self.false_cond_net_, 0, False) - - def testIfCondFalseOnBlob(self): - self.IfCondTest(self.false_cond_net_, 0, True) - - def IfElseCondTest(self, cond_net, cond_value, expect, cond_on_blob): - if cond_value: - run_net = 
self.cnt_net_ - else: - run_net = self.cnt_2_net_ - if cond_on_blob: - step = control.Do( - 'if-else-all', - control.Do('count', cond_net), - control.If('myIfElse', cond_net.Proto().external_output[-1], - self.cnt_net_, self.cnt_2_net_)) - else: - step = control.If('myIfElse', cond_net, - self.cnt_net_, self.cnt_2_net_) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(run_net, expect)]) - - def testIfElseCondTrueOnNet(self): - self.IfElseCondTest(self.true_cond_net_, True, 1, False) - - def testIfElseCondTrueOnBlob(self): - self.IfElseCondTest(self.true_cond_net_, True, 1, True) - - def testIfElseCondFalseOnNet(self): - self.IfElseCondTest(self.false_cond_net_, False, 2, False) - - def testIfElseCondFalseOnBlob(self): - self.IfElseCondTest(self.false_cond_net_, False, 2, True) - - def IfNotCondTest(self, cond_net, expect, cond_on_blob): - if cond_on_blob: - step = control.Do( - 'if-not', - control.Do('count', cond_net), - control.IfNot('myIfNot', cond_net.Proto().external_output[-1], - self.cnt_net_)) - else: - step = control.IfNot('myIfNot', cond_net, self.cnt_net_) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, expect)]) - - def testIfNotCondTrueOnNet(self): - self.IfNotCondTest(self.true_cond_net_, 0, False) - - def testIfNotCondTrueOnBlob(self): - self.IfNotCondTest(self.true_cond_net_, 0, True) - - def testIfNotCondFalseOnNet(self): - self.IfNotCondTest(self.false_cond_net_, 1, False) - - def testIfNotCondFalseOnBlob(self): - self.IfNotCondTest(self.false_cond_net_, 1, True) - - def IfNotElseCondTest(self, cond_net, cond_value, expect, cond_on_blob): - if cond_value: - run_net = self.cnt_2_net_ - else: - run_net = self.cnt_net_ - if cond_on_blob: - step = control.Do( - 'if-not-else', - control.Do('count', cond_net), - control.IfNot('myIfNotElse', - cond_net.Proto().external_output[-1], - self.cnt_net_, self.cnt_2_net_)) - else: - step = control.IfNot('myIfNotElse', cond_net, - self.cnt_net_, self.cnt_2_net_) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(run_net, expect)]) - - def testIfNotElseCondTrueOnNet(self): - self.IfNotElseCondTest(self.true_cond_net_, True, 2, False) - - def testIfNotElseCondTrueOnBlob(self): - self.IfNotElseCondTest(self.true_cond_net_, True, 2, True) - - def testIfNotElseCondFalseOnNet(self): - self.IfNotElseCondTest(self.false_cond_net_, False, 1, False) - - def testIfNotElseCondFalseOnBlob(self): - self.IfNotElseCondTest(self.false_cond_net_, False, 1, True) - - def testSwitch(self): - step = control.Switch( - 'mySwitch', - (self.false_cond_net_, self.cnt_net_), - (self.true_cond_net_, self.cnt_2_net_) - ) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, 0), (self.cnt_2_net_, 2)]) - - def testSwitchNot(self): - step = control.SwitchNot( - 'mySwitchNot', - (self.false_cond_net_, self.cnt_net_), - (self.true_cond_net_, self.cnt_2_net_) - ) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(self.cnt_net_, 1), (self.cnt_2_net_, 0)]) - - def testBoolNet(self): - bool_net = control.BoolNet(('a', True)) - step = control.Do('bool', bool_net) - self.BuildAndRunPlan(step) - self.CheckNetAllOutput(bool_net, [True]) - - bool_net = control.BoolNet(('a', True), ('b', False)) - step = control.Do('bool', bool_net) - self.BuildAndRunPlan(step) - self.CheckNetAllOutput(bool_net, [True, False]) - - bool_net = control.BoolNet([('a', True), ('b', False)]) - step = control.Do('bool', bool_net) - self.BuildAndRunPlan(step) - self.CheckNetAllOutput(bool_net, [True, False]) - - def testCombineConditions(self): - # combined by 
'Or' - combine_net = control.CombineConditions( - 'test', [self.true_cond_net_, self.false_cond_net_], 'Or') - step = control.Do('combine', - self.true_cond_net_, - self.false_cond_net_, - combine_net) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(combine_net, True)]) - - # combined by 'And' - combine_net = control.CombineConditions( - 'test', [self.true_cond_net_, self.false_cond_net_], 'And') - step = control.Do('combine', - self.true_cond_net_, - self.false_cond_net_, - combine_net) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(combine_net, False)]) - - def testMergeConditionNets(self): - # merged by 'Or' - merge_net = control.MergeConditionNets( - 'test', [self.true_cond_net_, self.false_cond_net_], 'Or') - step = control.Do('merge', merge_net) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(merge_net, True)]) - - # merged by 'And' - merge_net = control.MergeConditionNets( - 'test', [self.true_cond_net_, self.false_cond_net_], 'And') - step = control.Do('merge', merge_net) - self.BuildAndRunPlan(step) - self.CheckNetOutput([(merge_net, False)]) diff --git a/caffe2/python/convert.py b/caffe2/python/convert.py deleted file mode 100644 index b3b5822c46ca..000000000000 --- a/caffe2/python/convert.py +++ /dev/null @@ -1,2 +0,0 @@ -## @package workspace -# Module caffe2.python.workspace diff --git a/caffe2/python/convert_test.py b/caffe2/python/convert_test.py deleted file mode 100644 index d9d82bf5e6c4..000000000000 --- a/caffe2/python/convert_test.py +++ /dev/null @@ -1,14 +0,0 @@ - - - - - -from caffe2.python import workspace -import unittest - -class TestOperator(unittest.TestCase): - def setUp(self): - workspace.ResetWorkspace() - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py deleted file mode 100644 index 3aac78c18df1..000000000000 --- a/caffe2/python/convnet_benchmarks.py +++ /dev/null @@ -1,727 +0,0 @@ -## @package convnet_benchmarks -# Module caffe2.python.convnet_benchmarks -""" -Benchmark for common convnets. - -Speed on Titan X, with 10 warmup steps and 10 main steps and with different -versions of cudnn, are as follows (time reported below is per-batch time, -forward / forward+backward): - - CuDNN V3 CuDNN v4 -AlexNet 32.5 / 108.0 27.4 / 90.1 -OverFeat 113.0 / 342.3 91.7 / 276.5 -Inception 134.5 / 485.8 125.7 / 450.6 -VGG (batch 64) 200.8 / 650.0 164.1 / 551.7 - -Speed on Inception with varied batch sizes and CuDNN v4 is as follows: - -Batch Size Speed per batch Speed per image - 16 22.8 / 72.7 1.43 / 4.54 - 32 38.0 / 127.5 1.19 / 3.98 - 64 67.2 / 233.6 1.05 / 3.65 -128 125.7 / 450.6 0.98 / 3.52 - -Speed on Tesla M40, which 10 warmup steps and 10 main steps and with cudnn -v4, is as follows: - -AlexNet 68.4 / 218.1 -OverFeat 210.5 / 630.3 -Inception 300.2 / 1122.2 -VGG (batch 64) 405.8 / 1327.7 - -(Note that these numbers involve a "full" backprop, i.e. the gradient -with respect to the input image is also computed.) 
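As a sanity check on the Inception table above, the per-image column is just the per-batch time divided by the batch size; for the batch-128 row:

fwd_ms, fwd_bwd_ms, bs = 125.7, 450.6, 128
print(fwd_ms / bs, fwd_bwd_ms / bs)  # ~0.98 and ~3.52 ms/image, matching the table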
- -To get the numbers, simply run: - -for MODEL in AlexNet OverFeat Inception; do - PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size 128 --model $MODEL --forward_only True -done -for MODEL in AlexNet OverFeat Inception; do - PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size 128 --model $MODEL -done -PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size 64 --model VGGA --forward_only True -PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size 64 --model VGGA - -for BS in 16 32 64 128; do - PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size $BS --model Inception --forward_only True - PYTHONPATH=../gen:$PYTHONPATH python convnet_benchmarks.py \ - --batch_size $BS --model Inception -done - -Note that VGG needs to be run at batch 64 due to memory limit on the backward -pass. -""" - -import argparse - -from caffe2.python import workspace, brew, model_helper - - -def MLP(order, cudnn_ws): - model = model_helper.ModelHelper(name="MLP") - d = 256 - depth = 20 - width = 3 - for i in range(depth): - for j in range(width): - current = "fc_{}_{}".format(i, j) if i > 0 else "data" - next_ = "fc_{}_{}".format(i + 1, j) - brew.fc( - model, - current, - next_, - dim_in=d, - dim_out=d, - weight_init=('XavierFill', {}), - bias_init=('XavierFill', {}), - ) - brew.sum( - model, ["fc_{}_{}".format(depth, j) for j in range(width)], ["sum"] - ) - brew.fc( - model, - "sum", - "last", - dim_in=d, - dim_out=1000, - weight_init=('XavierFill', {}), - bias_init=('XavierFill', {}), - ) - xent = model.net.LabelCrossEntropy(["last", "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, d - - -def AlexNet(order, cudnn_ws): - my_arg_scope = { - 'order': order, - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - } - if cudnn_ws: - my_arg_scope['ws_nbytes_limit'] = cudnn_ws - model = model_helper.ModelHelper( - name="alexnet", - arg_scope=my_arg_scope, - ) - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 64, - 11, ('XavierFill', {}), ('ConstantFill', {}), - stride=4, - pad=2 - ) - relu1 = brew.relu(model, conv1, "conv1") - pool1 = brew.max_pool(model, relu1, "pool1", kernel=3, stride=2) - conv2 = brew.conv( - model, - pool1, - "conv2", - 64, - 192, - 5, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=2 - ) - relu2 = brew.relu(model, conv2, "conv2") - pool2 = brew.max_pool(model, relu2, "pool2", kernel=3, stride=2) - conv3 = brew.conv( - model, - pool2, - "conv3", - 192, - 384, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu3 = brew.relu(model, conv3, "conv3") - conv4 = brew.conv( - model, - relu3, - "conv4", - 384, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu4 = brew.relu(model, conv4, "conv4") - conv5 = brew.conv( - model, - relu4, - "conv5", - 256, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu5 = brew.relu(model, conv5, "conv5") - pool5 = brew.max_pool(model, relu5, "pool5", kernel=3, stride=2) - fc6 = brew.fc( - model, - pool5, "fc6", 256 * 6 * 6, 4096, ('XavierFill', {}), - ('ConstantFill', {}) - ) - relu6 = brew.relu(model, fc6, "fc6") - fc7 = brew.fc( - model, relu6, "fc7", 4096, 4096, ('XavierFill', {}), ('ConstantFill', {}) - ) - relu7 = brew.relu(model, fc7, "fc7") - fc8 = brew.fc( - model, relu7, "fc8", 4096, 1000, ('XavierFill', {}), ('ConstantFill', {}) - ) - pred = brew.softmax(model, fc8, "pred") - xent = model.net.LabelCrossEntropy([pred, "label"], "xent") - 
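The head being assembled here (softmax, then per-example cross-entropy, then a scalar averaged loss) is repeated verbatim at the end of every model in this file; a hypothetical refactoring sketch, not part of the original module:

def add_classification_head(model, last_fc, label='label'):
    # softmax -> LabelCrossEntropy -> AveragedLoss, as in each model in this file
    pred = brew.softmax(model, last_fc, 'pred')
    xent = model.net.LabelCrossEntropy([pred, label], 'xent')
    return model.net.AveragedLoss(xent, 'loss')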
model.net.AveragedLoss(xent, "loss") - return model, 224 - - -def OverFeat(order, cudnn_ws): - my_arg_scope = { - 'order': order, - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - } - if cudnn_ws: - my_arg_scope['ws_nbytes_limit'] = cudnn_ws - model = model_helper.ModelHelper( - name="overfeat", - arg_scope=my_arg_scope, - ) - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 96, - 11, - ('XavierFill', {}), - ('ConstantFill', {}), - stride=4, - ) - relu1 = brew.relu(model, conv1, "conv1") - pool1 = brew.max_pool(model, relu1, "pool1", kernel=2, stride=2) - conv2 = brew.conv( - model, pool1, "conv2", 96, 256, 5, ('XavierFill', {}), - ('ConstantFill', {}) - ) - relu2 = brew.relu(model, conv2, "conv2") - pool2 = brew.max_pool(model, relu2, "pool2", kernel=2, stride=2) - conv3 = brew.conv( - model, - pool2, - "conv3", - 256, - 512, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu3 = brew.relu(model, conv3, "conv3") - conv4 = brew.conv( - model, - relu3, - "conv4", - 512, - 1024, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu4 = brew.relu(model, conv4, "conv4") - conv5 = brew.conv( - model, - relu4, - "conv5", - 1024, - 1024, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu5 = brew.relu(model, conv5, "conv5") - pool5 = brew.max_pool(model, relu5, "pool5", kernel=2, stride=2) - fc6 = brew.fc( - model, pool5, "fc6", 1024 * 6 * 6, 3072, ('XavierFill', {}), - ('ConstantFill', {}) - ) - relu6 = brew.relu(model, fc6, "fc6") - fc7 = brew.fc( - model, relu6, "fc7", 3072, 4096, ('XavierFill', {}), ('ConstantFill', {}) - ) - relu7 = brew.relu(model, fc7, "fc7") - fc8 = brew.fc( - model, relu7, "fc8", 4096, 1000, ('XavierFill', {}), ('ConstantFill', {}) - ) - pred = brew.softmax(model, fc8, "pred") - xent = model.net.LabelCrossEntropy([pred, "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, 231 - - -def VGGA(order, cudnn_ws): - my_arg_scope = { - 'order': order, - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - } - if cudnn_ws: - my_arg_scope['ws_nbytes_limit'] = cudnn_ws - model = model_helper.ModelHelper( - name="vgga", - arg_scope=my_arg_scope, - ) - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 64, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu1 = brew.relu(model, conv1, "conv1") - pool1 = brew.max_pool(model, relu1, "pool1", kernel=2, stride=2) - conv2 = brew.conv( - model, - pool1, - "conv2", - 64, - 128, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu2 = brew.relu(model, conv2, "conv2") - pool2 = brew.max_pool(model, relu2, "pool2", kernel=2, stride=2) - conv3 = brew.conv( - model, - pool2, - "conv3", - 128, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu3 = brew.relu(model, conv3, "conv3") - conv4 = brew.conv( - model, - relu3, - "conv4", - 256, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu4 = brew.relu(model, conv4, "conv4") - pool4 = brew.max_pool(model, relu4, "pool4", kernel=2, stride=2) - conv5 = brew.conv( - model, - pool4, - "conv5", - 256, - 512, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu5 = brew.relu(model, conv5, "conv5") - conv6 = brew.conv( - model, - relu5, - "conv6", - 512, - 512, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu6 = brew.relu(model, conv6, "conv6") - pool6 = brew.max_pool(model, relu6, "pool6", kernel=2, stride=2) - conv7 = brew.conv( - model, - pool6, - "conv7", - 512, - 512, - 3, - 
('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu7 = brew.relu(model, conv7, "conv7") - conv8 = brew.conv( - model, - relu7, - "conv8", - 512, - 512, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu8 = brew.relu(model, conv8, "conv8") - pool8 = brew.max_pool(model, relu8, "pool8", kernel=2, stride=2) - - fcix = brew.fc( - model, pool8, "fcix", 512 * 7 * 7, 4096, ('XavierFill', {}), - ('ConstantFill', {}) - ) - reluix = brew.relu(model, fcix, "fcix") - fcx = brew.fc( - model, reluix, "fcx", 4096, 4096, ('XavierFill', {}), - ('ConstantFill', {}) - ) - relux = brew.relu(model, fcx, "fcx") - fcxi = brew.fc( - model, relux, "fcxi", 4096, 1000, ('XavierFill', {}), - ('ConstantFill', {}) - ) - pred = brew.softmax(model, fcxi, "pred") - xent = model.net.LabelCrossEntropy([pred, "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, 231 - - -def _InceptionModule( - model, input_blob, input_depth, output_name, conv1_depth, conv3_depths, - conv5_depths, pool_depth -): - # path 1: 1x1 conv - conv1 = brew.conv( - model, input_blob, output_name + ":conv1", input_depth, conv1_depth, 1, - ('XavierFill', {}), ('ConstantFill', {}) - ) - conv1 = brew.relu(model, conv1, conv1) - # path 2: 1x1 conv + 3x3 conv - conv3_reduce = brew.conv( - model, input_blob, output_name + ":conv3_reduce", input_depth, - conv3_depths[0], 1, ('XavierFill', {}), ('ConstantFill', {}) - ) - conv3_reduce = brew.relu(model, conv3_reduce, conv3_reduce) - conv3 = brew.conv( - model, - conv3_reduce, - output_name + ":conv3", - conv3_depths[0], - conv3_depths[1], - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - conv3 = brew.relu(model, conv3, conv3) - # path 3: 1x1 conv + 5x5 conv - conv5_reduce = brew.conv( - model, input_blob, output_name + ":conv5_reduce", input_depth, - conv5_depths[0], 1, ('XavierFill', {}), ('ConstantFill', {}) - ) - conv5_reduce = brew.relu(model, conv5_reduce, conv5_reduce) - conv5 = brew.conv( - model, - conv5_reduce, - output_name + ":conv5", - conv5_depths[0], - conv5_depths[1], - 5, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=2, - ) - conv5 = brew.relu(model, conv5, conv5) - # path 4: pool + 1x1 conv - pool = brew.max_pool( - model, - input_blob, - output_name + ":pool", - kernel=3, - stride=1, - pad=1, - ) - pool_proj = brew.conv( - model, pool, output_name + ":pool_proj", input_depth, pool_depth, 1, - ('XavierFill', {}), ('ConstantFill', {}) - ) - pool_proj = brew.relu(model, pool_proj, pool_proj) - output = brew.concat(model, [conv1, conv3, conv5, pool_proj], output_name) - return output - - -def Inception(order, cudnn_ws): - my_arg_scope = { - 'order': order, - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - } - if cudnn_ws: - my_arg_scope['ws_nbytes_limit'] = cudnn_ws - model = model_helper.ModelHelper( - name="inception", - arg_scope=my_arg_scope, - ) - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 64, - 7, - ('XavierFill', {}), - ('ConstantFill', {}), - stride=2, - pad=3, - ) - relu1 = brew.relu(model, conv1, "conv1") - pool1 = brew.max_pool(model, relu1, "pool1", kernel=3, stride=2, pad=1) - conv2a = brew.conv( - model, pool1, "conv2a", 64, 64, 1, ('XavierFill', {}), - ('ConstantFill', {}) - ) - conv2a = brew.relu(model, conv2a, conv2a) - conv2 = brew.conv( - model, - conv2a, - "conv2", - 64, - 192, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1, - ) - relu2 = brew.relu(model, conv2, "conv2") - pool2 = brew.max_pool(model, relu2, "pool2", kernel=3, stride=2, pad=1) - # Inception modules - inc3 = 
_InceptionModule( - model, pool2, 192, "inc3", 64, [96, 128], [16, 32], 32 - ) - inc4 = _InceptionModule( - model, inc3, 256, "inc4", 128, [128, 192], [32, 96], 64 - ) - pool5 = brew.max_pool(model, inc4, "pool5", kernel=3, stride=2, pad=1) - inc5 = _InceptionModule( - model, pool5, 480, "inc5", 192, [96, 208], [16, 48], 64 - ) - inc6 = _InceptionModule( - model, inc5, 512, "inc6", 160, [112, 224], [24, 64], 64 - ) - inc7 = _InceptionModule( - model, inc6, 512, "inc7", 128, [128, 256], [24, 64], 64 - ) - inc8 = _InceptionModule( - model, inc7, 512, "inc8", 112, [144, 288], [32, 64], 64 - ) - inc9 = _InceptionModule( - model, inc8, 528, "inc9", 256, [160, 320], [32, 128], 128 - ) - pool9 = brew.max_pool(model, inc9, "pool9", kernel=3, stride=2, pad=1) - inc10 = _InceptionModule( - model, pool9, 832, "inc10", 256, [160, 320], [32, 128], 128 - ) - inc11 = _InceptionModule( - model, inc10, 832, "inc11", 384, [192, 384], [48, 128], 128 - ) - pool11 = brew.average_pool(model, inc11, "pool11", kernel=7, stride=1) - fc = brew.fc( - model, pool11, "fc", 1024, 1000, ('XavierFill', {}), - ('ConstantFill', {}) - ) - # It seems that Soumith's benchmark does not have softmax on top - # for Inception. We will add it anyway so we can have a proper - # backward pass. - pred = brew.softmax(model, fc, "pred") - xent = model.net.LabelCrossEntropy([pred, "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, 224 - - -def AddParameterUpdate(model): - """ Simple plain SGD update -- not tuned to actually train the models """ - ITER = brew.iter(model, "iter") - LR = model.net.LearningRate( - ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999) - ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0) - for param in model.params: - param_grad = model.param_to_grad[param] - model.net.WeightedSum([param, ONE, param_grad, LR], param) - - -def Benchmark(model_gen, arg): - model, input_size = model_gen(arg.order, arg.cudnn_ws) - model.Proto().type = arg.net_type - model.Proto().num_workers = arg.num_workers - - # In order to be able to run everything without feeding more stuff, let's - # add the data and label blobs to the parameter initialization net as well. - if arg.order == "NCHW": - input_shape = [arg.batch_size, 3, input_size, input_size] - else: - input_shape = [arg.batch_size, input_size, input_size, 3] - if arg.model == "MLP": - input_shape = [arg.batch_size, input_size] - - model.param_init_net.GaussianFill( - [], - "data", - shape=input_shape, - mean=0.0, - std=1.0 - ) - model.param_init_net.UniformIntFill( - [], - "label", - shape=[arg.batch_size, ], - min=0, - max=999 - ) - - if arg.forward_only: - print('{}: running forward only.'.format(arg.model)) - else: - print('{}: running forward-backward.'.format(arg.model)) - model.AddGradientOperators(["loss"]) - AddParameterUpdate(model) - if arg.order == 'NHWC': - print( - '==WARNING==\n' - 'NHWC order with CuDNN may not be supported yet, so I might\n' - 'exit suddenly.' - ) - - if not arg.cpu: - model.param_init_net.RunAllOnGPU() - model.net.RunAllOnGPU() - - if arg.engine: - for op in model.net.Proto().op: - op.engine = arg.engine - - if arg.dump_model: - # Writes out the pbtxt for benchmarks on e.g. 
Android - with open( - "{0}_init_batch_{1}.pbtxt".format(arg.model, arg.batch_size), "w" - ) as fid: - fid.write(str(model.param_init_net.Proto())) - with open("{0}.pbtxt".format(arg.model, arg.batch_size), "w") as fid: - fid.write(str(model.net.Proto())) - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - workspace.BenchmarkNet( - model.net.Proto().name, arg.warmup_iterations, arg.iterations, - arg.layer_wise_benchmark) - - -def GetArgumentParser(): - parser = argparse.ArgumentParser(description="Caffe2 benchmark.") - parser.add_argument( - "--batch_size", - type=int, - default=128, - help="The batch size." - ) - parser.add_argument("--model", type=str, help="The model to benchmark.") - parser.add_argument( - "--order", - type=str, - default="NCHW", - help="The order to evaluate." - ) - parser.add_argument( - "--cudnn_ws", - type=int, - help="The cudnn workspace size." - ) - parser.add_argument( - "--iterations", - type=int, - default=10, - help="Number of iterations to run the network." - ) - parser.add_argument( - "--warmup_iterations", - type=int, - default=10, - help="Number of warm-up iterations before benchmarking." - ) - parser.add_argument( - "--forward_only", - action='store_true', - help="If set, only run the forward pass." - ) - parser.add_argument( - "--layer_wise_benchmark", - action='store_true', - help="If True, run the layer-wise benchmark as well." - ) - parser.add_argument( - "--cpu", - action='store_true', - help="If True, run testing on CPU instead of GPU." - ) - parser.add_argument( - "--engine", - type=str, - default="", - help="If set, blindly prefer the given engine(s) for every op.") - parser.add_argument( - "--dump_model", - action='store_true', - help="If True, dump the model prototxts to disk." 
- ) - parser.add_argument("--net_type", type=str, default="dag") - parser.add_argument("--num_workers", type=int, default=2) - parser.add_argument("--use-nvtx", default=False, action='store_true') - parser.add_argument("--htrace_span_log_path", type=str) - return parser - - -if __name__ == '__main__': - args, extra_args = GetArgumentParser().parse_known_args() - if ( - not args.batch_size or not args.model or not args.order - ): - GetArgumentParser().print_help() - else: - workspace.GlobalInit( - ['caffe2', '--caffe2_log_level=0'] + extra_args + - (['--caffe2_use_nvtx'] if args.use_nvtx else []) + - (['--caffe2_htrace_span_log_path=' + args.htrace_span_log_path] - if args.htrace_span_log_path else [])) - - model_map = { - 'AlexNet': AlexNet, - 'OverFeat': OverFeat, - 'VGGA': VGGA, - 'Inception': Inception, - 'MLP': MLP, - } - Benchmark(model_map[args.model], args) diff --git a/caffe2/python/convnet_benchmarks_test.py b/caffe2/python/convnet_benchmarks_test.py deleted file mode 100644 index 3b60092499b2..000000000000 --- a/caffe2/python/convnet_benchmarks_test.py +++ /dev/null @@ -1,23 +0,0 @@ -import unittest -from caffe2.python import convnet_benchmarks as cb -from caffe2.python import test_util, workspace - - -# TODO: investigate why this randomly core dump in ROCM CI -@unittest.skipIf(not workspace.has_cuda_support, "no cuda gpu") -class TestConvnetBenchmarks(test_util.TestCase): - def testConvnetBenchmarks(self): - all_args = [ - '--batch_size 16 --order NCHW --iterations 1 ' - '--warmup_iterations 1', - '--batch_size 16 --order NCHW --iterations 1 ' - '--warmup_iterations 1 --forward_only', - ] - for model in [cb.AlexNet, cb.OverFeat, cb.VGGA, cb.Inception]: - for arg_str in all_args: - args = cb.GetArgumentParser().parse_args(arg_str.split(' ')) - cb.Benchmark(model, args) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/core.py b/caffe2/python/core.py deleted file mode 100644 index f04a58a24739..000000000000 --- a/caffe2/python/core.py +++ /dev/null @@ -1,3070 +0,0 @@ -## @package core -# Module caffe2.python.core - - - - - -from collections import namedtuple, OrderedDict, defaultdict -from past.builtins import basestring -from itertools import chain -from typing import Dict, Set - -from caffe2.proto import caffe2_pb2 -from caffe2.python import scope, utils, workspace -from caffe2.python.lazy import TriggerLazyImport -from caffe2.python.control_ops_grad import \ - gen_do_gradient, gen_if_gradient, gen_while_gradient, disambiguate_grad_if_op_output - -import caffe2.python._import_c_extension as C - -import copy -import pickle -import numpy as np -import sys -import traceback -import os - -# Mac os specific message -if (sys.platform == 'darwin' and 'leveldb' in C.registered_dbs()): - print('If you are using homebrew leveldb on a Mac OS, you might see an ' - 'error warning you that malloc_zone_unregister() failed. This is ' - 'not a caffe2 issue but is due to the homebrew leveldb having an ' - 'incompatible memory allocator. It does not affect usage.') - -# Convenience redirections to functions inside scope. 
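The redirections defined just below (DeviceScope, NameScope) are the usual way to prefix blob names and pin ops to a device while a net is being built; a minimal sketch, assuming a CUDA-enabled build:

from caffe2.proto import caffe2_pb2
from caffe2.python import core

with core.NameScope('tower_0'):
    with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
        net = core.Net('demo')
        net.ConstantFill([], 'x', shape=[1], value=1.0)
        # the blob is created as 'tower_0/x'; the op is pinned to GPU 0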
-DeviceScope = scope.DeviceScope -NameScope = scope.NameScope - - -# Bring datatype enums to the main namespace -class DataType: - UNDEFINED = 0 - FLOAT = 1 - INT32 = 2 - BYTE = 3 - STRING = 4 - BOOL = 5 - UINT8 = 6 - INT8 = 7 - UINT16 = 8 - INT16 = 9 - INT64 = 10 - FLOAT16 = 12 - DOUBLE = 13 - ZERO_COLLISION_HASH = 14 - REBATCHING_BUFFER = 15 - - -def _CheckDataType(): - # Verify that the DataType values defined above match the ones defined in - # the caffe2.proto file - for name, value in caffe2_pb2.TensorProto.DataType.items(): - py_value = getattr(DataType, name, None) - if py_value != value: - raise AssertionError( - f"DataType {name} does not match the value defined in " - f"caffe2.proto: {py_value} vs {value}" - ) - - -_CheckDataType() - - -def _GetRegisteredOperators(): - return set(workspace.RegisteredOperators()) - - -_REGISTERED_OPERATORS = _GetRegisteredOperators() - - -def RefreshRegisteredOperators(trigger_lazy=True): - if trigger_lazy: - TriggerLazyImport() - global _REGISTERED_OPERATORS - _REGISTERED_OPERATORS = _GetRegisteredOperators() - - -_GLOBAL_INIT_ARGS = [] - - -def GlobalInit(args): - TriggerLazyImport() - _GLOBAL_INIT_ARGS.extend(args[1:]) - C.global_init(args) - - -def GetGlobalInitArgs(): - return _GLOBAL_INIT_ARGS[:] - - -def IsOperator(op_type): - return IsOperatorWithEngine(op_type, engine='DEFAULT') - - -def IsOperatorWithEngine(op_type, engine): - TriggerLazyImport() - return C.op_registry_key(op_type, engine) in _REGISTERED_OPERATORS - - -def IsGPUDeviceType(device_type): - return device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP} - - -def DeviceOption( - device_type, - device_id=0, - random_seed=None, - node_name=None, - numa_node_id=None, - extra_info=None, -): - option = caffe2_pb2.DeviceOption() - option.device_type = device_type - option.device_id = device_id - if node_name is not None: - option.node_name = node_name - if random_seed is not None: - option.random_seed = random_seed - if numa_node_id is not None: - assert device_type == caffe2_pb2.CPU - option.numa_node_id = numa_node_id - if extra_info is not None: - option.extra_info.extend(extra_info) - return option - - -def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=True): - if not opt1 or not opt2: - return opt1 == opt2 - if not ignore_node_name and opt1.node_name != opt2.node_name: - return False - if not ignore_random_seed and opt1.random_seed != opt2.random_seed: - return False - if not opt1.device_type or not opt2.device_type: - # At least one option is for CPU, check if both are for CPU. 
- return not opt1.device_type and not opt2.device_type - - -def InferBlobDevices(net): - ''' - Compute mapping from parameters to devices by looking at the - device option of the op that creates each blob - ''' - mapping = {} - for op in net.Proto().op: - op_device = op.device_option - if op_device is None: - op_device = caffe2_pb2.DeviceOption(caffe2_pb2.CPU) - # TODO: T18892922, use device annotations - for b in op.output: - mapping[b] = op_device - return mapping - - -def InferOpBlobDevicesAsDict(op): - input_dev_list, output_dev_list = InferOpBlobDevices(op) - input_dict = { - op.input[i]: input_dev_list[i] - for i in range(len(op.input)) - } - output_dict = { - op.output[i]: output_dev_list[i] - for i in range(len(op.output)) - } - return input_dict, output_dict - - -def InferOpBlobDevices(op): - device_info = C.infer_op_input_output_device(op.SerializeToString()) - input_info = [] - output_info = [] - for dev_str in device_info[0]: - device_option = caffe2_pb2.DeviceOption() - device_option.ParseFromString(dev_str) - input_info.append(device_option) - for dev_str in device_info[1]: - device_option = caffe2_pb2.DeviceOption() - device_option.ParseFromString(dev_str) - output_info.append(device_option) - return input_info, output_info - - -def InferOpDeviceAsBlobDevices(op): - op_dev = op.device_option if op.device_option else caffe2_pb2.DeviceOption() - input_dev = [op_dev] * len(op.input) - output_dev = [op_dev] * len(op.output) - return input_dev, output_dev - - -GradientSlice = namedtuple('GradientSlice', ['indices', 'values']) - - -class BlobReference: - """A wrapper around a blob in a net. - - BlobReference gives us a way to refer to the network that the blob is - generated from. Note that blobs are, essentially, just strings in the - current workspace. - """ - - def __init__(self, name, net=None): - """Initializes a blob reference. - - Note that this does not prepend the namescope. If needed, use - ScopedBlobReference() to prepend the existing namespace. - """ - if isinstance(name, str): - self._name = name - elif isinstance(name, bytes): - self._name = name.decode('utf-8') - else: - self._name = str(name) - self._from_net = net - # meta allows helper functions to put whatever metainformation needed - # there.
- self.meta = {} - - def __hash__(self): - return hash(self._name) - - def __eq__(self, other): - if isinstance(other, str): - return self._name == other - elif isinstance(other, bytes): - return self._name == other.decode('utf-8') - elif isinstance(other, BlobReference): - return self._name == other._name - else: - return False - - def __ne__(self, other): - return not(self == other) - - def __str__(self): - return self._name - - def __repr__(self): - return 'BlobReference("{}")'.format(self._name) - - def __add__(self, other): - if not isinstance(other, str): - raise RuntimeError('Cannot add BlobReference to a non-string.') - return BlobReference(self._name + other, self._from_net) - - def __radd__(self, other): - if not isinstance(other, str): - raise RuntimeError('Cannot add a non-string to BlobReference.') - return BlobReference(other + self._name, self._from_net) - - def Net(self): - return self._from_net - - def GetNameScope(self): - return self._name[:self._name.rfind(scope._NAMESCOPE_SEPARATOR) + 1] - - def GetUnscopedName(self): - return self._name[self._name.rfind(scope._NAMESCOPE_SEPARATOR) + 1:] - - def _CreateAndAddToNet(self, op_type, inputs=None, *args, **kwargs): - """Internal function that routes the operator generation to the - network's __getattr__ function. - """ - inputs = [] if inputs is None else inputs - if isinstance(inputs, BlobReference) or isinstance(inputs, str): - inputs = [inputs] - # add self to the input list. - inputs.insert(0, self) - return self._from_net.__getattr__(op_type)(inputs, *args, **kwargs) - - def __getattr__(self, op_type): - """A wrapper allowing one to initiate operators from a blob reference. - - Example: for a blob reference b that comes from network n, doing - b.Relu(...) - is equivalent to doing - net.Relu([b], ...) - """ - if op_type.startswith('__'): - raise AttributeError('Attribute {} not found.'.format(op_type)) - if self._from_net is None: - raise AttributeError( - 'You cannot use a blob reference that does not have a net ' - 'source to create operators. Create the operator from an ' - 'explicit net object.') - if not IsOperator(op_type): - raise AttributeError( - 'Method ' + op_type + ' is not a registered operator.' + - ' Did you mean: [' + - ",".join(workspace.C.nearby_opnames(op_type)) + ']' - ) - return lambda *args, **kwargs: self._CreateAndAddToNet( - op_type, *args, **kwargs) - - def __dir__(self): - TriggerLazyImport() - additional_methods = [ - op - for op in _REGISTERED_OPERATORS - if '_ENGINE_' not in op or '_ENGINE_CUDNN' in op] - return sorted(set(chain( - dir(type(self)), - self.__dict__.keys(), - additional_methods - ))) - - -def ScopedName(name): - """prefix the name with the current scope.""" - if isinstance(name, bytes): - name = name.decode('ascii') - return scope.CurrentNameScope() + name - - -def ScopedBlobReference(name, *args, **kwargs): - """Returns a blob reference with scope prefixed.""" - return BlobReference(ScopedName(name), *args, **kwargs) - - -def _RectifyInputOutput(blobs, net=None): - """A helper function to rectify the input or output of the CreateOperator - interface. - """ - if isinstance(blobs, (bytes, str)): - # If blobs is a single string, prepend scope.CurrentNameScope() - # and put it as a list. - # TODO(jiayq): enforce using BlobReference instead of raw strings. - return [ScopedBlobReference(blobs, net=net)] - elif type(blobs) is BlobReference: - # If blob is a BlobReference, simply put it as a list. 
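Putting the __getattr__ dispatch documented above to work: calling an operator method on a BlobReference routes through the net it came from, so b.Relu() and net.Relu([b]) build the same op. A small, untested sketch:

import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob('x', np.array([-1.0, 2.0], dtype=np.float32))
net = core.Net('demo')
x = core.BlobReference('x', net)    # a reference that remembers its source net
y = x.Relu()                        # dispatched as net.Relu([x])
workspace.RunNetOnce(net)
print(workspace.FetchBlob(str(y)))  # [0. 2.]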
- return [blobs] - elif type(blobs) in (list, tuple): - # If blob is a list, we go through it and type check. - rectified = [] - for blob in blobs: - if isinstance(blob, (bytes, str)): - rectified.append(ScopedBlobReference(blob, net=net)) - elif type(blob) is BlobReference: - rectified.append(blob) - else: - raise TypeError( - "I/O blob #{} of unsupported type: {} of type {}" - .format(len(rectified), str(blob), type(blob))) - return rectified - else: - raise TypeError( - "Unknown input/output type: %s of type %s." % - (str(blobs), type(blobs)) - ) - - -def CreateOperator( - operator_type, - inputs, - outputs, - name='', - control_input=None, - device_option=None, - arg=None, - engine=None, - debug_info=None, - **kwargs -): - """A function wrapper that allows one to create operators based on the - operator type. The type should be a string corresponding to an operator - registered with Caffe2. - """ - operator = caffe2_pb2.OperatorDef() - if (os.environ.get('CAFFE2_DEBUG')): - stack = traceback.format_stack() - operator.debug_info = "".join(stack[:-1]) - - operator.type = operator_type - operator.name = name - # Add rectified inputs and outputs - inputs = _RectifyInputOutput(inputs) - outputs = _RectifyInputOutput(outputs) - operator.input.extend(map(str, inputs)) - operator.output.extend(map(str, outputs)) - if control_input: - control_input = _RectifyInputOutput(control_input) - operator.control_input.extend(map(str, control_input)) - # Set device option: - # (1) If device_option is explicitly set, use device_option. - # (2) If not, but scope.CurrentDeviceScope() is set, - # then we use scope.CurrentDeviceScope(). - # (3) Otherwise, do not set device option. - if device_option is not None: - operator.device_option.CopyFrom(device_option) - elif scope.CurrentDeviceScope() is not None: - operator.device_option.CopyFrom(scope.CurrentDeviceScope()) - if engine is not None: - operator.engine = engine - if debug_info is not None: - operator.debug_info = debug_info - # random seed is defined in the device option, so we need to take special - # care. - - if 'random_seed' in kwargs: - operator.device_option.random_seed = kwargs['random_seed'] - del kwargs['random_seed'] - # Add given arguments that do not need parsing - if arg is not None: - operator.arg.extend(arg) - # Add all other arguments - for key, value in kwargs.items(): - if value is not None: - operator.arg.add().CopyFrom(utils.MakeArgument(key, value)) - - if workspace.IsImmediate(): - workspace.RunOperatorImmediate(operator) - return operator - - -def _RegisterPythonImpl( - f, grad_f=None, python_func_type=None, pass_workspace=False -): - if python_func_type: - func = python_func_type(f) - f = func.forward - grad_f = func.backward - else: - if isinstance(f, tuple): - f = f[0](*f[1], **f[2]) - if isinstance(grad_f, tuple): - grad_f = grad_f[0](*grad_f[1], **grad_f[2]) - - token = C.register_python_op(f, pass_workspace, '') - if grad_f: - C.register_python_gradient_op(token, grad_f) - return token - - -def CreatePythonOperator( - f, inputs, - outputs, - grad_f=None, - pass_workspace=False, - python_func_type=None, - *args, - **kwargs -): - """ - `f` should have a signature (inputs, outputs) - - If `pass_workspace` is True, the signature is changed to - (inputs, outputs, workspace) where `workspace` is the workspace the op - is going to run on. This is potentially dangerous (as the op can manipulate - the workspace directly), so use at your own risk.
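A minimal sketch of the Python-op path described above; the exact tensor API on inputs/outputs (.data and .feed) is assumed here, so treat this as illustrative rather than authoritative:

import numpy as np
from caffe2.python import core, workspace

def double(inputs, outputs):
    outputs[0].feed(2 * inputs[0].data)  # numpy view in, numpy result out

workspace.FeedBlob('x', np.array([1.0, 2.0], dtype=np.float32))
op = core.CreatePythonOperator(double, ['x'], ['y'])
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob('y'))  # [2. 4.]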
- """ - kwargs["token"] = _RegisterPythonImpl( - f, grad_f, python_func_type, pass_workspace=pass_workspace - ) - return CreateOperator("Python", inputs, outputs, *args, **kwargs) - - -def GetIndexFromGradientList(g_list, name): - """A helper function to get the index from a gradient list, None if not - matching.""" - for i, g in enumerate(g_list): - if g == name: - return i - elif type(g) is GradientSlice: - if (g.indices == name or g.values == name): - return i - return None - - -OpSSA = namedtuple('OpSSA', ['op', 'in_versions', 'out_versions']) -GradGenMeta = namedtuple('GradGenMeta', - ['grad_op', 'idx', 'gradient', 'device_option']) -SparseGradGenMeta = namedtuple('SparseGradGenMeta', [ - 'grad_op_indices', 'idx_indices', - 'grad_op_values', 'idx_values', - 'gradient', 'device_option', -]) - - -class IR: - """A simple IR class to keep track of all intermediate representations used - in the gradient computation. - """ - - def __init__(self, operators): - # The IR class holds multiple metadata from the forward pass: - # a) ssa: a list of [op, in_versions, out_versions] recording the - # input and the output version of each operator, similar - # to a normal SSA form. - # b) input_usages: a dictionary specifying for each blob and - # each of its version, how many times it is used as input for another - # op. - # c) frontier: maintaining the current versions of the blobs - # we are having in the workspace, after the execution of all the ops - # added to the IR so far. This is useful because if a gradient is - # trying to access an earlier version of a blob, we can sanity check - # that it is no longer there, and thus throw an error. - # d) gradient_frontier: maps the names of blobs to its version that the - # gradient corresponds to. - # e) gradient_generators: for each blob and each of its version, maps to - # a list of operators that generates its gradient together with the - # gradient name. - self.ssa = [] - self.input_usages = defaultdict(lambda: defaultdict(list)) - self.frontier = defaultdict(int) - self.gradient_frontier = {} - self.gradient_generators = defaultdict(lambda: defaultdict(list)) - self.out_version_history = defaultdict(list) - self.in_version_history = defaultdict(list) - - for op in operators: - self.Play(op) - - self.SanityCheck(operators) - - def SanityCheck(self, operators): - # Validate StopGradient usage by checking that StopGradient's output - # is actually passed forward - for op in operators: - if op.type == 'StopGradient': - if op.output[0] not in self.input_usages: - raise ValueError("""StopGradient's output '{}' is orphan. -You typically want to specify same input and output for -StopGradient. Op:\n\n{}""".format(op.output[0], str(op))) - - def Play(self, op): - """"Adds an op to the current IR, and update the internal states to - reflect the blobs and versions after the execution of the op. - """ - # For input, they are the current version in the dict. - in_versions = {} - for s in op.input: - in_versions[s] = self.frontier[s] - self.input_usages[s][self.frontier[s]].append(len(self.ssa)) - self.in_version_history[s].append((op, self.frontier[s])) - # For output, they are the current version plus one. If this is a - # newly created blob, its version starts with zero. - out_versions = {} - for s in op.output: - if s in self.frontier: - self.frontier[s] += 1 - out_versions[s] = self.frontier[s] - self.out_version_history[s].append((op, self.frontier[s])) - # Add to SSA for bookkeeping. 
- self.ssa.append(OpSSA(op, in_versions, out_versions)) - - def CheckGradientOperatorInput( - self, grad_op_input, g_output, fwd_op_idx, locally_generated_blobs): - """Checks if the gradient operators can be correctly carried out.""" - forward_op, in_versions, out_versions = self.ssa[fwd_op_idx] - original_index = GetIndexFromGradientList(g_output, grad_op_input) - - # Functions to generate debug help for version-mismatches - def versionMismatchInfoOut(name): - s = "DEBUG HELP:\n" - s += "Maybe you use same output blob twice for different ops?\n" - s += "== Version history of blob [{}]\n".format(name) - for (op, vers) in self.out_version_history[name]: - s += "Version (out) {} <-- {}".format(vers, op) - s += "\n" - return s - - def versionMismatchInfoIn(name): - s = "DEBUG HELP:\n" - s += "Maybe the blob was overwritten by another op?\n" - s += "== Version history of blob [{}]\n".format(name) - for (op, vers) in self.in_version_history[name]: - s += "version (in) {} <-- {}".format(vers, op) - s += "\n" - return s - - # If it is a dense or sparse gradient name, it should match the - # version of the corresponding output. - if original_index is not None: - original_name = forward_op.output[original_index] - if (out_versions[original_name] != - self.gradient_frontier[original_name]): - raise RuntimeError( - 'Gradient name "%s" is expected to correspond ' - 'to version %d of "%s", but currently we have ' - 'version %d.\n\n' % ( - grad_op_input, out_versions[original_name], - original_name, - self.gradient_frontier[original_name]) + - versionMismatchInfoOut(original_name)) - # If it is an output name, the current version should match the - # version when the operator was run. - elif grad_op_input in out_versions: - if self.frontier[grad_op_input] != out_versions[grad_op_input]: - raise RuntimeError( - 'Gradient operator needs output "%s" at version' - ' %d, but currently we have version %d.\n\n' % ( - grad_op_input, out_versions[grad_op_input], - self.frontier[grad_op_input] - ) + versionMismatchInfoOut(grad_op_input) - ) - # If it is an input name, the current version should match the - # version when the operator was run. - elif grad_op_input in in_versions: - if (self.frontier[grad_op_input] != in_versions[grad_op_input]): - raise RuntimeError( - 'Gradient operator needs input "%s" at version ' - '%d, but currently we have version %d.\n\n' % ( - grad_op_input, in_versions[grad_op_input], - self.frontier[grad_op_input] - ) + versionMismatchInfoIn(grad_op_input) - ) - # If it is none of the above, it should be a blob that is - # generated locally by one of the previous gradient operators. - else: - if grad_op_input not in locally_generated_blobs: - raise RuntimeError( - 'Blob name "%s" not in the scope of operator: ' - '%s\nand is not generated by any of the local ' - 'gradient operators.' 
% (grad_op_input, str(forward_op)) - ) - - def AppendSparseGenerators(self, sparse_generators): - # merge indices and values generators for sparse gradients - for name, input_generators in sparse_generators.items(): - for version, generators in input_generators.items(): - if len(generators) == 1: - # either indices or values are generated (but not both) - generator = generators[0] - else: - # both indices and values are generated - assert(len(generators) == 2) - op1_i, idx1_i, op1_v, idx1_v, g1, dev_1 = generators[0] - op2_i, idx2_i, op2_v, idx2_v, g2, dev_2 = generators[1] - assert(g1 == g2) - assert dev_1 == dev_2, ( - "Unequal devices for sparse generators: " - "{} and {}".format(dev_1, dev_2) - ) - assert(op1_i is None or op2_i is None) - assert(op1_v is None or op2_v is None) - assert(idx1_i == 0 or idx2_i == 0) - assert(idx1_v == 0 or idx2_v == 0) - generator = SparseGradGenMeta( - op1_i or op2_i, idx1_i + idx2_i, - op1_v or op2_v, idx1_v + idx2_v, - g1, dev_1) - self.gradient_generators[name][version].append(generator) - - def BuildGradientGenerators( # NOQA - self, fwd_op_idx, gradient_ops, g_output, g_input): - """Updates gradient_generators and gradient_frontier""" - forward_op, in_versions, out_versions = self.ssa[fwd_op_idx] - locally_generated_blobs = [] - sparse_generators = defaultdict(lambda: defaultdict(list)) - - for grad_op in gradient_ops: - # (1) check that inputs are valid - for s in grad_op.input: - self.CheckGradientOperatorInput( - s, g_output, fwd_op_idx, locally_generated_blobs) - - # (2) add outputs to the locally generated blobs - # If an output corresponds to the gradient of an input, we also - # record it to gradient_generators - locally_generated_blobs.extend(map(str, grad_op.output)) - for i, output in enumerate(grad_op.output): - input_index = GetIndexFromGradientList(g_input, output) - if input_index is not None: - input_name = forward_op.input[input_index] - input_version = in_versions[input_name] - g = g_input[input_index] - if type(g) is GradientSlice: - # the output corresponds either to the indices or the - # values of the sparse gradient. In either case we - # create a (partial) SparseGradGenMeta. If necessary, - # we'll merge indices and values generators - # corresponding to the same gradient in step (3) - if g.indices == output: - m = SparseGradGenMeta( - grad_op, i, None, 0, g, grad_op.device_option) - else: - assert(g.values == output) - m = SparseGradGenMeta( - None, 0, grad_op, i, g, grad_op.device_option) - sparse_generators[input_name][input_version].append(m) - else: - self.gradient_generators[input_name][input_version] \ - .append(GradGenMeta( - grad_op, i, g, grad_op.device_option)) - - # (3) merge indices and values generators for sparse gradients, and - # add them to gradient_generators - self.AppendSparseGenerators(sparse_generators) - - # (4) for ops (e.g., Add, Sum, Sub) which have gradient outputs directly - # passed from inputs (not computed from gradient ops), we create an - # GradGenMeta with None grad_op and idx so that the gradient_generators - # knows where the gradients are coming from. This is needed for creating - # Sum op to accumulate the gradients from multiple parents. 
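End to end, the accumulation machinery above is what makes fan-out differentiate correctly: a blob consumed by two operators receives the sum of both contributions. An untested sketch (the specific ops are incidental):

import numpy as np
from caffe2.python import core, workspace

workspace.FeedBlob('x', np.ones(2, dtype=np.float32))
net = core.Net('fanout')
net.Scale('x', 'a', scale=2.0)
net.Scale('x', 'b', scale=3.0)
net.Add(['a', 'b'], 's')
net.SumElements('s', 'loss')
net.AddGradientOperators(['loss'])
workspace.RunNetOnce(net)
print(workspace.FetchBlob('x_grad'))  # [5. 5.]: 2.0 from 'a' plus 3.0 from 'b'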
- for input_index, g in enumerate(g_input): - input_name = forward_op.input[input_index] - input_version = in_versions[input_name] - if not g: - continue - if type(g) is GradientSlice: - if str(g.indices) not in locally_generated_blobs and \ - str(g.values) not in locally_generated_blobs: - self.gradient_generators[input_name][input_version].append( - SparseGradGenMeta(None, 0, None, 0, g, forward_op.device_option)) - else: - if str(g) not in locally_generated_blobs: - self.gradient_generators[input_name][input_version].append( - GradGenMeta(None, 0, g, forward_op.device_option)) - - # Finally, for the gradients specified in g_input, we update the - # gradient frontier to reflect the input versions that the gradients - # correspond to. - for i, g in enumerate(g_input): - if g is not None: - input_name = forward_op.input[i] - input_version = in_versions[input_name] - self.gradient_frontier[input_name] = input_version - - def _GetSumOpOutputName(self, generator, input_name): - def remove_suffix(s, suffix): - if s.endswith(suffix): - return s[:-len(suffix)] - return s - - for g in generator: - if type(g) is GradGenMeta: - grad_op, idx, _, _ = g - if grad_op: - return grad_op.output[idx] - else: - assert(type(g) is SparseGradGenMeta) - op_i, idx_i, op_v, idx_v, _, _ = g - if op_i: - return remove_suffix(op_i.output[idx_i], '_indices') - if op_v: - return remove_suffix(op_v.output[idx_v], '_values') - - return input_name + '_grad' - - IS_AUTO_GEN_SUM_OPS_TAG = "is_auto_gen_sum_ops" - ONLY_KEEP_IS_AUTO_GEN_SUM_OPS_TAG = "only_keep_is_auto_gen_sum_ops_tag" - - def _SetSumOpsDeviceOption(self, sum_ops, generators): - only_keep_is_auto_gen_sum_ops_tag = False - for generator in generators: - # we already checked that device options are consistent so we can just - # break after finding the first clear_info request - for extra_info in generator.device_option.extra_info: - if extra_info == "{}:1".format(IR.ONLY_KEEP_IS_AUTO_GEN_SUM_OPS_TAG): - only_keep_is_auto_gen_sum_ops_tag = True - break - - if only_keep_is_auto_gen_sum_ops_tag: - # if we find that device_option in the generator that - # requires clear the extra info for the auto gen sum - # Then we will try to clear them and only leave the - # IS_AUTO_GEN_SUM_OPS_TAG - for op in sum_ops: - op.device_option.extra_info.extend([ - "{}:1".format(IR.IS_AUTO_GEN_SUM_OPS_TAG) - ]) - else: - # we already checked that device options are consistent so we can just - # use the first one we find - for generator in generators: - for op in sum_ops: - op.device_option.CopyFrom(generator.device_option) - op.device_option.extra_info.extend([ - "{}:1".format(IR.IS_AUTO_GEN_SUM_OPS_TAG) - ]) - break - - def _DisambiguateGradOpOutput(self, grad_op, idx, cnt): - new_grad_output = ( - '_' + grad_op.output[idx] + '_autosplit_{}'.format(cnt)) - if grad_op.type == "If": - disambiguate_grad_if_op_output(grad_op, idx, new_grad_output) - else: - grad_op.output[idx] = new_grad_output - return grad_op.output[idx], cnt + 1 - - def _CheckSumOpsConflict(self, out_base_name, g): - if str(out_base_name) == str(g): - # TODO not sure what this message really means - raise RuntimeError( - 'The gradient output of empty gradient op can not ' - 'be the same as the normal name of the current ' - 'input gradient.') - - def _MakeDenseSumOps(self, generators, out_base_name): - sum_op_input = [] - cnt = 0 - - assert len(generators) > 1 - - first_grad_op = True - for generator in generators: - grad_op, idx, g, _ = generator - assert(type(g) is not GradientSlice) - if grad_op: - if 
first_grad_op: - first_grad_op = False - out = grad_op.output[idx] - else: - out, cnt = self._DisambiguateGradOpOutput(grad_op, idx, cnt) - sum_op_input.append(out) - else: - self._CheckSumOpsConflict(out_base_name, g) - sum_op_input.append(str(g)) - - if out_base_name in sum_op_input: - # Sum inplace mode works only for the first input - # So we do a swap - idx = sum_op_input.index(out_base_name) - sum_op_input[0], sum_op_input[idx] = ( - sum_op_input[idx], sum_op_input[0] - ) - sum_ops = [CreateOperator( - "Sum", - [BlobReference(x) for x in sum_op_input], - BlobReference(out_base_name))] - return sum_ops, out_base_name - - def _MakeSparseSumOps(self, generators, out_base_name): - indices_concat_input = [] - values_concat_input = [] - cnt_i = 0 - cnt_v = 0 - - for generator in generators: - assert(type(generator) is SparseGradGenMeta) - op_i, idx_i, op_v, idx_v, g, _ = generator - if op_i: - out, cnt_i = self._DisambiguateGradOpOutput(op_i, idx_i, cnt_i) - indices_concat_input.append(out) - else: - self._CheckSumOpsConflict(out_base_name, g.indices) - indices_concat_input.append(g.indices) - if op_v: - out, cnt_v = self._DisambiguateGradOpOutput(op_v, idx_v, cnt_v) - values_concat_input.append(out) - else: - self._CheckSumOpsConflict(out_base_name, g.values) - values_concat_input.append(g.values) - - indices_concat_output = out_base_name + '_indices_concat' - indices_concat_split = out_base_name + '_indices_concat_split' - values_concat_output = out_base_name + '_values_concat' - values_concat_split = out_base_name + '_values_concat_split' - # Sum the given sparse representations by simply concatenating the - # indices (resp. values) tensors together. We don't do any deduplication - # of indices at this point. This will be done as needed before the - # optimizer is called - sum_ops = [ - CreateOperator( - "Concat", - [BlobReference(x) for x in indices_concat_input], - [BlobReference(x) for x in - [indices_concat_output, indices_concat_split]], - axis=0 - ), - CreateOperator( - "Concat", - [BlobReference(x) for x in values_concat_input], - [BlobReference(x) for x in - [values_concat_output, values_concat_split]], - axis=0 - ), - ] - sum_op_output = GradientSlice( - indices=indices_concat_output, - values=values_concat_output, - ) - return sum_ops, sum_op_output - - def _MakeSumOps(self, input_name, input_version): - generators = self.gradient_generators[input_name][input_version] - out_base_name = self._GetSumOpOutputName(generators, input_name) - types = list(set(type(x) for x in generators)) - assert(len(types) == 1) - if types[0] is GradGenMeta: - sum_ops, g = self._MakeDenseSumOps(generators, out_base_name) - else: - assert(types[0] is SparseGradGenMeta) - sum_ops, g = self._MakeSparseSumOps(generators, out_base_name) - self._SetSumOpsDeviceOption(sum_ops, generators) - return sum_ops, g - - def _VerifyGradientGenerators(self, generator): - # (1) check if all gradients are of the same type. Aggregating a mix of - # sparse and dense gradients is not supported yet - if len({type(g) for g in generator}) > 1: - raise RuntimeError( - 'Automatic aggregation of a mix of sparse and dense gradients ' - 'is not supported yet') - - # If for all the operators that used the operator, none or only one - # produced the gradient, then no additional sum needs to be carried - # out. 
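# --- Illustration: the rename-then-Sum strategy implemented by
# _MakeDenseSumOps/_DisambiguateGradOpOutput above, reduced to a toy in which
# dicts stand in for OperatorDef protos; blob names are illustrative.
def accumulate_dense(grad_writers, out_name):
    sum_inputs = []
    cnt = 0
    for i, op in enumerate(grad_writers):
        if i == 0:
            sum_inputs.append(out_name)      # first writer keeps the name
        else:
            renamed = '_%s_autosplit_%d' % (out_name, cnt)
            cnt += 1
            op['output'] = [renamed]         # later writers are redirected
            sum_inputs.append(renamed)
    # A single in-place Sum accumulates all intermediates into out_name.
    return {'type': 'Sum', 'input': sum_inputs, 'output': [out_name]}

writers = [{'output': ['w_grad']} for _ in range(3)]
print(accumulate_dense(writers, 'w_grad'))
# {'type': 'Sum', 'input': ['w_grad', '_w_grad_autosplit_0',
#                           '_w_grad_autosplit_1'], 'output': ['w_grad']}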
- if len(generator) < 2: - return False - - all_gradient_names = [] - all_device_options = [] - for g in generator: - if g.device_option: - all_device_options.append(g.device_option) - if type(g) is GradGenMeta: - if g.grad_op: - all_gradient_names.append(g.gradient) - else: - assert(type(g) is SparseGradGenMeta) - if g.gradient.values: - all_gradient_names.append(g.gradient.values) - - # Check if all grad op device options are the same. - if len(all_device_options) >= 2 and not all( - device_option_equal(d, all_device_options[0]) - for d in all_device_options[1:]): - raise RuntimeError('Unexpected behavior: not all grad ops ' - 'have the same device option.') - return True - - def DoGradientAccumulation(self, fwd_op_idx): - """For each input name in the forward op, check if we will need to - add gradient accumulation. If so, do gradient accumulation and return - the list of gradient operators. - - The criteria for doing gradient accumulation is: - (1) the specific input version has been used by multiple operators. - (2) the current fwd_op_idx is the first to use that input, i.e. in the - backward pass, is the last to optionally generate the gradient for - the op. - (3) For the operators that used the input, their gradient operators - have generated more than 1 gradient. - - When accumulating operators, our current solution is to rename all the - created gradients with an internal intermediate name, and then add a - Sum() operator that adds up all the gradients. This may use more memory - due to intermediate storage, but is usually the fastest approach as one - can do one single sum for multiple intermediate gradients. - """ - forward_op, in_versions, out_versions = self.ssa[fwd_op_idx] - additional_sum_ops = [] - grad_map = {} - for _i, input_name in enumerate(set(forward_op.input)): - input_version = in_versions[input_name] - input_usage = self.input_usages[input_name][input_version] - if (len(input_usage) <= 1 or fwd_op_idx != input_usage[0]): - # We do not need to do gradient accumulation yet. - continue - generator = self.gradient_generators[input_name][input_version] - try: - if not self._VerifyGradientGenerators(generator): - continue - except RuntimeError as err: - raise RuntimeError( - "Gradients for param ''{}'' failed to verify: {}".format( - input_name, - err - ) - ) from err - - # Finally, let's create the sum operator. - sum_ops, g = self._MakeSumOps(input_name, input_version) - additional_sum_ops.extend(sum_ops) - grad_map[input_name] = g - return additional_sum_ops, grad_map - - def _AppendAutoGradGenerator(self, y, grad, autograd_op): - # Gradient here is not sparse as it was generated by - # a ConstantFill operator. Autogeneration for sparse gradients is - # not supported - generator = GradGenMeta( - autograd_op, 0 if autograd_op else None, str(grad), - autograd_op.device_option) - - self.gradient_generators[str(y)][self.frontier[str(y)]].append( - generator) - - AUTOGEN_GRAD_SUFFIX = "_autogen_grad" - - def _GetInitGradients(self, ys): - input_to_grad = {} - gradient_ops = [] - - for y, g in ys.items(): - autograd_op = None - if g is None: - autograd_op = CreateOperator( - "ConstantFill", [y], [str(y) + IR.AUTOGEN_GRAD_SUFFIX], - value=1.0) - gradient_ops.append(autograd_op) - g = autograd_op.output[0] - # Since the C++ gradient registry does not have notion of - # NameScopes, we will convert all references to strings. 
- input_to_grad[str(y)] = ( - GradientSlice(str(g[0]), str(g[1])) - if isinstance(g, GradientSlice) else str(g)) - # Autogenerated gradients are assumed to be provided for the last - # input version - if autograd_op is not None: - self._AppendAutoGradGenerator(y, g, autograd_op) - - return input_to_grad, gradient_ops - - def _GenerateGradientsForForwardOp( - self, forward_op_idx, input_to_grad): - new_input_to_grad = {} - gradient_ops = [] - forward_op, in_versions, out_versions = self.ssa[forward_op_idx] - g_output = list( - input_to_grad.get(name, None) for name in forward_op.output) - - if not all(g is None for g in g_output) or ( - forward_op.type == "ZeroGradient"): - gradient_ops, g_input = GradientRegistry.GetGradientForOp( - forward_op, g_output) - # Check if the gradient operators are legal, and update - # gradient_generators and gradient_frontier - self.BuildGradientGenerators( - forward_op_idx, gradient_ops, g_output, g_input) - # Record the gradient map to all_input_to_grad. - for name, grad in zip(forward_op.input, g_input): - # Do not overwrite an existing gradient with a None - # unless the input is also an output of the op, since - # we update the blob version when blob is output of an - # operator. - if grad is not None or \ - name not in input_to_grad or \ - name in list(forward_op.output): - new_input_to_grad[name] = grad - - return new_input_to_grad, gradient_ops - - def GetBackwardPass(self, ys): - """Gets the backward pass that computes the derivatives of given blobs. - - Inputs: - ys: a list or a dictionary specifying what blobs we want to compute - derivatives of. If the input is a list, we will automatically - generate their gradients with all-one values; if the input is a - dictionary, for any dictionary entries that are not None, we will - take the corresponding blobs as their gradients; for all those - that are None, we will auto-fill them with 1. - """ - if isinstance(ys, list): - ys = dict((y, None) for y in ys) - elif not isinstance(ys, dict): - raise TypeError("ys should either be a list or a dict.") - - # Set the gradient frontier with the initialized external - # gradients. - for y in ys.keys(): - self.gradient_frontier[y] = self.frontier[y] - self.input_usages[str(y)][self.frontier[str(y)]].append( - len(self.ssa)) - - all_input_to_grad, all_gradient_ops = self._GetInitGradients(ys) - - # (2) Now, after having the virtual play above, we now play the ops - # backwards, creating the gradients along the path. Note that although - # we are playing it backwards, we cannot refer to variables that are - # at a version older than current_versions because it is already been - # overwritten. - for forward_op_idx in reversed(range(len(self.ssa))): - input_to_grad, gradient_ops = self._GenerateGradientsForForwardOp( - forward_op_idx, all_input_to_grad) - all_input_to_grad.update(input_to_grad) - all_gradient_ops += gradient_ops - - # If there are multiple use blobs, do gradient accumulation. - additional_sum_ops, grad_map = self.DoGradientAccumulation( - forward_op_idx) - # This line is so that if in an accumulation some of the operators - # have not produced gradients, they still do not overwrite the - # general all_input_to_grad map. - all_input_to_grad.update(grad_map) - all_gradient_ops += additional_sum_ops - - # (3) Post-processing. - # After we have done computation for each op, we now have the gradient - # operators ready. For the output map, we will convert everything to - # BlobReferences for easier handling in python. 
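# --- Illustration: the reverse replay performed by GetBackwardPass above,
# reduced to a skeleton. `get_grad` is a hypothetical stand-in for
# GradientRegistry.GetGradientForOp, and ops are plain dicts, not protos.
def backward_replay(forward_ops, grad_of_output, get_grad):
    input_to_grad = dict(grad_of_output)
    gradient_ops = []
    for op in reversed(forward_ops):
        g_output = [input_to_grad.get(name) for name in op['output']]
        if all(g is None for g in g_output):
            continue              # nothing downstream needs this op's grads
        grad_ops, g_input = get_grad(op, g_output)
        gradient_ops.extend(grad_ops)
        for name, g in zip(op['input'], g_input):
            if g is not None:
                input_to_grad[name] = g
    return gradient_ops, input_to_grad

# Demo with a one-op net: the gradient of Copy is a Copy of the upstream grad.
def get_grad(op, g_output):
    gop = {'type': 'Copy', 'input': [g_output[0]],
           'output': [op['input'][0] + '_grad']}
    return [gop], [gop['output'][0]]

print(backward_replay([{'type': 'Copy', 'input': ['x'], 'output': ['y']}],
                      {'y': 'y_grad'}, get_grad))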
- all_input_to_grad_out = {} - for key, val in all_input_to_grad.items(): - if val is not None: - if isinstance(val, (bytes, str)): - grad_out = BlobReference(val) - else: - grad_out = GradientSlice(BlobReference(val[0]), - BlobReference(val[1])) - all_input_to_grad_out[BlobReference(key)] = grad_out - return all_gradient_ops, all_input_to_grad_out - - -class GradientRegistry: - """GradientRegistry holds the mapping from operators to their gradients.""" - gradient_registry_ = {} - - @classmethod - def RegisterGradient(cls, op_type): - """A decorator for registering gradient mappings.""" - - def Wrapper(func): - cls.gradient_registry_[op_type] = func - return func - - return Wrapper - - @classmethod - def _GetGradientForOpCC(cls, op_def, g_output): - # TODO(tulloch) - Propagate GradientWrapper up through the stack. - def from_untyped(grad): - if grad is None: - w = C.GradientWrapper() - assert w.is_empty() - return w - try: - (indices, values) = grad - w = C.GradientWrapper() - w.indices = indices - w.values = values - assert w.is_sparse() - return w - except ValueError: - w = C.GradientWrapper() - w.dense = grad - assert w.is_dense() - return w - - g_output = [from_untyped(grad) for grad in g_output] - grad_defs_str, g_input = C.get_gradient_defs( - op_def.SerializeToString(), g_output) - - def to_untyped(grad_wrapper): - if grad_wrapper.is_empty(): - return None - if grad_wrapper.is_sparse(): - return GradientSlice(grad_wrapper.indices, grad_wrapper.values) - assert grad_wrapper.is_dense() - return grad_wrapper.dense - - g_input = [to_untyped(grad_wrapper) for grad_wrapper in g_input] - grad_defs = [] - for grad_def_str in grad_defs_str: - grad_def = caffe2_pb2.OperatorDef() - grad_def.ParseFromString(grad_def_str) - grad_defs.append(grad_def) - return grad_defs, g_input - - @classmethod - def GetGradientForOp(cls, op, g_output): - try: - gradient_ops, g_input = cls._GetGradientForOpCC(op, g_output) - except Exception as e: - # Not supported in C++; will try python registration next. - if op.type in cls.gradient_registry_: - gradient_ops, g_input = cls.gradient_registry_[op.type]( - op, g_output - ) - else: - raise Exception( - "Exception when creating gradient for [{}]:{}.\nOp: \n{}". - format(op.type, e, str(op)) - ) from e - - if gradient_ops is None: - return [], g_input - if type(gradient_ops) is not list: - gradient_ops = [gradient_ops] - return gradient_ops, g_input - - @classmethod - def GetBackwardPass(cls, operators, ys, ys_generate_gradient=False): - """Gets the backward pass for the list of operators. - - Args: - operators: a list of operators constituting the forward pass. - ys: a list or a dictionary specifying what blobs we want to compute - derivatives of. If the input is a list, we will automatically - generate their gradients with all-one values; if the input is a - dictionary, for any dictionary entries that are not None, we'll - take the corresponding blobs as their gradients; for all those - that are None, we will auto-fill them with 1. - Returns: - gradient_ops: a list of gradient operators to run. - all_input_to_grads: a map from input to their corresponding - gradients. 
- """ - ir = IR(operators) - return ir.GetBackwardPass(ys) - - -GradientRegistry.RegisterGradient('Do')(gen_do_gradient) -GradientRegistry.RegisterGradient('If')(gen_if_gradient) -GradientRegistry.RegisterGradient('While')(gen_while_gradient) - - -def get_ssa(net, blob_versions=None): - """ - Given a net, return a structure containing the version of each input and - output blob used by each operator. - - Args: - net: either a Net or a NetDef - blob_versions: (optional) map with current version number for given - blob names. If not provided or blob not found, start - from version 0. - Returns: - Tuple (ssa, blob_versions) - ssa: list of tuples (versioned_inputs, versioned_outputs) - for each op in the net. A versioned input is a tuple - (blob_name, version). - blob_versions: updated map with latest version of each blob found in - the net. - """ - proto = net.Proto() if isinstance(net, Net) else net - assert isinstance(proto, caffe2_pb2.NetDef) - if blob_versions is None: - blob_versions = {} - if isinstance(net, list): - return [get_ssa(n, blob_versions) for n in net], blob_versions - for i in proto.external_input: - if i not in blob_versions: - blob_versions[str(i)] = 0 - ssa = [] - for op in proto.op: - if not proto.external_input: - for i in op.input: - if i not in blob_versions: - blob_versions[i] = 0 - inputs = [(str(i), blob_versions.get(str(i), 0)) for i in op.input] - for o in op.output: - blob_versions[str(o)] = blob_versions.get(str(o), 0) + 1 - outputs = [(str(o), blob_versions[str(o)]) for o in op.output] - ssa.append((inputs, outputs)) - return ssa, blob_versions - - -def get_undefined_blobs(ssa): - """ - Given a ssa in the format produced by get_ssa(), return a set of blobs that - are used before they are defined, which corresponds to inputs at version 0. - """ - undef_blobs = set() - for inputs, _outputs in ssa: - undef_blobs |= set(name for (name, ver) in inputs if ver == 0) - return undef_blobs - - -def get_output_producers(ssa): - """ - Given a ssa in the format produced by get_ssa(), returns a map from - versioned blob into the operator index that produces that version of - the blob. A versioned blob is a tuple (blob_name, version). - """ - producers = {} - for i, (_inputs, outputs) in enumerate(ssa): - for o in outputs: - producers[o] = i - return producers - - -def get_op_ids_in_path(ssa, blob_versions, inputs, outputs): - """ - Given a ssa and blob_versions as produced by get_ssa(), returns the list - of op indices that are necessary in order to generate the blobs in - `outputs`, given blobs in `inputs`. - Consider that the `inputs` are given in their latest version. - """ - inputs_set = set((str(i), blob_versions[str(i)]) for i in inputs) - producers = get_output_producers(ssa) - queue = [(str(o), blob_versions[str(o)]) for o in outputs] - used_op_ids = set() - while len(queue) > 0: - o = queue.pop() - if (o not in inputs_set) and (o in producers): - op_id = producers[o] - if op_id not in used_op_ids: - used_op_ids |= {op_id} - inputs, _ = ssa[op_id] - queue.extend(inputs) - return sorted(used_op_ids) - - -def recurrent_network_op_remap(op, prefix, blob_remap): - """ - Parameters - ---------- - op : Caffe2 operator (RecurrentNetworkOp or RecurrentNetworkGradientOp). - prefix: this argument is not used in this function, just for legacy support. - blob_remap : Dictionary that represents the map from old blob name to new. 
- - Updates blob names in arguments of RecurrentNetworkOp and - RecurrentNetworkGradientOp to conform to cloned input and output of both - operators and also makes sure names of locally generated blobs in arguments - have the same prefix as the input and output of the operators. - """ - - def get_remapped_str(blob_str): - if isinstance(blob_str, bytes): - blob_str = blob_str.decode('utf-8') - return blob_remap.get(blob_str, blob_str).encode('utf-8') - - for argument in op.arg: - if len(argument.strings) > 0: - for i in range(len(argument.strings)): - argument.strings[i] = get_remapped_str(argument.strings[i]) - elif argument.name == 'timestep': - argument.s = get_remapped_str(argument.s) - elif argument.name.endswith('step_net'): - # argument is a proto - remap_proto(argument, blob_remap) - - -def control_op_remap(op, prefix, blob_remap): - net_arg_names = [] - if op.type == "If" or op.type == "AsyncIf": - net_arg_names = ['then_net', 'else_net'] - else: - net_arg_names = ['loop_net', 'cond_net'] - for argument in op.arg: - if argument.name in net_arg_names: - assert argument.n, \ - "Expected non empty net in " + op.type + "'s " + argument.name + " argument" - subnet = Net(argument.n) - remapped_subnet = subnet.Clone( - name=(subnet._net.name if subnet._net.name else '') + '_remapped', - blob_remap=blob_remap) - argument.n.CopyFrom(remapped_subnet.Proto()) - - -DEFAULT_REMAP_FUNCS = { - 'RecurrentNetwork': recurrent_network_op_remap, - 'RecurrentNetworkGradient': recurrent_network_op_remap, - 'If': control_op_remap, - 'While': control_op_remap, - 'AsyncIf': control_op_remap, -} - - -def remap_proto(argument, blob_remap): - subnet = Net(argument.n) - - cloned_sub_net = subnet.Clone( - 'cloned_sub_net', - blob_remap, - ) - - argument.n.CopyFrom(cloned_sub_net.Proto()) - - -def clone_and_bind_net(net, name, prefix, blob_remap=None, inputs=None, - keep_schema=True): - """ - Clone the given Net, binding its input schema to the given `inputs` record. - Blob names defined by the net are prepended with the given `prefix`. - - Args: - net: the net to clone - name: the name of the new net - prefix: the prefix to append to local blobs - blob_remap: (optional) dict with additional blob name remapping. - inputs: (optional) input record that will provide actual input - values for the cloned net. Must be compatible with the - net's input schema or be a strict superset of it - keep_schema: by default (True), the original schema will be kept and - remapped accordingly. otherwise, the schema will be set as - inputs or left empty if inputs is not given. - Returns: - Tuple (cloned_net, blob_remap) - clone_net: the cloned Net - blob_remap: a map from original blob names into remapped blob names - """ - from caffe2.python import schema - assert isinstance(net, Net) - if blob_remap is None: - blob_remap = {} - if inputs is not None: - assert isinstance(inputs, schema.Field) - original = net.input_record() - assert original is not None - # TODO(azzolini): improve schema type checking - diff = set(original.field_names()) - set(inputs.field_names()) - assert len(diff) == 0, ( - "Schemas don't match, extra fields {diff} found in the net {name}. 
" - "original: {original}; inputs: {inputs}" - .format( - diff=diff, name=net.Name(), original=original.field_names(), - inputs=inputs.field_names() - ) - ) - original_mapping = dict(zip(original.field_names(), - original.field_blobs())) - for fn, fb in zip(inputs.field_names(), inputs.field_blobs()): - if fn in original_mapping: - blob_remap[str(original_mapping[fn])] = str(fb) - proto = net.Proto() - ssa, blob_versions = get_ssa(proto) - undef_blobs = get_undefined_blobs(ssa) - - for blob in blob_versions.keys(): - if blob in blob_remap: - continue - elif blob in undef_blobs: - blob_remap[blob] = blob - else: - blob_remap[blob] = prefix + blob - cloned_net = net.Clone(name, blob_remap, keep_schema=keep_schema) - if not keep_schema and inputs: - cloned_net.set_input_record(inputs) - return cloned_net, blob_remap - - -def _get_blob_ref(blob_name_or_ref): - return ( - blob_name_or_ref if isinstance(input, BlobReference) - else BlobReference(blob_name_or_ref) - ) - - -def _recover_record_by_prefix(names, prefix=''): - """ - Tries to recover record by taking a subset of blob names with - a given prefix name and interpreting them as schema column names - """ - from caffe2.python import schema - column_names = [name[len(prefix):] for name in names - if name.startswith(prefix)] - if not column_names: - return None - return schema.from_column_list( - column_names, - col_blobs=[_get_blob_ref(prefix + name) for name in column_names]) - - -class Net: - _net_names_used_counters: Dict[str, int] = {} - _net_names_used: Set[str] = set() - operator_registry_ = {} - - @staticmethod - def current_prefix(): - from caffe2.python.net_builder import NetBuilder - builder = NetBuilder.current(required=False) - return builder.name if builder else '' - - @staticmethod - def _reset_used_names() -> None: - Net._net_names_used_counters = {} - Net._net_names_used = set() - - @staticmethod - def _get_next_net_name(basename): - basename = "/".join(x for x in [Net.current_prefix(), basename] if x) - idx = Net._net_names_used_counters.get(basename, 0) - while ( - name := basename if idx == 0 else f"{basename}_{idx}" - ) in Net._net_names_used: - idx += 1 - Net._net_names_used_counters[basename] = idx + 1 - Net._net_names_used.add(name) - return name - - def __init__(self, name_or_proto, inplace=False): - """ - Create a Net. - Args: - name_or_proto: If a NetDef is provided, clone it (or take ownership, - depending on the value of `inplace`). Otherwise, - create an empty net with the given name. - inplace: If a NetDef is provided, take ownership when `inplace` is True; - otherwise, clone it. - """ - self._input_record = None - self._output_record = None - # Register blobs so that it's guaranteed that different calls to - # NextBlob/NextScopedBlob always return blobs with different names - self._registered_blob_names = set() - self._recreate_lookup_tables = False - self._op_outputs = set() - self._external_input_map = set() - self._attr_dict = defaultdict(list) - if type(name_or_proto) is caffe2_pb2.NetDef: - proto = name_or_proto - # We are initializing a network by a NetDef. In this case, we will - # initialize our network with the given netdef. - if inplace: - self._net = proto - else: - self._net = caffe2_pb2.NetDef() - self._net.CopyFrom(proto) - - existing_outputs = [list(op.output) for op in self._net.op] - - self._external_input_map.update(list(self._net.external_input)) - - # Set the next name index properly. 
- existing_names = set() - for op in self._net.op: - existing_names.update(list(op.input)) - for output in existing_outputs: - existing_names.update(output) - - for outs in existing_outputs: - self._op_outputs.update(outs) - - prefix_len = len(self._net.name + '_blob_') - autogen_indices = [] - for s in existing_names: - if s.startswith(self._net.name + '_blob_'): - try: - autogen_indices.append(int(s[prefix_len])) - except ValueError: - pass - if len(autogen_indices): - self._next_name_index = max(autogen_indices) + 1 - else: - self._next_name_index = 0 - name = self._net.name - else: - name = name_or_proto - self._net = caffe2_pb2.NetDef() - self._next_name_index = 0 - - # make sure that this net name hasn't been used before - self._net.name = Net._get_next_net_name(name) - - # a map between prefix and ID for fast generation of blob names - self._next_blob_name_ids = {} - - - def AppendNet(self, net, device_option=None): - assert isinstance(net, Net) - for i in net.Proto().external_input: - if ( - i not in self.Proto().external_input and - i not in self._op_outputs - ): - self.Proto().external_input.append(i) - - self.Proto().external_output.extend( - [ - o for o in net.Proto().external_output - if o not in self.Proto().external_output - ] - ) - ops = net.Proto().op - if device_option is not None: - ops = [copy.deepcopy(op) for op in ops] - for op in ops: - op.device_option.CopyFrom(device_option) - for op in ops: - if op.type == "RecurrentNetwork": - for arg in op.arg: - if arg.name.endswith('step_net'): - for step_op in arg.n.op: - step_op.device_option.CopyFrom(device_option) - - self._ExtendOps(ops) - return self - - def LogInfo(self, *msg_or_blobs): - for msg_or_blob in msg_or_blobs: - if not isinstance(msg_or_blob, BlobReference): - blob = self.GivenTensorStringFill( - [], self.NextName('log'), - shape=[], values=[msg_or_blob]) - else: - blob = msg_or_blob - self.Print(blob, []) - - def add_attribute(self, name, obj): - """ - Add `obj` to the list of attributes in this net under the given `name`. - Attributes are user-defined objects and have no pre-defined semantics. - """ - self._attr_dict[name].append(obj) - - def get_attributes(self, name): - """ - Returns the list of attributes in this net for a given `name`. - Attributes are user-defined objects added with `add_attribute'. - """ - return self._attr_dict.get(name, []) - - def set_rand_seed(self, seed=100, sequence_seed=True, seed_on_op_def=False): - """ - Adds a random seed to each op in the net. - If sequence_seed is set, the i-th op has rand_seed=`seed + i` - If seed_on_op_def is set, the op rand_seed=hash(str(op)) - sequence_seed and seed_on_op_def cannot be both set to True. 
- """ - assert not (sequence_seed and seed_on_op_def), ( - 'sequence_seed and seed_on_op_def cannot be both set to True.') - for i, op in enumerate(self.Proto().op): - if sequence_seed: - curr_seed = seed + i - elif seed_on_op_def: - curr_seed = hash(str(op) + str(seed)) % np.iinfo(np.uint32).max - else: - curr_seed = seed - op.device_option.random_seed = curr_seed - - def Name(self): - return self._net.name - - def __str__(self): - return self.Name() - - def Const(self, array, blob_out=None, dtype=None): - if isinstance(array, bool): - return self.ConstantFill( - [], - blob_out or 1, - dtype=DataType.BOOL, - value=array) - - if dtype is None: - array = np.array(array) - else: - array = np.array(array, dtype=dtype) - - def do_set(operator): - return operator( - [], - blob_out or 1, - shape=array.shape, - values=array.flatten().tolist()) - - if array.dtype == np.int32: - return do_set(self.GivenTensorIntFill) - elif array.dtype == np.int64: - return do_set(self.GivenTensorInt64Fill) - elif array.dtype == str: - return do_set(self.GivenTensorStringFill) - elif array.dtype == bool: - return do_set(self.GivenTensorBoolFill) - else: - return do_set(self.GivenTensorFill) - - def BlobIsDefined(self, blob): - """ - Returns true if the given BlobReference is produced as output of - an operator in this net, or if it is provided as an external input. - """ - if self._recreate_lookup_tables: - self._RecreateLookupTables() - name = str(blob) - return (name in self._op_outputs) or (name in self._external_input_map) - - def UsesBlob(self, blob): - """ - Returns true iff the given BlobReference is used by any operator - or this net, or if it is one of the external inputs of the net. - """ - blob_name = str(blob) - for op in self._net.op: - for input in op.input: - if input == blob_name: - return True - return blob_name in self._external_input_map - - def UsedBlobNames(self): - """ - Returns a set of blob names used in the net - """ - blob_names = set() - for op in self._net.op: - blob_names |= set(op.input) - blob_names |= set(op.output) - if self._net.external_input: - blob_names |= set(self._net.external_input) - if self._net.external_output: - blob_names |= set(self._net.external_output) - return blob_names - - def GetBlobRef(self, blob_name): - """ - Given the name of a blob produced by this net, return a BlobReference - to it. If the blob is not produced by any op in this net, - raises KeyError. - """ - blob_name = str(blob_name) - if not self.BlobIsDefined(blob_name): - raise KeyError('Net does not define blob %s' % blob_name) - return BlobReference(blob_name, self) - - def Clone( - self, - name, - blob_remap=None, - op_id_mask=None, - remap_funcs=None, - keep_schema=True, - update_external_list=False, - ): - """ - Clone this net. - Args: - name: name of the cloned net - blob_remap: optional map with list of blob names to replace - op_id_mask: optional list of operator indices to include in - the cloned net. If not provided, all ops are included. - """ - orig_remap_funcs = {} if remap_funcs is None else remap_funcs - # by default we want to put RecurrentNetworkOp and - # RecurrentNetworkGradientOp into remap_funcs, as these two operators - # also take blobs and proto into the arguments. 
- remap_funcs = DEFAULT_REMAP_FUNCS.copy() - remap_funcs.update(orig_remap_funcs) - proto = self._net - new_proto = caffe2_pb2.NetDef() - new_proto.CopyFrom(proto) - new_proto.name = name - - if blob_remap is None: - blob_remap = {} - if op_id_mask is None: - op_id_mask = list(range(0, len(proto.op))) - - def get_remapped_str(blob): - blob_str = str(blob) - return str(blob_remap.get(blob_str, blob_str)) - - def remap_list(proto_list): - new_list = [get_remapped_str(b) for b in proto_list] - del proto_list[:] - proto_list.extend(new_list) - - def remap_op(op): - new_op = caffe2_pb2.OperatorDef() - new_op.CopyFrom(op) - remap_list(new_op.input) - remap_list(new_op.output) - if new_op.type in remap_funcs: - remap_funcs[new_op.type]( - new_op, - (name + '/') if name else '', - blob_remap, - ) - return new_op - - del new_proto.op[:] - new_proto.op.extend([remap_op(proto.op[op_id]) for op_id in op_id_mask]) - remap_list(new_proto.external_input) - remap_list(new_proto.external_output) - new_net = Net(new_proto) - - if keep_schema: - from caffe2.python import schema - if self._input_record: - new_net._input_record = schema.from_blob_list( - self._input_record, - [ - BlobReference(get_remapped_str(blob), net=new_net) - for blob in self._input_record.field_blobs() - ], - ) - if self._output_record: - new_net._output_record = schema.from_blob_list( - self._output_record, - [ - BlobReference(get_remapped_str(blob), net=new_net) - for blob in self._output_record.field_blobs() - ], - ) - - new_net._attr_dict.update(self._attr_dict) - if update_external_list: - # external input list - existing_outputs = set() - used_outputs = set() - del new_net.Proto().external_input[:] - del new_net.Proto().external_output[:] - for op in new_net.Proto().op: - for ib in op.input: - if ib not in existing_outputs: - new_net.Proto().external_input.extend([ib]) - else: - used_outputs.add(ib) - for ob in op.output: - existing_outputs.add(ob) - # external outputs - for ob in existing_outputs: - if ob not in used_outputs: - new_net.Proto().external_output.extend([ob]) - return new_net - - def ClonePartial(self, name, inputs, outputs, remap_funcs=None): - """ - Clone this net, including only ops that are necessary in order to - compute `outputs` given `inputs`. Return references to the cloned - outputs. Internal blobs (blobs that are produced and consumed inside - the net but not used as outputs) will be remapped to avoid name - conflict. - - Args: - name: the name of the cloned net - inputs: map where the keys correspond to BlobReferences in the - original net, and the values correspond to external inputs - in the partially cloned net. If `inputs` is a list, don't - remap input names. - outputs: outputs to be produced by the cloned net. - - Returns: - Tuple (new_net, new_outputs) - new_net: a new Net object. - new_outputs: list of BlobReferences corresponding to the - outputs produced by new_net. 
- """ - input_is_pair_list = isinstance(inputs, list) and all( - isinstance(i, tuple) and len(i) == 2 for i in inputs) - inputs = ( - inputs if isinstance(inputs, (dict, OrderedDict)) else - OrderedDict(inputs) if input_is_pair_list else - OrderedDict(zip(inputs, inputs))) - for output in outputs: - assert self.BlobIsDefined(output), "{} is not defined".format(output) - input_names = {str(k): str(v) for k, v in inputs.items()} - output_names = [str(o) for o in outputs] - proto = self._net - blob_versions = {str(i): 0 for i in inputs} - ssa, blob_versions = get_ssa(proto, blob_versions) - used_op_ids = get_op_ids_in_path(ssa, blob_versions, inputs, outputs) - disallowed_op_ids = get_op_ids_in_path(ssa, blob_versions, [], inputs) - assert len(set(used_op_ids) & set(disallowed_op_ids)) == 0, ( - 'Cannot partially clone net: some of the ops required would ' + - 'generate the given input.') - - sub_ssa = [op for i, op in enumerate(ssa) if i in used_op_ids] - undef_blobs = get_undefined_blobs(sub_ssa) - set(input_names.keys()) - prefix = (name + '/') if name else '' - - def remap(blob_name): - if blob_name in input_names: - return input_names[blob_name] - elif blob_name in undef_blobs: - return blob_name - else: - return prefix + blob_name - - blob_mapping = {b: remap(b) for b in blob_versions.keys()} - new_net = self.Clone(name, blob_mapping, used_op_ids, remap_funcs) - new_in = [ - blob_mapping[i] for i in input_names.keys()] + list(undef_blobs) - new_out = [blob_mapping[o] for o in output_names] - del new_net.Proto().external_input[:] - new_net.Proto().external_input.extend(new_in) - new_net._external_input_map = set(list(new_in)) - del new_net.Proto().external_output[:] - new_net.Proto().external_output.extend(new_out) - return new_net, [new_net.GetBlobRef(o) for o in new_out] - - def Proto(self): - self._InvalidateLookupTables() - return self._net - - def insert_op_at_idx(self, op, op_idx): - r""" inserting operator at index. Will update external blob list. - """ - assert op_idx >= 0 - temp_ops = self.Proto().op[op_idx:] - del self.Proto().op[op_idx:] - self.Proto().op.extend([op]) - self.Proto().op.extend(temp_ops) - self.external_outputs.extend(op.output) - self.external_inputs.extend(op.input) - - def reroute_tensor(self, tensor, new_producer, can_modify=None): - r""" reroute tensor to new_producer. And feed new tensor to consumers - and interseciton with can_modify if provided. - Inputs: - tensor: str or blob_reference the tensor to reroute - new_producer: an op takes in tensor gives new_tesnor - can_modify: a list/set of operators that consumes tensor and can be - modified - - Returns: - reroute_cnt: how many consumer op has been changed - - Note: assume no inplace blob in net - """ - def _find_tensor_input_op(tensor): - if tensor in self.external_inputs: - op_idx = -1 - else: - assert tensor in new_producer.input, \ - "new producer {} is not taking in {}".format( - new_producer.type, tensor) - # assuming that the net has no inplace blob - op_idx = -2 - for index, op in enumerate(self.Proto().op): - if_found = False - for o in op.output: - if o == tensor: - # tensor should not be modified yet. 
- if_found = True - op_idx = index - break - if if_found: - break - return op_idx - - # the place to inject new_producer is not just determined by tensor - op_idx = max(_find_tensor_input_op(t) for t in new_producer.input) - self.insert_op_at_idx(new_producer, op_idx + 1) - new_tensor = new_producer.output[0] - # modify external outputs - if tensor in self.external_outputs: - new_list = [new_tensor if b == tensor else b for b in self.external_outputs] - del self.Proto().external_output[:] - self.Proto().external_output.extend(new_list) - - # modify consumers - reroute_cnt = 0 - if can_modify: - for op in self.Proto().op: - if op in can_modify: # this is not necessarily true - remap_input(op, {tensor: new_tensor}) - reroute_cnt = reroute_cnt + 1 - return reroute_cnt - - def PopulateProtoWithFileName(self): - net_tb = workspace.operator_tracebacks.get(self.Name(), None) - if net_tb is not None: - for idx, op in enumerate(self.Proto().op): - if idx in net_tb: - op.name = ':'.join(map(str, net_tb[idx][0])) - - def NextScopedBlob(self, prefix='unnamed'): - """Return the blob that has not been defined or registered in the - current net. It returns `ScopedBlobReference(prefix)`, if it's valid, - otherwise `ScopedBlobReference(prefix) + '_auto_' + ?`. Different calls - is guaranteed to return blob with different names. - """ - output_blob_base = ScopedName(prefix) - return self.NextBlob(output_blob_base) - - def NextBlob(self, prefix='unnamed'): - """Return the blob that has not been defined or registered in the - current net. It returns `BlobReference(prefix)`, if it's valid, - otherwise `BlobReference(prefix) + '_auto_' + ?`. Different calls - is guaranteed to return blob with different names.""" - output_blob_base = BlobReference(prefix) - output_blob = output_blob_base - index = 0 - while str(output_blob) in self._registered_blob_names or ( - self.BlobIsDefined(output_blob)): - output_blob = output_blob_base + '_auto_' + str(index) - index += 1 - - self._registered_blob_names.add(str(output_blob)) - return output_blob - - def NextName(self, prefix=None, output_id=None): - """Returns the next name to be used, if you do not want to explicitly - name your blob. [Deprecated, use NextBlob, NextScopedBlob instead]""" - if prefix: - output_name_base = self._net.name + '/' + prefix - output_name = output_name_base - if output_id is not None: - output_name += ':' + str(output_id) - key = output_name - index = self._next_blob_name_ids.get(key, 2) - while self.BlobIsDefined(str(ScopedBlobReference(output_name))): - output_name = output_name_base + '_' + str(index) - if output_id is not None: - output_name += ':' + str(output_id) - index += 1 - self._next_blob_name_ids[key] = index - else: - output_name = self._net.name + '_blob_' + str(self._next_name_index) - self._next_name_index += 1 - return str(output_name) - - def _ExtendOps(self, new_ops): - self._net.op.extend(new_ops) - for op in new_ops: - self._op_outputs.update([str(o) for o in op.output]) - - def _CheckLookupTables(self): - ''' - Called from unit tests to validate the internal lookup tables - match the protobuf contents. 
- ''' - test_op_outputs = set() - for op in self._net.op: - for o in op.output: - test_op_outputs.add(o) - - test_external_inp = set() - for inp in self._net.external_input: - test_external_inp.add(inp) - - assert test_op_outputs.difference(self._op_outputs) == set() - assert test_external_inp.difference(self._external_input_map) == set() - - def _InvalidateLookupTables(self): - self._recreate_lookup_tables = True - - def _RecreateLookupTables(self): - self._op_outputs = {o for op in self._net.op for o in op.output} - self._external_input_map = {inp for inp in self._net.external_input} - self._recreate_lookup_tables = False - - def AddGradientOperators(self, ys, skip=0): - """Add the gradient for operators in the net. - - Inputs: - ys: a list or a dictionary specifying what blobs we want to compute - derivatives of. If the input is a list, we will automatically - generate their gradients with all-one values; if the input is a - dictionary, for any dictionary entries that are not None, we will - take the corresponding blobs as their gradients; for all those - that are None, we will auto-fill them with 1. - skip: skips the first n operators. This is provided mainly because a - lot of nets may use the first few operators for data generation - like stuff which really do not need to have gradients. - - Outputs: - returns a map from the blob name in the input network to a blob - containing gradient or a GradientSlice in case of sparse gradient - - Currently, this is hard-coded for float operators if there are branches - (i.e. a blob is used as input to multiple operators). This is because - the gradient accumulation (Sum) is float only right now. - """ - - grad_ops, input_to_grad = GradientRegistry.GetBackwardPass( - self._net.op[skip:], ys) - # Check if in immediate mode: the grad_ops are actually being produced - # by C++ and bypasses the CreateOperator() call, so in immediate mode - # we will have to explicitly run them. 
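# --- Illustration: a typical call into AddGradientOperators, assuming a
# caffe2 build is importable. The net follows the classic linear-regression
# tutorial pattern; blob names and shapes are illustrative only.
from caffe2.python import core, workspace
import numpy as np

workspace.FeedBlob("x", np.random.randn(4, 3).astype(np.float32))
workspace.FeedBlob("w", np.random.randn(1, 3).astype(np.float32))
workspace.FeedBlob("b", np.zeros(1, dtype=np.float32))
workspace.FeedBlob("y_gt", np.random.randn(4, 1).astype(np.float32))

net = core.Net("lin")
y = net.FC(["x", "w", "b"], "y")
dist = net.SquaredL2Distance([y, "y_gt"], "dist")
loss = dist.AveragedLoss([], ["loss"])
# Maps each input blob to its gradient blob (or a GradientSlice when sparse).
grad_map = {str(k): v for k, v in net.AddGradientOperators([loss]).items()}
workspace.RunNetOnce(net)
print(workspace.FetchBlob(str(grad_map["w"])))   # dense gradient for w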
- if workspace.IsImmediate(): - for op in grad_ops: - workspace.RunOperatorImmediate(op) - self._ExtendOps(grad_ops) - return input_to_grad - - def AddArgument(self, arg_name, arg_value): - self._net.arg.extend([utils.MakeArgument(arg_name, arg_value)]) - - def AddExternalInput(self, *inputs): - assert len(inputs) > 0 - refs = [] - input_name_set = set() - for input in inputs: - input_name = str(input) - assert ( - input_name not in self._external_input_map - and input_name not in input_name_set - ), ("Net already contains an input named %s" % input_name) - input_name_set.add(input_name) - for input in inputs: - input_name = str(input) - self._net.external_input.extend([input_name]) - self._external_input_map.update([input_name]) - refs.append(_get_blob_ref(input_name)) - - return refs[0] if len(refs) == 1 else refs - - def AddExternalOutput(self, *outputs): - for output in outputs: - assert isinstance(output, BlobReference) - assert self.BlobIsDefined(output), "{} is not defined".format(output) - for output in outputs: - self.Proto().external_output.extend([str(output)]) - - def AddScopedExternalInputs(self, *inputs): - res = self.AddExternalInput( - * [ScopedBlobReference(b) for b in inputs] - ) - if not isinstance(res, list): - res = [res] - return res - - def AddScopedExternalOutputs(self, *outputs): - return self.AddExternalOutput( - * [ScopedBlobReference(b) for b in outputs] - ) - - # This returns a reference to the observer - def AddObserver(self, observer_type): - return C.add_observer_to_net(self._net.name, observer_type) - - def RemoveObserver(self, observer): - C.remove_observer_from_net(self._net.name, observer) - - def NumObservers(self): - return C.num_observers_on_net(self._net.name) - - @property - def external_inputs(self): - return [_get_blob_ref(x) for x in self._net.external_input] - - @property - def external_outputs(self): - return [_get_blob_ref(x) for x in self._net.external_output] - - def set_input_record(self, input_record): - from caffe2.python import schema - assert self._input_record is None or (input_record.has_blobs() and - set(input_record.field_blobs()) == - set(self._input_record.field_blobs())), ( - 'Input schema cannot be reset') - if not input_record.has_blobs(): - with NameScope(self.Name()): - self._input_record = schema.NewRecord(self, input_record) - else: - self._input_record = input_record - - for blob in self._input_record.field_blobs(): - if not self.is_external_input(blob): - self.AddExternalInput(blob) - return self._input_record - - def recover_input_record_by_prefix(self, prefix): - """ - Tries to recover input record by taking a subset of external_inputs with - a given prefix name and interpreting them as schema column names - """ - record = _recover_record_by_prefix(self._net.external_input, prefix) - if record: - self.set_input_record(record) - - def set_output_record(self, record): - assert self._output_record is None or (record.has_blobs() and - set(record.field_blobs()) == - set(self._output_record.field_blobs())), ( - 'Output schema cannot be reset') - for blob in record.field_blobs(): - assert self.BlobIsDefined(blob), "{} is not defined in net {}".format( - blob, - self.Proto() - ) - for blob in record.field_blobs(): - if blob not in self.external_outputs: - self.AddExternalOutput(blob) - self._output_record = record - - def recover_output_record_by_prefix(self, prefix): - """ - Tries to recover out record by taking a subset of external_outputs with - a given prefix name and interpreting them as schema column names - """ - record 
= _recover_record_by_prefix(self._net.external_output, prefix) - if record: - self.set_output_record(record) - - def AppendOutputRecordField(self, field_name, record): - from caffe2.python import schema - assert self._output_record is not None, ( - 'Tried to append to missing output record' - ) - for blob in record.field_blobs(): - assert self.BlobIsDefined(blob), "{} is not defined".format(blob) - for blob in record.field_blobs(): - self.AddExternalOutput(blob) - self._output_record = self._output_record + schema.Struct( - (field_name, record) - ) - - def input_record(self): - return self._input_record - - def output_record(self): - return self._output_record - - def AddExternalInputs(self, *inputs): - return self.AddExternalInput(*inputs) - - def AddExternalOutputs(self, *outputs): - self.AddExternalOutput(*outputs) - - def DeduplicateGradientSlices(self, g, aggregator='sum'): - assert isinstance(g, GradientSlice) - unique, remapping = self.Unique([g.indices], 2, engine='SparseHash') - if aggregator.lower() == 'sum': - new_g = self.UnsortedSegmentSum([g.values, remapping], 1) - elif aggregator.lower() == 'mean': - new_g = self.UnsortedSegmentMean([g.values, remapping], 1) - else: - raise ValueError('{} is not supported'.format(aggregator)) - return GradientSlice(indices=unique, values=new_g) - - @staticmethod - def _RunAllOnGPU(net, gpu_id=0, use_cudnn=False): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = gpu_id - net.device_option.CopyFrom(device_option) - if use_cudnn: - for op in net.op: - op.engine = "CUDNN" - # Move RecurrentNetwork operators on GPU as well - for op in net.op: - if op.type != "RecurrentNetwork": - continue - for arg in op.arg: - if arg.name == "step_net": - Net._RunAllOnGPU(arg.n, gpu_id, use_cudnn) - - def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): - """A convenient function to run everything on the GPU.""" - self._RunAllOnGPU(self._net, gpu_id, use_cudnn) - - - - def RunAllOnMKL(self): - """A convenient function to run everything using MKLDNN.""" - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.MKLDNN - self._net.device_option.CopyFrom(device_option) - - def RunAllOnIDEEP(self): - """A convenient function to run everything using IDEEP.""" - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.IDEEP - self._net.device_option.CopyFrom(device_option) - - def _CreateAndAddToSelf(self, op_type, inputs, outputs=None, **kwargs): - """A helper function to create an operator and add it to self. - """ - inputs = _RectifyInputOutput(inputs) - for input in inputs: - if not self.BlobIsDefined(input): - assert input.Net() != self - self.AddExternalInput(input) - if outputs is None: - # If we do not specify an output, we will assume that this op - # produces one output in this case. - outputs = self.NextName(prefix=op_type) - elif type(outputs) is int: - # In this case, we will auto-fill the given number of outputs - # with auto-generated names. 
-            outputs = [
-                self.NextName(prefix=op_type, output_id=i)
-                for i in range(outputs)]
-        outputs = _RectifyInputOutput(outputs, net=self)
-        op = CreateOperator(op_type, inputs, outputs, **kwargs)
-        self._ExtendOps([op])
-
-        workspace.operator_tracebacks[self.Name()][
-            len(self._net.op) - 1] = _extract_stacktrace()
-
-        if len(op.output) == 0:
-            return
-        elif len(op.output) == 1:
-            return BlobReference(op.output[0], self)
-        else:
-            return tuple(BlobReference(o, self) for o in op.output)
-
-    def __getattr__(self, op_type):
-        if op_type.startswith('__'):
-            raise AttributeError('Attribute {} not found.'.format(op_type))
-        if not IsOperator(op_type) and not IsOperatorWithEngine(op_type, "CUDNN"):
-            raise AttributeError(
-                'Method ' + op_type + ' is not a registered operator.' +
-                ' Did you mean: [' +
-                ",".join(workspace.C.nearby_opnames(op_type)) + ']'
-            )
-        return lambda *args, **kwargs: self._CreateAndAddToSelf(
-            op_type, *args, **kwargs)
-
-    def __dir__(self):
-        TriggerLazyImport()
-        additional_methods = [
-            op
-            for op in _REGISTERED_OPERATORS
-            if '_ENGINE_' not in op]
-        return sorted(set(chain(
-            dir(type(self)),
-            self.__dict__.keys(),
-            additional_methods
-        )))
-
-    def Python(
-        self,
-        f,
-        grad_f=None,
-        python_func_type=None,
-        pass_workspace=False,
-        grad_output_indices=None,
-        grad_input_indices=None
-    ):
-        """
-        Registers and returns a python operator.
-
-        `f` and `grad_f` can be one of the following:
-            - a function with signature (inputs, outputs), where inputs and
-              outputs are a list of CPUTensor objects. This function will be
-              called from C++ every time the operator is executed.
-            - a tuple (func, args, kwargs), where `func` is a callable, args
-              is an argument list, and kwargs is a keyword-argument dict. The
-              call:
-                  f = func(*args, **kwargs)
-              will be performed locally at node initialization time, on all of
-              the nodes of the job, returning `f`, a callable that will be used
-              as the python operator function to be called during Net execution.
-              This is to be used when using python operators in a distributed
-              context, and allows one to create and keep local python state
-              across calls to the operator.
-
-        `python_func_type` is a type of an object that is constructed as
-        python_func_type(f) and provides an implementation of forward and
-        backward functions. It's useful in cases where users need a
-        stateful PythonOp (e.g. use autograd for computing grad_f).
-
-        If `pass_workspace` is True, the signature is changed to
-        (inputs, outputs, workspace) where `workspace` is the workspace the op
-        is going to run on. This is potentially dangerous (as the op can
-        manipulate the workspace directly), so use at your own risk.
-
-        If a gradient function is specified (`grad_f`), by default its inputs
-        will be: (1) all inputs to `f`, (2) followed by all outputs of `f`, (3)
-        and then all gradient outputs of `f`. The outputs of `grad_f` will be
-        (by default) all gradient inputs to `f`. If a subset of the gradient
-        outputs or gradient inputs is desired instead, then the subsets can be
-        specified by providing `grad_output_indices` and/or `grad_input_indices`
-        which identify the indices of `f`'s inputs and outputs which have
-        gradients.
-        """
-        assert(IsOperator('Python'))
-
-        def make_builder(t):
-            if not isinstance(t, tuple):
-                return ''
-            assert len(t) == 3, 'Expected builder tuple (func, args, kwargs)'
-            func, args, kwargs = t
-            normalized = (func, tuple(args), dict(kwargs))
-            return pickle.dumps(normalized)
-
-        f_builder = make_builder(f)
-        grad_f_builder = make_builder(grad_f)
-
-        assert (not grad_f) or ((not f_builder) == (not grad_f_builder)), (
-            'A tuple has to be passed to both f and grad_f or neither.')
-
-        core_kwargs = {}
-        if f_builder:
-            core_kwargs['pickled_builder'] = f_builder
-            core_kwargs['pickled_grad_builder'] = grad_f_builder
-            core_kwargs['pass_workspace'] = pass_workspace
-        else:
-            core_kwargs['token'] = _RegisterPythonImpl(
-                f, grad_f, python_func_type, pass_workspace=pass_workspace)
-
-        grad_output_indices = grad_output_indices or []
-        grad_input_indices = grad_input_indices or []
-        return lambda *args, **kwargs: self._CreateAndAddToSelf(
-            'Python',
-            grad_output_indices=grad_output_indices,
-            grad_input_indices=grad_input_indices,
-            *args,
-            **dict(chain(kwargs.items(), core_kwargs.items()))
-        )
-
-    def is_external_input(self, blob):
-        if self._recreate_lookup_tables:
-            self._RecreateLookupTables()
-
-        name = str(blob)
-        return name in self._external_input_map
-
-    def extend_ops(self, new_ops):
-        return self._ExtendOps(new_ops)
-
-
-def remap_input(op, blob_name_remapping):
-    new_list = [blob_name_remapping.get(b, b) for b in op.input]
-    del op.input[:]
-    op.input.extend(new_list)
-
-
-def copy_func_between_devices(src, dst):
-    CPU = caffe2_pb2.CPU
-    is_src_gpu = IsGPUDeviceType(src.device_type)
-    is_dst_gpu = IsGPUDeviceType(dst.device_type)
-
-    if src.device_type == CPU and dst.device_type == CPU:
-        return None
-
-    if is_src_gpu and is_dst_gpu:
-        if src.device_id == dst.device_id:
-            return None
-        else:
-            def fun(net, *args, **kw):
-                with DeviceScope(dst):
-                    return net.Copy(*args, **kw)
-            return fun
-
-    if is_src_gpu and dst.device_type == CPU:
-        def fun(net, *args, **kw):
-            with DeviceScope(src):
-                return net.CopyGPUToCPU(*args, **kw)
-        return fun
-
-    if src.device_type == CPU and is_dst_gpu:
-        def fun(net, *args, **kw):
-            with DeviceScope(dst):
-                return net.CopyCPUToGPU(*args, **kw)
-        return fun
-
-    raise ValueError('Non-supported devices: %s and %s' % (src, dst))
-
-
-def device_equal(src, dst):
-    '''
-    We are using this function instead of the == operator because
-    optional-value comparison between an empty device_option and
-    {device_type:0, device_id:0} returns not equal in some cases.
-    '''
-    return src.device_type == dst.device_type and src.device_id == dst.device_id
-
-
-def update_placeholder_op_output(op, blob_to_device):
-    '''
-    Placeholder ops (e.g. Recv) always run on CPU, so ensure that their
-    output blobs reside on CPU.
-    '''
-    outputs = []
-    for output in op.output:
-        if (output in blob_to_device and
-                blob_to_device[output].device_type != caffe2_pb2.CPU):
-            output += '_cpu'
-        outputs.append(output)
-    del op.output[:]
-    op.output.extend(outputs)
-
-
-class RemapEntry:
-    def __init__(self, blob, device):
-        self.blob = blob
-        self.device = device
-
-    def __eq__(self, other):
-        return self.blob == other.blob and self.device == other.device
-
-    def __hash__(self):
-        return hash(self.blob + str(self.device))
-
-
-def InjectCrossDeviceCopies(net, blob_to_device=None, blob_remap=None,
-                            placeHolderOps=None):
-    '''
-    Injects Copy functions between devices within a net. Users can provide
-    a net with part of operators using different device_options.
This method - will automatically create a new net with Copy ops inserted in it. - - Inputs: - blob_to_device: If not None, it is a map of blobs and their device locations. - blob_remap: If not None, it is a map from a pair (blob, device) to - the name of the blob in the given device. Blobs found in this - map are assumed to be cached and don't need to be copied. - Outputs: - new_net: A new net with CopyCPUToGPU inserted with correct device option - - required_external_to_device: - A mapping between unresolved external inputs and their - required device options. - Assumptions: - 1. every external inputs of this net is already in blob_to_device! - 2. if not, this function will use net device option - 3. InferOpBlobDevices might fail to get the correct inference for ops like - EnsureCPUOutput that could take in input from multiple places. - ''' - new_net = net.Clone(net._net.name + '_cross_device', keep_schema=True) - del new_net._net.op[:] - if blob_to_device is None: - blob_to_device = {} - # remapping of input blobs for each op. - if blob_remap is None: - blob_remap = {} - temp_remap = {} - net_option = net._net.device_option or caffe2_pb2.DeviceOption() - - # if external_inputs have device remappings generated by previous nets, - # then add those remappings as external inputs as well. - all_remaps = defaultdict(list) - for entry, mapped_blob in blob_remap.items(): - all_remaps[entry.blob].append(mapped_blob) - mapped_external_inputs = [] - for input in new_net._net.external_input: - mapped_external_inputs.extend(all_remaps.get(input) or []) - new_net._net.external_input.extend(mapped_external_inputs) - - for op in net._net.op: - temp_remap.clear() - # Get where inputs and outputs should be. If it is a Placeholder - # (i.e. fake) op, then set op's device as blob's devices. - input_dev = None - output_dev = None - if placeHolderOps is not None and op.type in placeHolderOps: - input_dev, output_dev = InferOpDeviceAsBlobDevices(op) - else: - input_dev, output_dev = InferOpBlobDevices(op) - - for dev, input in zip(input_dev, op.input): - assert net.BlobIsDefined(input), \ - "input {} should be defined in the net.".format(input) - if input not in blob_to_device: - if net.is_external_input(input): - blob_to_device[input] = net_option - else: - raise AttributeError( - "No device information found for blob {}.". - format(input) - ) - - if not device_equal(blob_to_device[input], dev): - # reuse already moved input - if (RemapEntry(input, dev) in blob_remap and - blob_to_device[blob_remap[RemapEntry(input, dev)]] == dev): - temp_remap[input] = blob_remap[RemapEntry(input, dev)] - else: - # need to make input on correct device. - copy_func = copy_func_between_devices( - blob_to_device[input], dev - ) - - def _gen_new_name(blob, device_option): - CPU = caffe2_pb2.CPU - if device_option.device_type == CPU: - suffix = '_cpu' - elif IsGPUDeviceType(device_option.device_type): - suffix = '_gpu_' + str(device_option.device_id) - else: - raise RuntimeError( - "Unknown device type: {}". - format(device_option.device_type) - ) - return blob + suffix - - new_name = _gen_new_name(input, dev) - copy_func(new_net, input, new_name) - blob_remap[RemapEntry(input, dev)] = new_name - temp_remap[input] = new_name - blob_to_device[new_name] = dev - - if placeHolderOps is not None and op.type in placeHolderOps: - update_placeholder_op_output(op, blob_to_device) - - # Enforcing no reuse blob between operators. In-place blob usage in an - # op is allowed. 
- for dev, output in zip(output_dev, op.output): - if output in blob_to_device and ( - output not in op.input and - not device_equal(blob_to_device[output], dev) - ): - raise RuntimeError( - "In-place blob: {} is not supported between operators " - "with different device options. previous:{} now: {}. " - "Failed op:\n {}".format( - output, blob_to_device[output], dev, op - ) - ) - new_op = caffe2_pb2.OperatorDef() - new_op.CopyFrom(op) - - new_list = [temp_remap.get(b, b) for b in new_op.input] - del new_op.input[:] - new_op.input.extend(new_list) - - # keep in-place blobs in-place - original_inputs = list(op.input) - for i, out in enumerate(new_op.output): - try: - input_idx = original_inputs.index(out) - new_op.output[i] = new_op.input[input_idx] - except ValueError: - pass - - blob_to_device.update( - {o: d for d, o in zip(output_dev, new_op.output)}) - new_net.extend_ops([new_op]) - - return new_net, blob_to_device - - -def InjectDeviceCopiesAmongNets(nets, blob_to_device_init=None): - """ - Takes in a list of nets. They usually represent your whole execution graph. - This function will insert cross-device Copy ops into all nets and resolve - inter-net external input dependencies. A Copy op is inserted whenever an - external input of a net is produced on a different device than the one on - which it is required. - Inputs: - nets: a list of nets - Outputs: - new_nets: a list of new nets with device differences resolved. - blob_to_device: the final blob-to-device mapping. - - Some notes from wyiming: - 1. You MUST pass nets in execution order. e.g. [train_init, train] - """ - assert isinstance(nets, list), \ - "nets {} should be a list of nets.".format(str(nets)) - assert all(isinstance(net, Net) for net in nets), \ - "nets {} should be a list of nets.".format(str(nets)) - # A holistic blob-to-device mapping. - blob_to_device = blob_to_device_init or {} - blob_remap = {} - new_nets = [] - - for net in nets: - new_net, blob_to_device = InjectCrossDeviceCopies( - net, - blob_to_device=blob_to_device, - blob_remap=blob_remap, - ) - new_nets.append(new_net) - - return new_nets, blob_to_device - - -def InjectDeviceCopiesAmongNetsWithoutB2D(nets, blob_to_device_init=None): - new_nets, _ = InjectDeviceCopiesAmongNets(nets, blob_to_device_init) - return new_nets
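A hedged usage sketch for the injection helpers above, assuming a GPU-enabled caffe2 build (net, blob, and variable names are illustrative):

    from caffe2.proto import caffe2_pb2
    from caffe2.python import core

    gpu0 = core.DeviceOption(caffe2_pb2.CUDA, 0)

    init_net = core.Net('init')
    with core.DeviceScope(gpu0):
        w = init_net.ConstantFill([], 'w', shape=[4, 4], value=1.0)

    train_net = core.Net('train')
    x, b = train_net.AddExternalInput('x', 'b')
    # This op carries no device option, so it is placed on CPU; 'w' was
    # produced on GPU 0 and must therefore be copied across.
    train_net.FC([x, w, b], 'y')

    # Nets must be passed in execution order; a CopyGPUToCPU op feeding a
    # 'w_cpu' blob is inserted into the returned copy of train_net.
    new_nets, blob_to_device = core.InjectDeviceCopiesAmongNets(
        [init_net, train_net])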
- """ - assert type(op_output) in (list, tuple, BlobReference) - return ( - [op_output] - if isinstance(op_output, BlobReference) else list(op_output)) - - -def _add_net_to_dict(net_dict, net): - name = get_net_name(net) - if name in net_dict: - assert net_dict[name] is None or net == net_dict[name], ( - 'Different nets with same name: ' + name) - return False - else: - net_dict[name] = net if isinstance(net, Net) else None - return True - - -class ExecutionStep: - _step_names_used = set() - - @staticmethod - def _get_next_step_name(basename): - name = basename - next_idx = 1 - while name in ExecutionStep._step_names_used: - name = basename + '_' + str(next_idx) - next_idx += 1 - ExecutionStep._step_names_used |= set([name]) - return name - - def __init__(self, name, nets=None, num_iter=None): - self._step = caffe2_pb2.ExecutionStep() - self._step.name = name or ExecutionStep._get_next_step_name('step') - self._net_dict = OrderedDict() - self._is_used = False - self._substeps = [] - if nets is not None: - if type(nets) is Net: - nets = [nets] - for net in nets: - if _add_net_to_dict(self._net_dict, net): - self._step.network.extend([get_net_name(net)]) - if num_iter is not None: - self._step.num_iter = num_iter - - def get_net(self, name): - return self._net_dict[name] - - def Name(self): - return self._step.name - - def __str__(self): - return self._step.name - - def _assert_can_mutate(self): - assert not self._is_used, ( - 'Cannot mutate a step that has already been added to a plan/step.') - - def _notify_is_used(self): - self._is_used = True - - def Proto(self): - return self._step - - def HasNets(self): - return self._step.network is not None and ( - len(self._step.network) > 0) - - def HasSubsteps(self): - return self._step.substep is not None and ( - len(self._step.substep) > 0) - - def Nets(self): - return list(self._net_dict.values()) - - def Substeps(self): - return self._substeps - - def SetIter(self, num_iter): - self._assert_can_mutate() - self._step.num_iter = num_iter - - def SetCreateWorkspace(self, create_workspace): - self._assert_can_mutate() - self._step.create_workspace = create_workspace - - def SetNumConcurrentInstances(self, num_concurrent_instances): - self._assert_can_mutate() - self._step.num_concurrent_instances = num_concurrent_instances - - def SetOnlyOnce(self, only_once): - self._assert_can_mutate() - self._step.only_once = only_once - - def SetShouldStopBlob(self, should_stop_blob): - assert isinstance(should_stop_blob, BlobReference), ( - "expects BlobReference here, got {}".format(type(should_stop_blob))) - self._assert_can_mutate() - self._step.should_stop_blob = str(should_stop_blob) - - def RunEveryMillis(self, interval): - """ - Run this step every interval millisecods, as long as its - siblings are still running. It is guaranteed that, after all - siblings finish, this step will run at least one. - - This property is ignored for top-level ExecutionSteps. - """ - self._step.run_every_ms = interval - - def SetReportNet(self, report_net, report_interval): - """ DEPRECATED. Use RunEveryMillis instead. """ - self._assert_can_mutate() - _add_net_to_dict(self._net_dict, report_net) - self._step.report_net = get_net_name(report_net) - self._step.report_interval = report_interval - - def AddSubstep(self, substep): - self._assert_can_mutate() - assert not self.HasNets(), 'Cannot have both network and substeps.' 
- - def AddSubstep(self, substep): - self._assert_can_mutate() - assert not self.HasNets(), 'Cannot have both network and substeps.' - if isinstance(substep, ExecutionStep): - substep._notify_is_used() - if not substep.HasNets() and not substep.HasSubsteps(): - return self - for net in substep.Nets(): - _add_net_to_dict(self._net_dict, net) - self._substeps.append(substep) - proto = substep.Proto() - else: - proto = substep - self._step.substep.add().CopyFrom(proto) - return self - - def SetConcurrentSubsteps(self, concurrent_substeps): - self._assert_can_mutate() - assert not self.HasNets(), 'Cannot have both network and substeps.' - self._step.concurrent_substeps = concurrent_substeps - - def AddNet(self, net): - self._assert_can_mutate() - assert not self.HasSubsteps(), 'Cannot have both network and substeps.' - assert isinstance(net, Net) - _add_net_to_dict(self._net_dict, net) - self._step.network.extend([get_net_name(net)]) - return self - - def get_all_attributes(self, name): - """ - Return the list of all attributes under the given `name`, present in - all of the nets used in this execution step and its children. - """ - return [ - attr - for net in self._net_dict.values() - for attr in net.get_attributes(name) - ] - - @classmethod - def create_from_proto(cls, step_proto, net_obj_dict, net_proto_dict): - """ - Create an ExecutionStep from an ExecutionStep protobuf, recursively. - """ - assert isinstance(step_proto, caffe2_pb2.ExecutionStep) - assert (len(step_proto.network) > 0 and len(step_proto.substep) == 0) or \ - (len(step_proto.network) == 0 and len(step_proto.substep) > 0) - - steps_or_nets = [] - if len(step_proto.substep) > 0: - for substep_proto in step_proto.substep: - steps_or_nets.append(ExecutionStep.create_from_proto( - substep_proto, net_obj_dict, net_proto_dict)) - else: - for net_name in step_proto.network: - if net_name not in net_obj_dict: - assert net_name in net_proto_dict - net = Net(net_proto_dict[net_name]) - net_obj_dict[net_name] = net - net = net_obj_dict[net_name] - assert isinstance(net, Net) - steps_or_nets.append(net) - - num_iter = step_proto.num_iter if step_proto.HasField('num_iter') else None - concurrent_substeps = step_proto.concurrent_substeps if\ - step_proto.HasField('concurrent_substeps') else None - should_stop_blob = BlobReference(step_proto.should_stop_blob) if\ - step_proto.HasField('should_stop_blob') else None - only_once = step_proto.only_once if\ - step_proto.HasField('only_once') else None - num_concurrent_instances = step_proto.num_concurrent_instances if\ - step_proto.HasField('num_concurrent_instances') else None - create_workspace = step_proto.create_workspace if\ - step_proto.HasField('create_workspace') else None - run_every_ms = step_proto.run_every_ms if\ - step_proto.HasField('run_every_ms') else None - - return execution_step( - step_proto.name, - steps_or_nets, - num_iter=num_iter, - report_net=None, # DEPRECATED - report_interval=None, # DEPRECATED - concurrent_substeps=concurrent_substeps, - should_stop_blob=should_stop_blob, - only_once=only_once, - num_concurrent_instances=num_concurrent_instances, - create_workspace=create_workspace, - run_every_ms=run_every_ms)
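A round-trip sketch for create_from_proto, assuming a caffe2 build (net and step names are illustrative):

    from caffe2.python import core

    net = core.Net('loop_body')
    net.ConstantFill([], 'c', shape=[1], value=0.0)
    step = core.execution_step('loop', net, num_iter=10)

    # Rebuild an equivalent ExecutionStep object from its protobuf; the
    # net protos must be supplied alongside, keyed by net name.
    rebuilt = core.ExecutionStep.create_from_proto(
        step.Proto(),
        net_obj_dict={},
        net_proto_dict={net.Name(): net.Proto()})
    assert rebuilt.Proto().num_iter == 10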
- - -def add_nets_in_order(step, net_list): - proto = step.Proto() - for substep in step.Substeps(): - add_nets_in_order(substep, net_list) - for net in proto.network: - if net not in net_list: - net_list.append(net) - # FIXME(azzolini): This is actually wrong. Report nets should be - # instantiated first since they may run before any substep is run. - # However, currently, Reporter depends on this behavior. - if proto.report_net and proto.report_net not in net_list: - net_list.append(proto.report_net) - - -class Plan: - - def __init__(self, name_or_step): - self._plan = caffe2_pb2.PlanDef() - self._net_dict = OrderedDict() - self._steps = [] # A list of ExecutionStep - if isinstance(name_or_step, ExecutionStep): - self._plan.name = name_or_step.Name() - self.AddStep(name_or_step) - elif isinstance(name_or_step, basestring): - self._plan.name = name_or_step - else: - raise ValueError('name_or_step must be a string or ExecutionStep') - - def __str__(self): - return self._plan.name - - def Proto(self): - return self._plan - - def AddNets(self, nets): - for net in nets: - if _add_net_to_dict(self._net_dict, net): - assert isinstance(net, Net) - self._plan.network.add().CopyFrom(net.Proto()) - - def Nets(self): - return list(self._net_dict.values()) - - def AddStep(self, step): - assert isinstance(step, ExecutionStep) - step._notify_is_used() - if not step.HasNets() and not step.HasSubsteps(): - return - self._plan.execution_step.add().CopyFrom(step.Proto()) - self._steps.append(step) - # nets need to be added to the plan in order of usage - net_list = [] - add_nets_in_order(step, net_list) - self.AddNets([step.get_net(n) for n in net_list]) - - def Steps(self): - return self._steps - - def get_all_attributes(self, name): - """ - Return the list of all attributes under the given `name`, present in - all of the nets used in this plan. - """ - return [ - attr - for net in self._net_dict.values() - for attr in net.get_attributes(name) - ] - - @classmethod - def create_from_proto(cls, plan_proto): - assert isinstance(plan_proto, caffe2_pb2.PlanDef) - plan = Plan(plan_proto.name) - plan._plan.CopyFrom(plan_proto) - del plan._plan.network[:] - del plan._plan.execution_step[:] - - net_obj_dict = {} - net_proto_dict = {} - for net_proto in plan_proto.network: - assert net_proto.name not in net_proto_dict - net_proto_dict[net_proto.name] = net_proto - - for step_proto in plan_proto.execution_step: - step = ExecutionStep.create_from_proto( - step_proto, net_obj_dict, net_proto_dict) - plan.AddStep(step) - - return plan - - -def to_execution_step(step_or_nets, default_name=None): - from caffe2.python.net_builder import NetBuilder - if isinstance(step_or_nets, ExecutionStep): - return step_or_nets - - stop_blob = None - if not default_name and hasattr(step_or_nets, 'name'): - default_name = step_or_nets.name - if isinstance(step_or_nets, NetBuilder): - stop_blob = step_or_nets._stop_blob - step_or_nets = step_or_nets.get() - return execution_step( - default_name, step_or_nets, should_stop_blob=stop_blob) - - -def execution_step(default_name, - steps_or_nets, - num_iter=None, - report_net=None, - report_interval=None, - concurrent_substeps=None, - should_stop_blob=None, - only_once=None, - num_concurrent_instances=None, - create_workspace=False, - run_every_ms=None): - """ - Helper for creating an ExecutionStep. - - steps_or_nets can be: - - None - - Net - - ExecutionStep - - list<Net> - - list<ExecutionStep> - - should_stop_blob is either None or a scalar boolean blob. - - This blob is checked AFTER every substep/subnet. - - If specified and true, then this step will return immediately. - - Be sure to handle race conditions if setting from concurrent threads. - - If neither should_stop_blob nor num_iter is provided, defaults to num_iter=1. - """ - assert should_stop_blob is None or num_iter is None, ( - 'Cannot set both should_stop_blob and num_iter.') - if should_stop_blob is None and num_iter is None: - num_iter = 1 - - step = ExecutionStep(default_name) - if should_stop_blob is not None: - step.SetShouldStopBlob(should_stop_blob) - if num_iter is not None: - step.SetIter(num_iter) - if only_once is not None: - step.SetOnlyOnce(only_once) - if concurrent_substeps is not None: - step.SetConcurrentSubsteps(concurrent_substeps) - if report_net is not None: - assert report_interval is not None - step.SetReportNet(report_net, report_interval) - if num_concurrent_instances is not None: - step.SetNumConcurrentInstances(num_concurrent_instances) - if create_workspace: - step.SetCreateWorkspace(True) - if run_every_ms: - step.RunEveryMillis(run_every_ms) - - if isinstance(steps_or_nets, ExecutionStep): - step.AddSubstep(steps_or_nets) - elif isinstance(steps_or_nets, Net): - step.AddNet(steps_or_nets) - elif isinstance(steps_or_nets, list): - if all(isinstance(x, Net) for x in steps_or_nets): - for x in steps_or_nets: - step.AddNet(x) - else: - for x in steps_or_nets: - step.AddSubstep(to_execution_step(x)) - elif steps_or_nets: - raise ValueError( - 'steps_or_nets must be a step, a net, or a list of nets or steps.') - return step - - -def scoped_execution_step(name, *args, **kwargs): - """Same as execution_step() except that the step name is scoped.""" - default_name = ScopedName(name) if name else name - return execution_step(default_name, *args, **kwargs)
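For context on the removed helpers, a hedged end-to-end sketch tying execution_step and Plan together, assuming a caffe2 build (net, blob, and step names are illustrative):

    from caffe2.python import core, workspace

    init_net = core.Net('init')
    counter = init_net.CreateCounter([], 'counter', init_count=0)

    body_net = core.Net('body')
    body_net.CountUp(counter, 'prev_count')

    # Run init once, then the body for 3 iterations.
    step = core.execution_step('main', [
        core.execution_step('init_step', init_net, num_iter=1),
        core.execution_step('body_step', body_net, num_iter=3),
    ])
    plan = core.Plan(step)
    workspace.RunPlan(plan)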
- - -def _extract_stacktrace(): - ''' - This function extracts a stacktrace without file-system access, - purely via sys._getframe(), and removes the part that belongs to - this file (core.py). We are not using the inspect module because - it's just a wrapper on top of sys._getframe() whose - logic is based on accessing source files on disk - exactly what - we are trying to avoid here. The same holds for the traceback module. - - The reason for avoiding file-system access is that - if the code is located on an NFS, file access might be slow. - - The function returns a list of tuples (file_name, line_number, function) - ''' - - result = [] - # Ignore top 3 layers of stack: this function, _CreateAndAddToSelf, and - # whatever calls _CreateAndAddToSelf (either __getattr__ or Python) - frame = sys._getframe(3) - # We just go down the frame stack in a loop - while frame: - # It's important to extract information from the frame here, - # as the frame's current line will most probably change later. - result.append((frame.f_code.co_filename, frame.f_lineno, frame.f_code.co_name)) - frame = frame.f_back - return result - - -SetPerOpEnginePref = C.set_per_op_engine_pref -SetGlobalEnginePref = C.set_global_engine_pref -SetEnginePref = C.set_engine_pref -SetOpEnginePref = C.set_op_engine_pref
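The frame-walking trick in _extract_stacktrace is generic; a minimal self-contained sketch of the same technique (the function name is hypothetical, and the frame offset is 0 rather than the 3 used above):

    import sys

    def where_am_i():
        # Walk the frame stack without touching the filesystem, mirroring
        # _extract_stacktrace: capture (file, line, function) per frame.
        frames = []
        frame = sys._getframe(0)
        while frame:
            frames.append((frame.f_code.co_filename, frame.f_lineno,
                           frame.f_code.co_name))
            frame = frame.f_back  # None at the bottom of the stack
        return frames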
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py deleted file mode 100644 index b8433c644155..000000000000 --- a/caffe2/python/core_gradients_test.py +++ /dev/null @@ -1,1010 +0,0 @@ - - - - - -from hypothesis import given, settings -import hypothesis.strategies as st -import unittest - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, test_util, workspace -from caffe2.python.core import CreateOperator, GradientRegistry, IR - -import numpy as np - - -# First, we will set up a few gradient registry entries so that we can manually -# construct some test cases. - - -def NeedAll(op, g_output): - """A sanity check to make sure that all the gradients are given.""" - for name, g in zip(op.output, g_output): - if g is None: - raise RuntimeError( - 'Need gradient for "%s" but it is not provided.' % name) - return g_output - - -def GIS(op): - """A test util function to generate the gradient names for the inputs.""" - return [s + '_grad' for s in op.input] - - -def CopyDeviceOption(op, src_op): - if src_op.HasField('device_option'): - op.device_option.CopyFrom(src_op.device_option) - return op - - -# First gradient: (in -> out) leading to (out_grad -> in_grad) -@GradientRegistry.RegisterGradient('Direct') -def AddDirectGradient(op, g_output): - return ( - CopyDeviceOption( - CreateOperator('DirectGradient', NeedAll(op, g_output), GIS(op)), - op), - GIS(op) - ) - - -# Second gradient: (in -> out) leading to (out, out_grad -> in_grad) -@GradientRegistry.RegisterGradient('UseOutput') -def AddUseOutputGradient(op, g_output): - return ( - CopyDeviceOption( - CreateOperator( - 'UseOutputGradient', - list(op.output) + NeedAll(op, g_output), GIS(op)), - op), - GIS(op) - ) - - -@GradientRegistry.RegisterGradient('UseInput') -def AddUseInputGradient(op, g_output): - return ( - CopyDeviceOption( - CreateOperator( - 'UseInputGradient', - list(op.input) + NeedAll(op, g_output), GIS(op)), - op), - GIS(op) - ) - - -@GradientRegistry.RegisterGradient('Nogradient') -def AddNogradient(op, g_output): - return ( - [], - [None for s in op.input] - ) - - -class TestGradientCalculation(test_util.TestCase): - def assertOperatorListEqual(self, operatorDefList1, operatorDefList2): - for op in operatorDefList1: - op.debug_info = "" - if op.device_option: - del op.device_option.extra_info[:] - for op in operatorDefList2: - op.debug_info = "" - if op.device_option: - del op.device_option.extra_info[:] - self.assertEqual(operatorDefList1, operatorDefList2) - - @given(device_option=st.sampled_from([ - None, - core.DeviceOption(workspace.GpuDeviceType, 1)])) - @settings(deadline=10000) - def testDirect(self, device_option): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('Direct', 'hidden', 'out'), - ] - if device_option: - for op in operators: - op.device_option.CopyFrom(device_option) - desired_grad_operators = [ - CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'), - CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'), - ] - if device_option: - for op in desired_grad_operators: - op.device_option.CopyFrom(device_option) - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testDirectImplicitGradientSource(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('Direct', 'hidden', 'out'), - ] - desired_grad_operators = [ - CreateOperator( - "ConstantFill", 'out', "out_autogen_grad", value=1.0), - CreateOperator( - 'DirectGradient', 'out_autogen_grad', 'hidden_grad'), - CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'), - ] - for op in desired_grad_operators: - op.debug_info = "" - gradients, _ = GradientRegistry.GetBackwardPass( - operators, ['out']) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testDoesNotGenerateUnnecessaryGradients(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('Direct', 'hidden', 'out'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'), - ] - for op in desired_grad_operators: -
op.debug_info = "" - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'hidden': 'hidden_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testDirectButNoOutputGradientGiven(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('Direct', 'hidden', 'out'), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {}) - self.assertOperatorListEqual(gradients, []) - - def testDirectInPlace(self): - operators = [ - CreateOperator('Direct', 'in', 'in'), - CreateOperator('Direct', 'in', 'out'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'out_grad', 'in_grad'), - CreateOperator('DirectGradient', 'in_grad', 'in_grad'), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testVersionMismatch(self): - operators = [ - CreateOperator('Direct', 'x', 'x'), - CreateOperator('Direct', 'y', 'x'), - CreateOperator('Direct', 'x', 'y'), - ] - try: - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'y': 'y_grad'}) - self.assertFalse(True, "Should raise exception of incorrect version") - except RuntimeError as e: - print(e) - self.assertTrue("version" in str(e)) - pass - - def testUseOutput(self): - operators = [ - CreateOperator('UseOutput', 'in', 'hidden'), - CreateOperator('UseOutput', 'hidden', 'out'), - CreateOperator('Direct', 'out', 'sink'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'sink_grad', 'out_grad'), - CreateOperator( - 'UseOutputGradient', - ['out', 'out_grad'], 'hidden_grad' - ), - CreateOperator( - 'UseOutputGradient', - ['hidden', 'hidden_grad'], 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'sink': 'sink_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testUseOutputInPlace(self): - operators = [ - CreateOperator('UseOutput', 'in', 'in'), - CreateOperator('UseOutput', 'in', 'out'), - CreateOperator('Direct', 'out', 'sink'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'sink_grad', 'out_grad'), - CreateOperator( - 'UseOutputGradient', - ['out', 'out_grad'], 'in_grad' - ), - CreateOperator( - 'UseOutputGradient', - ['in', 'in_grad'], 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'sink': 'sink_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testUseOutputButOutputHasBeenChanged(self): - operators = [ - CreateOperator('UseOutput', 'in', 'hidden'), - # Note here: we overwrite hidden, but hidden will be needed by the - # gradient calculation of the first operator, so the gradient - # registry should return an error. 
- CreateOperator('Direct', 'hidden', 'hidden'), - CreateOperator('UseOutput', 'hidden', 'out'), - CreateOperator('Direct', 'out', 'sink'), - ] - with self.assertRaises(RuntimeError): - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'sink': 'sink_grad'}) - - def testUseInput(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('UseInput', 'hidden', 'out'), - CreateOperator('Direct', 'out', 'sink'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'sink_grad', 'out_grad'), - CreateOperator( - 'UseInputGradient', - ['hidden', 'out_grad'], 'hidden_grad' - ), - CreateOperator( - 'DirectGradient', - 'hidden_grad', 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'sink': 'sink_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testUseInputButInputHasBeenChanged(self): - """Test gradient for the following case: - - in -> out, with UseInput - in -> in - - Since we overwrite in op#1, but in will be needed by the gradient - calculation of op#0, the gradient registry should raise an error. - """ - operators = [ - CreateOperator('UseInput', 'in', 'out'), - CreateOperator('Direct', 'in', 'in'), - ] - with self.assertRaises(RuntimeError): - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - - @given(device_option=st.sampled_from([ - None, - core.DeviceOption(workspace.GpuDeviceType, 1)])) - @settings(deadline=10000) - def testMultiUseInput(self, device_option): - """Test gradient for the following case: - - in -> hidden1 - in -> hidden2 - hidden1, hidden2 -> out - """ - operators = [ - CreateOperator('Direct', 'in', 'hidden1'), - CreateOperator('Direct', 'in', 'hidden2'), - CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'), - ] - if device_option: - for op in operators: - op.device_option.CopyFrom(device_option) - desired_grad_operators = [ - CreateOperator( - 'DirectGradient', - 'out_grad', ['hidden1_grad', 'hidden2_grad'] - ), - CreateOperator( - 'DirectGradient', - 'hidden2_grad', 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'hidden1_grad', '_in_grad_autosplit_0' - ), - CreateOperator( - 'Sum', - ['in_grad', '_in_grad_autosplit_0'], 'in_grad' - ), - ] - if device_option: - for op in desired_grad_operators: - op.device_option.CopyFrom(device_option) - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {"out": "out_grad"}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testMultiUseInputButWithNoGradient(self): - """Test gradient for the following case: - - in -> hidden1 - in -(no gradient)-> hidden2 - hidden1, hidden2 -> out - """ - operators = [ - CreateOperator('Direct', 'in', 'hidden1'), - CreateOperator('Nogradient', 'in', 'hidden2'), - CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'), - ] - desired_grad_operators = [ - CreateOperator( - 'DirectGradient', - 'out_grad', ['hidden1_grad', 'hidden2_grad'] - ), - CreateOperator( - 'DirectGradient', - 'hidden1_grad', 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testMultiUseInputAndMultipleVersions(self): - """Test gradient for the following case: - - in -> in - in -> hidden1, hidden2 - hidden1, hidden2 -> out - """ - operators = [ - CreateOperator('Direct', 'in', 'in'), - CreateOperator('Direct', 'in', 'hidden1'), - CreateOperator('Direct', 'in', 'hidden2'), - CreateOperator('Direct', ['hidden1', 
'hidden2'], 'out'), - ] - desired_grad_operators = [ - CreateOperator( - 'DirectGradient', - 'out_grad', ['hidden1_grad', 'hidden2_grad'] - ), - CreateOperator( - 'DirectGradient', - 'hidden2_grad', 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'hidden1_grad', '_in_grad_autosplit_0' - ), - CreateOperator( - 'Sum', - ['in_grad', '_in_grad_autosplit_0'], 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'in_grad', 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testMultiUseInputAutoGenSumDevice(self): - parallel_tag = "parallelize:shard_by_1" - split_op_device_option_clear_auto_gen_sum = core.DeviceOption( - caffe2_pb2.CPU, - extra_info=[ - parallel_tag, - "{}:1".format(IR.ONLY_KEEP_IS_AUTO_GEN_SUM_OPS_TAG), - ] - ) - split_op_device_option_no_clear_auto_gen_sum = core.DeviceOption( - caffe2_pb2.CPU, - extra_info=[parallel_tag] - ) - operators_clear_auto_gen_sum = [ - CreateOperator( - 'Direct', 'in', 'hidden1', - device_option=split_op_device_option_clear_auto_gen_sum - ), - CreateOperator( - 'Direct', 'in', 'hidden2', - device_option=split_op_device_option_clear_auto_gen_sum - ), - CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'), - ] - gradients_clear_auto_gen_sum, _ = GradientRegistry.GetBackwardPass( - operators_clear_auto_gen_sum, {'out': 'out_grad'}) - self.assertEqual(gradients_clear_auto_gen_sum[-1].type, "Sum") - self.assertNotIn( - parallel_tag, - gradients_clear_auto_gen_sum[-1].device_option.extra_info - ) - - operators_no_clear_auto_gen_sum = [ - CreateOperator( - 'Direct', 'in', 'hidden1', - device_option=split_op_device_option_no_clear_auto_gen_sum - ), - CreateOperator( - 'Direct', 'in', 'hidden2', - device_option=split_op_device_option_no_clear_auto_gen_sum - ), - CreateOperator('Direct', ['hidden1', 'hidden2'], 'out'), - ] - gradients_no_clear_auto_gen_sum, _ = GradientRegistry.GetBackwardPass( - operators_no_clear_auto_gen_sum, {'out': 'out_grad'}) - self.assertEqual(gradients_clear_auto_gen_sum[-1].type, "Sum") - self.assertIn( - parallel_tag, - gradients_no_clear_auto_gen_sum[-1].device_option.extra_info - ) - - def testMultiUseInputAndMultipleVersionsBig(self): - """Test gradient for the following case: - - in -> in - in -> hidden1, hidden2 - hidden1, hidden2 -> in - in -> hidden3, hidden4, hidden5 - hidden3, hidden4, hidden5 -> out - """ - operators = [ - CreateOperator('Direct', 'in', 'in'), - CreateOperator('Direct', 'in', 'hidden1'), - CreateOperator('Direct', 'in', 'hidden2'), - CreateOperator('Direct', ['hidden1', 'hidden2'], 'in'), - CreateOperator('Direct', 'in', 'hidden3'), - CreateOperator('Direct', 'in', 'hidden4'), - CreateOperator('Direct', 'in', 'hidden5'), - CreateOperator('Direct', ['hidden3', 'hidden4', 'hidden5'], 'out'), - ] - desired_grad_operators = [ - CreateOperator( - 'DirectGradient', - 'out_grad', ['hidden3_grad', 'hidden4_grad', 'hidden5_grad'] - ), - CreateOperator( - 'DirectGradient', - 'hidden5_grad', 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'hidden4_grad', '_in_grad_autosplit_0' - ), - CreateOperator( - 'DirectGradient', - 'hidden3_grad', '_in_grad_autosplit_1' - ), - CreateOperator( - 'Sum', - ['in_grad', '_in_grad_autosplit_0', - '_in_grad_autosplit_1'], - 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'in_grad', ['hidden1_grad', 'hidden2_grad'] - ), - CreateOperator( - 'DirectGradient', - 'hidden2_grad', 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 
'hidden1_grad', '_in_grad_autosplit_0' - ), - CreateOperator( - 'Sum', - ['in_grad', '_in_grad_autosplit_0'], - 'in_grad' - ), - CreateOperator( - 'DirectGradient', - 'in_grad', 'in_grad' - ), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - for s in gradients: - print(str(s)) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testGradientMappingUsingSumOp(self): - """Since Sum is used in accumulating gradients, we will test if - it is OK to also explicitly use it in the graph.""" - operators = [ - CreateOperator('FC', ['in', 'w', 'b'], 'fc'), - CreateOperator('Sum', 'fc', 'agg'), - CreateOperator('AveragedLoss', 'agg', 'loss'), - ] - # This should run correctly. - gradient_ops, _ = GradientRegistry.GetBackwardPass( - operators, {'loss': 'loss_grad'}) - for s in gradient_ops: - print(str(s)) - - def testGradientCalculationWithPrint(self): - """Test a common use case where we have Print in the forward pass.""" - operators = [ - CreateOperator('FC', ['in', 'w', 'b'], 'fc'), - CreateOperator('Print', 'fc', []), - CreateOperator('AveragedLoss', 'fc', 'loss'), - ] - desired_grad_operators = [ - CreateOperator('AveragedLossGradient', - ['fc', 'loss_grad'], 'fc_grad'), - CreateOperator('FCGradient', ['in', 'w', 'fc_grad'], - ['w_grad', 'b_grad', 'in_grad']), - ] - for g in desired_grad_operators: - g.is_gradient_op = 1 - # This should run correctly. - gradient_ops, _ = GradientRegistry.GetBackwardPass( - operators, {'loss': 'loss_grad'}) - for s in gradient_ops: - print(str(s)) - self.assertOperatorListEqual(gradient_ops, desired_grad_operators) - - def testStopGradient(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('StopGradient', 'hidden', 'hidden2'), - CreateOperator('Direct', 'hidden2', 'out'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'out_grad', 'hidden2_grad'), - ] - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - - def testStopGradientOrphan(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('StopGradient', 'hidden', 'auto_blobx'), - CreateOperator('Direct', 'hidden', 'out'), - ] - with self.assertRaises(ValueError): - # This should complain about incorrect use of StopGradient - gradients, _ = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - - def testStopGradientInplace(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('StopGradient', 'hidden', 'hidden'), - CreateOperator('Direct', 'hidden', 'out'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'out_grad', 'hidden_grad'), - ] - gradients, grad_map = GradientRegistry.GetBackwardPass( - operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - self.assertEqual(grad_map, {'out': 'out_grad'}) - - def testStopGradientWithMultiUseOperators(self): - operators = [ - CreateOperator('Direct', 'in', 'hidden'), - CreateOperator('Direct', 'hidden', 'hidden2'), - CreateOperator('StopGradient', 'hidden', 'hidden3'), - CreateOperator('Direct', ['hidden2', 'hidden3'], 'out'), - ] - desired_grad_operators = [ - CreateOperator('DirectGradient', 'out_grad', - ['hidden2_grad', 'hidden3_grad']), - CreateOperator('DirectGradient', 'hidden2_grad', 'hidden_grad'), - CreateOperator('DirectGradient', 'hidden_grad', 'in_grad'), - ] - gradients, grad_map = GradientRegistry.GetBackwardPass( 
- operators, {'out': 'out_grad'}) - self.assertOperatorListEqual(gradients, desired_grad_operators) - self.assertEqual( - grad_map, {'out': 'out_grad', 'hidden2': 'hidden2_grad', - 'hidden3': 'hidden3_grad', 'hidden': 'hidden_grad', - 'in': 'in_grad'}) - - def test_zero_gradient(self): - net = core.Net("zero_grad_test") - - hidden_prev, cell, gates, seq_lengths, timestep =\ - net.AddExternalInput("h", "c", "g", "s", "t") - hidden, cell = net.LSTMUnit( - [hidden_prev, cell, gates, seq_lengths, timestep], - ["hidden_t", "cell_t"]) - with self.assertRaises(Exception): - net.AddGradientOperators([hidden]) - net.ZeroGradient(cell, []) - net.AddGradientOperators([hidden]) - - def test_two_grads(self): - net = core.Net("test_two_grads") - input, two, three = net.AddExternalInput("input", "two", "three") - - m1 = net.Mul([input, two], "mul_1") - m2 = net.Mul([m1, three], "mul_2") - grad_map = net.AddGradientOperators([m2, m1]) - workspace.ResetWorkspace() - workspace.blobs[input] = np.array([1]).astype(np.float32) - workspace.blobs[two] = np.array([2]).astype(np.float32) - workspace.blobs[three] = np.array([3]).astype(np.float32) - workspace.RunNetOnce(net) - print(net.Proto()) - for blob in workspace.blobs: - print(blob, workspace.blobs[blob]) - print("Input grad: ", workspace.blobs[grad_map[str(input)]]) - assert workspace.blobs[grad_map[str(input)]] == 8.0 - - -# Skip if sparse operators are not available -@unittest.skipIf(not core.IsOperator('SparseFunHash'), - 'Sparse operators not available') -class TestSparseGradientsAccumulation(test_util.TestCase): - def testSparseAccumulationWithValues(self): - # The gradient for "Gather" only computes values. indices are directly - # passed from the input - # - # x1-->Gather-->x4--> - # | | - # x2-----+ DotProduct-->x6 - # | | - # x3-->Gather-->x5--> - net = core.Net("test_net") - net.Gather(["x2", "x1"], "x4") - net.Gather(["x2", "x3"], "x5") - net.DotProduct(["x4", "x5"], "x6") - net.AddGradientOperators(["x6"]) - sum_op_i = net.Proto().op[-2] - sum_op_v = net.Proto().op[-1] - self.assertEqual(sum_op_i.input[0], "x3") - self.assertEqual(sum_op_i.input[1], "x1") - self.assertEqual(sum_op_i.output[0], "x2_grad_indices_concat") - self.assertEqual(sum_op_v.input[0], "x5_grad") - self.assertEqual(sum_op_v.input[1], "x4_grad") - self.assertEqual(sum_op_v.output[0], "x2_grad_values_concat") - - def testSparseGradientToDense(self): - # - # x1-->Gather-->x4--> - # | | - # x0, w, b-->FC-->x2-->EnsureDenseGradient-->x2---+ DotProduct-->x6 - # | | - # x3-->Gather-->x5--> - net = core.Net("test_net") - net.FC(["x0", "w", "b"], "x2") - net.EnsureDense(["x2"], "x2") - net.Gather(["x2", "x1"], "x4") - net.Gather(["x2", "x3"], "x5") - net.DotProduct(["x4", "x5"], "x6") - net.AddGradientOperators(["x6"]) - ensure_dense_op = net.Proto().op[-2] - self.assertEqual(ensure_dense_op.input[0], "x2_grad_indices_concat") - self.assertEqual(ensure_dense_op.input[1], "x2_grad_values_concat") - self.assertEqual(ensure_dense_op.output[0], "x2_grad") - - def testSparseAccumulationWithIndicesAndValues(self): - # The gradient for "SparseFunHash" computes both indices and values - # - # x1--------> - # | - # x2----> | - # | | - # x3---SparseFunHash-->x8 - # / \ - # x4---+ DotProduct-->x10 - # \ / - # x5---SparseFunHash-->x9 - # | | - # x6----> | - # | - # x7--------> - net = core.Net("test_net") - net.SparseFunHash(["x1", "x2", "x3", "x4"], "x8") - net.SparseFunHash(["x5", "x6", "x7", "x4"], "x9") - net.DotProduct(["x8", "x9"], "x10") - net.AddGradientOperators(["x10"]) - sum_op_i = 
net.Proto().op[-2] - sum_op_v = net.Proto().op[-1] - self.assertEqual(sum_op_i.input[0], "_x4_grad_indices_autosplit_0") - self.assertEqual(sum_op_i.input[1], "_x4_grad_indices_autosplit_1") - self.assertEqual(sum_op_i.output[0], "x4_grad_indices_concat") - self.assertEqual(sum_op_v.input[0], "_x4_grad_values_autosplit_0") - self.assertEqual(sum_op_v.input[1], "_x4_grad_values_autosplit_1") - self.assertEqual(sum_op_v.output[0], "x4_grad_values_concat") - - -class TestGradientsAccumulationWithNoGradientOps(test_util.TestCase): - def testNormalAccumulation(self): - # x1-->Relu--x2----------------->DotProduct-->x4 - # | | - # -->Softmax-->x3--> - net = core.Net("test_net") - net.Relu("x1", "x2") - net.Softmax("x2", "x3") - net.DotProduct(["x2", "x3"], "x4") - net.AddGradientOperators(["x4"]) - sum_op = net.Proto().op[-2] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - - def testAccumulationWithNoGradientBranch(self): - # -->PRINT - # | - # x1-->Relu--x2----------------->DotProduct-->x4 - # | | - # -->Softmax-->x3--> - net = core.Net("test_net") - net.Relu("x1", "x2") - net.Print("x2", []) - net.Softmax("x2", "x3") - net.DotProduct(["x2", "x3"], "x4") - net.AddGradientOperators(["x4"]) - sum_op = net.Proto().op[-2] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - - -class TestGradientsAccumulationWithPassThroughGradients(test_util.TestCase): - def testAddOpInMiddle(self): - # x1-->Relu--x2----------------->Add-->x4 - # | | - # -->Softmax-->x3--> - # - # Expected gradient graph: - # - # x1_g<--ReluG<--x2_g<--Sum<------------<---------x4_g - # | | - # <--_x2_g_split_0<--SoftmaxG - net = core.Net("test_net") - net.Relu("x1", "x2") - net.Softmax("x2", "x3") - net.Add(["x2", "x3"], "x4") - input_to_grad = net.AddGradientOperators({"x4": "x4_grad"}) - sum_op = net.Proto().op[-2] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - - def testAddAndDynamicConstant(self): - net = core.Net("test_net") - net.FC(["x1", "x1_w", "x1_b"], ["x2"]) - net.Relu("x2", "x2") - net.ConstantFill(["x2"], ["x3"]) - net.Add(["x2", "x3"], "x4") - net.FC(["x4", "x4_w", "x4_b"], ["x5"]) - net.SoftmaxWithLoss(["x5", "labels"], ["softmax", "loss"]) - input_to_grad = net.AddGradientOperators(["loss"]) - for op in net.Proto().op: - self.assertFalse(op.type == 'Sum') - - self.assertTrue("x4" in input_to_grad) - self.assertTrue("x1" in input_to_grad) - self.assertEqual(input_to_grad["x1"], "x1_grad") - - def testAddAndStaticConstant(self): - net = core.Net("test_net") - net.FC(["x1", "x1_w", "x1_b"], ["x2"]) - net.Relu("x2", "x2") - net.ConstantFill([], ["x3"], shape=[1]) - net.Add(["x2", "x3"], "x4", broadcast=1) - net.FC(["x4", "x4_w", "x4_b"], ["x5"]) - net.SoftmaxWithLoss(["x5", "labels"], ["softmax", "loss"]) - input_to_grad = net.AddGradientOperators(["loss"]) - print(input_to_grad) - - self.assertTrue("x1" in input_to_grad) - self.assertEqual(input_to_grad["x1"], "x1_grad") - - def testSubOpInMiddle(self): - # x1-->Relu--x2----------------->Sub-->x4 - # | | - # -->Softmax-->x3--> - # - # Expected gradient graph: - # - # x1_g<--ReluG<--x2_g<--Sum<------------<-----------------------x4_g - # | | - # <--_x2_g_split_0<--SoftmaxG<--x3_g<--neg - net = 
core.Net("test_net") - net.Relu("x1", "x2") - net.Softmax("x2", "x3") - net.Sub(["x2", "x3"], "x4") - input_to_grad = net.AddGradientOperators({"x4": "x4_grad"}) - print(str(net.Proto())) - sum_op = net.Proto().op[-2] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - - def testAddOpAtLeaf(self): - # x1 - # \ - # -->Add-->x4 - # / \ - # x2 -->DotProduct-->x6 - # \ / - # -->Add-->x5 - # / - # x3 - # - # Expected gradient graph: - # - # x2_g<--Sum<--x4_g<--DotProductG<--x6_g - # | | | - # <---x5_g<------- - net = core.Net("test_net") - net.Add(["x1", "x2"], "x4") - net.Add(["x2", "x3"], "x5") - net.DotProduct(["x4", "x5"], "x6") - input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) - sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x3_grad") - - def testSubOpAtLeaf(self): - # x1 - # \ - # -->Sub-->x4 - # / \ - # x2 -->DotProduct-->x6 - # \ / - # -->Sub-->x5 - # / - # x3 - # - # Expected gradient graph: - # - # x2_g<-------Sum<--x2_g_split_0<--neg<--x4_g<--DotProductG<--x6_g - # | | - # x3_g<--neg<--<--x5_g<-------------------------------- - net = core.Net("test_net") - net.Sub(["x1", "x2"], "x4") - net.Sub(["x2", "x3"], "x5") - net.DotProduct(["x4", "x5"], "x6") - input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) - sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x3_grad") - - def testMultiLayerAddOps(self): - # x1 - # \ - # -->Add-->x4 - # / \ - # x2 -->Add-->x6 - # \ / - # -->Add-->x5 - # / - # x3 - # - # Expected gradient graph: - # - # x2_g<--Sum<-----x6_g - # | | - # <-------- - net = core.Net("test_net") - net.Add(["x1", "x2"], "x4") - net.Add(["x2", "x3"], "x5") - net.Add(["x4", "x5"], "x6") - input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) - sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x3_grad") - - def testMultiLayerSubOps(self): - # x1 - # \ - # -->Sub-->x4 - # / \ - # x2 -->Sub-->x6 - # \ / - # -->Sub-->x5 - # / - # x3 - # - # Expected gradient graph: - # - # x2_g<--Sum<-----x6_g - # | | - # <-------- - net = core.Net("test_net") - net.Sub(["x1", "x2"], "x4") - net.Sub(["x2", "x3"], "x5") - net.Sub(["x4", "x5"], "x6") - input_to_grad = net.AddGradientOperators({"x6": "x6_grad"}) - sum_op = net.Proto().op[-1] - self.assertEqual(sum_op.input[0], "x2_grad") - self.assertEqual(sum_op.input[1], "_x2_grad_autosplit_0") - self.assertEqual(sum_op.output[0], "x2_grad") - self.assertEqual(input_to_grad["x1"], "x1_grad") - self.assertEqual(input_to_grad["x2"], "x2_grad") - self.assertEqual(input_to_grad["x3"], "x3_grad") - - def testAccumulationRuns(self): - net = core.Net("test_net") - 
input, one, two, three = net.AddExternalInput( - "input", "one", "two", "three") - - m1 = net.Mul([input, two], "mul_1") - m2 = net.Mul([input, three], "mul_2") - sub = net.Sub([m1, one]) - grad_map = net.AddGradientOperators([m2, sub]) - - workspace.ResetWorkspace() - workspace.blobs[one] = np.array([1]).astype(np.float32) - workspace.blobs[input] = np.array([1]).astype(np.float32) - workspace.blobs[two] = np.array([2]).astype(np.float32) - workspace.blobs[three] = np.array([3]).astype(np.float32) - workspace.RunNetOnce(net) - print("Input grad: ", workspace.blobs[grad_map[str(input)]]) - assert workspace.blobs[grad_map[str(input)]] == 5.0 - - def testIncorrectOperator(self): - net = core.Net("test_net") - a, b, one = net.AddExternalInput("a", "b", "one") - m1 = net.Mul(a, b) # does not have a second output - sub = net.Sub([m1, one]) - try: - net.AddGradientOperators([sub]) - self.assertFalse(True, "Did not throw exception") - except Exception as e: - self.assertTrue("schema" in str(e)) - - def testDeviceOptionsPropagation(self): - ''' - Verifies that aggregation operators in a backward path will be on - the same device as the parameter. - ''' - device_0 = 'node:0' - - # init_net. - init_net = core.Net("init_net") - with core.DeviceScope(0, node_name=device_0): - w = init_net.UniformFill([], 'w', shape=[10000, 64]) - ids = init_net.GivenTensorFill( - [], - 'ids', - values=np.random.random_integers(low=0, high=10000, size=10), - ) - ids_2 = init_net.GivenTensorFill( - [], - 'ids_2', - values=np.random.random_integers(low=0, high=10000, size=10), - ) - - # train_net. - train_net = core.Net("train_net") - with core.DeviceScope(0, node_name=device_0): - vals = train_net.Gather([w, ids], "gathered") - r_vals = train_net.ReduceSum([vals], 1, axes=0) - - vals_2 = train_net.Gather([w, ids_2], "gathered_2") - r_vals_2 = train_net.ReduceSum([vals_2], 1, axes=0) - - loss = train_net.Sum([r_vals, r_vals_2], 1) - train_net.AddGradientOperators([loss]) - # All concat operators should be on device_0 - for op in train_net.Proto().op: - if op.type == 'Concat': - self.assertEqual(op.device_option.node_name, device_0) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py deleted file mode 100644 index 13cde2475f64..000000000000 --- a/caffe2/python/core_test.py +++ /dev/null @@ -1,1264 +0,0 @@ - - - - - -from inspect import currentframe, getframeinfo -import unittest - -import numpy as np - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, schema, test_util -from caffe2.python.task import Node, Task - - -class TestScopes(test_util.TestCase): - def testBlobReferenceIsIndependentFromNameScope(self): - blob_v = core.BlobReference("v") - with core.NameScope("foo"): - blob_w = core.BlobReference("w") - with core.NameScope("bar"): - blob_x = core.BlobReference("x") - self.assertEqual(str(blob_v), "v") - self.assertEqual(str(blob_w), "w") - self.assertEqual(str(blob_x), "x") - - def testNameScopeWithOp(self): - global_x = core.BlobReference("x") - global_y = core.BlobReference("y") - with core.NameScope("foo"): - # Raw strings should have the namescope prepended. - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/y") - # BlobReferences should not.
- op = core.CreateOperator("Relu", global_x, global_y) - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "y") - - def testNameScopeWithReset(self): - with core.NameScope("foo"): - # foo/ - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/y") - with core.NameScope("bar"): - # foo/bar/ - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/bar/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/bar/y") - # Back to foo/ - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/y") - with core.NameScope("bar", reset=True): - # bar/ - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "bar/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "bar/y") - # Back to foo/ - op = core.CreateOperator("Relu", "x", "y") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/y") - - def testDeviceScope(self): - # No device - op = core.CreateOperator("Relu", "x", "y") - self.assertFalse(op.HasField('device_option')) - # explicitly setting a device - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - op = core.CreateOperator("Relu", "x", "y", device_option=device_option) - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - with core.DeviceScope(device_option): - # from device scope - op = core.CreateOperator("Relu", "x", "y") - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - # from an overridden device option - override_device = caffe2_pb2.DeviceOption() - override_device.device_type = caffe2_pb2.CPU - op = core.CreateOperator( - "Relu", "x", "y", device_option=override_device) - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CPU) - # back from normal: no device - op = core.CreateOperator("Relu", "x", "y") - self.assertFalse(op.HasField('device_option')) - device_option = caffe2_pb2.DeviceOption() - - def testNameAndDeviceScopeTogether(self): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - with core.DeviceScope(device_option): - with core.NameScope("foo"): - op = core.CreateOperator("Relu", "x", "y") - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "foo/x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "foo/y") - - -class TestCloneNet(test_util.TestCase): - def testPartialClone(self): - params = core.Net('params') - p1 = params.ConstantFill([], ['p1']) - workspace.CreateNet(params) - 
workspace.RunNetOnce(params) - - n = core.Net('original') - a1 = n.AddExternalInput('a1') - a2 = n.AddExternalInput('a2') - b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0) - c1 = n.Sum([b1, p1], ['c1']) - c2 = n.Sum([b2], ['c2']) - d = n.Sum([c1, c2], ['d']) - - # test that gradient ops are ignored when partial-cloning - n.AddGradientOperators([d]) - - # test some in-place ops - k = n.Sum([p1], ['k']) - e = n.Sum([d], ['e']) - e = n.Sum([e, k], [e]) - e = n.Sum([e], [e]) - f = n.Sum(e, ['f']) - - def net_assert(net, num_ops, inputs, outputs, internals): - self.assertEqual(len(net.Proto().op), num_ops) - self.assertEqual(set(net.Proto().external_input), inputs) - self.assertEqual(set(net.Proto().external_output), outputs) - all_blobs = set(net.Proto().external_input) - all_blobs |= set(net.Proto().external_output) - for op in net.Proto().op: - all_blobs |= set(op.input) | set(op.output) - self.assertEqual(all_blobs, inputs | outputs | internals) - # create net to make sure its valid - for input in inputs: - workspace.FeedBlob(input, np.array([])) - workspace.CreateNet(net) - - n2, (d22, ) = n.ClonePartial('f1', {a1: 'a11', a2: 'a22'}, [d]) - net_assert( - n2, 4, {'p1', 'a11', 'a22'}, {'f1/d'}, - {'f1/b1', 'f1/b2', 'f1/c1', 'f1/c2', 'p1'}) - self.assertTrue(isinstance(d22, core.BlobReference)) - self.assertEqual(d22.Net(), n2) - self.assertEqual(str(d22), 'f1/d') - - n3, (d22, ) = n.ClonePartial('f2', [b1, b2], [d]) - net_assert( - n3, 3, {'p1', 'b1', 'b2'}, {'f2/d'}, {'f2/c1', 'f2/c2', 'p1'}) - self.assertEqual(str(d22), 'f2/d') - - n4, (c22, ) = n.ClonePartial('f3', [b1], [c1]) - net_assert(n4, 1, {'p1', 'b1'}, {'f3/c1'}, {'p1'}) - self.assertEqual(str(c22), 'f3/c1') - - n5, (c11, c22) = n.ClonePartial('f4', [b1, b2], [c1, c2]) - net_assert(n5, 2, {'p1', 'b1', 'b2'}, {'f4/c1', 'f4/c2'}, {'p1'}) - self.assertEqual(str(c11), 'f4/c1') - self.assertEqual(str(c22), 'f4/c2') - - with self.assertRaises(AssertionError): - n.ClonePartial('f4', [a1, a2, c2], [d]) - - n6, (e22, ) = n.ClonePartial('f5', [d], [e]) - net_assert(n6, 4, {'p1', 'd'}, {'f5/e'}, {'f5/k', 'p1'}) - self.assertEqual(str(e22), 'f5/e') - - n8, (e22, f22) = n.ClonePartial('f7', [d], [e, f]) - net_assert(n8, 5, {'p1', 'd'}, {'f7/e', 'f7/f'}, {'p1', 'f7/k'}) - self.assertEqual(str(e22), 'f7/e') - self.assertEqual(str(f22), 'f7/f') - - params._CheckLookupTables() - n._CheckLookupTables() - - def test_mask_clone_update_external_list(self): - n = core.Net('original') - a1 = n.AddExternalInput('a1') - a2 = n.AddExternalInput('a2') - p1 = 'p1' - b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0) - c1 = n.Sum([b1, p1], ['c1']) - c2 = n.Sum([b2], ['c2']) - n.Sum([c1, c2], ['d']) - new_net = n.Clone( - "new", op_id_mask=[0, 1], keep_schema=True, update_external_list=True) - self.assertEqual( - sorted(map(str, new_net.external_inputs)), - ["a1", "a2", "p1"], - "external input not matched", - ) - self.assertEqual( - sorted(map(str, new_net.external_outputs)), - ["b2", "c1"], - "external output not matched", - ) - new_net = n.Clone( - "new2", op_id_mask=[2, 3], keep_schema=True, update_external_list=True) - self.assertEqual( - sorted(map(str, new_net.external_inputs)), - ["b2", "c1"], - "external input not matched", - ) - self.assertEqual( - sorted(map(str, new_net.external_outputs)), - ["d"], - "external output not matched", - ) - - def test_control_op_remap(self): - # Subnets under If/AsyncIf operators should get name remapping when cloned - n = core.Net("original") - then_net = core.Net("a") - then_net.FC(["inputA"], "fc_a") - else_net = 
core.Net("b") - else_net.FC(["inputB"], "fc_b") - n.If( - inputs=[], - outputs=[], - then_net=then_net.Proto(), - else_net=else_net.Proto(), - ) - copied = n.Clone("copied", blob_remap={"inputA": "inputX"}) - if_op = copied._net.op[0] - self.assertEqual(if_op.arg[0].n.op[0].input, ["inputX"]) - self.assertEqual(if_op.arg[1].n.op[0].input, ["inputB"]) - - -class TestExternalInputs(test_util.TestCase): - def testAddExternalInputShouldRaiseIfDuplicate(self): - net = core.Net("test") - net.AddExternalInput( - schema.Struct(("x", schema.Scalar(np.float64))), - ) - with self.assertRaises(AssertionError): - net.AddExternalInput( - schema.Struct(("x", schema.Scalar(np.float64))), - ) - - def testAddExternalInputShouldRaiseIfDuplicateInSameCall(self): - net = core.Net("test") - with self.assertRaises(AssertionError): - net.AddExternalInput( - schema.Struct(("x", schema.Scalar(np.float64))), - schema.Struct(("x", schema.Scalar(np.float64))), - ) - - def testSetInputRecordWithBlobs(self): - net = core.Net("test") - record = schema.NewRecord(net, schema.Struct( - ("x", schema.Scalar(np.float64)), - )) - input_record = net.set_input_record(record) - self.assertTrue(net.BlobIsDefined(input_record.x())) - self.assertIn(input_record.x(), net.external_inputs) - - def testSetInputRecordWithoutBlobs(self): - net = core.Net("test") - record = schema.Struct(("x", schema.Scalar(np.float64))) - input_record = net.set_input_record(record) - self.assertTrue(net.BlobIsDefined(input_record.x())) - self.assertIn(input_record.x(), net.external_inputs) - - -class TestCreateOperator(test_util.TestCase): - def testCreate(self): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - op = core.CreateOperator( - "Ludicrous", "x", "y", name="ludicrous", - control_input="z", device_option=device_option, - engine="WARP", arg1=1, arg2="2", arg3=[1, 2, 3]) - self.assertEqual(op.type, "Ludicrous") - self.assertEqual(op.name, "ludicrous") - self.assertEqual(op.engine, "WARP") - self.assertEqual(len(op.input), 1) - self.assertEqual(op.input[0], "x") - self.assertEqual(len(op.output), 1) - self.assertEqual(op.output[0], "y") - self.assertEqual(len(op.control_input), 1) - self.assertEqual(op.control_input[0], "z") - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(len(op.arg), 3) - - # can't guarantee ordering of kwargs, so generate a set of args - # to test with - arg_map = {} - for arg in op.arg: - arg_map[arg.name] = arg - - # Check all elements exist that should - self.assertEqual("arg1" in arg_map, True) - self.assertEqual("arg2" in arg_map, True) - self.assertEqual("arg3" in arg_map, True) - - # Now test that all args were initialized correctly - self.assertEqual(arg_map["arg1"].i, 1) - self.assertEqual(arg_map["arg2"].s, b"2") - self.assertEqual(list(arg_map["arg3"].ints), [1, 2, 3]) - - -class TestAutoNaming(test_util.TestCase): - def assertOperatorListEqual(self, operatorDefList1, operatorDefList2): - for op in operatorDefList1: - op.debug_info = "" - for op in operatorDefList2: - op.debug_info = "" - self.assertEqual(operatorDefList1, operatorDefList2) - """ - Test that operators are named with different names, and that automatically - named blob names don't clash intra or inter networks. 
- """ - def test_next_blob(self): - def create_net(): - net = core.Net('net') - with core.NameScope('foo'): - net.Add(['a', 'b'], net.NextScopedBlob('ab')) - - net.Add(['c', 'd'], net.NextBlob('cd')) - return net - - net_a = create_net() - net_b = create_net() - # created net proto is predicatable. - self.assertOperatorListEqual(net_a.Proto().op, - net_b.Proto().op) - self.assertEqual(net_a.Proto().op[0].output[0], 'foo/ab') - self.assertEqual(net_a.Proto().op[1].output[0], 'cd') - - net_c = core.Net('net') - # different calls return different blob names - self.assertNotEqual(str(net_c.NextBlob('b')), str(net_c.NextBlob('b'))) - - def test_auto_naming(self): - a = core.Net('net') - b = core.Net('net') - self.assertNotEqual(a.Proto().name, b.Proto().name) - a_in1 = a.AddExternalInput('a') - b_in1 = b.AddExternalInput('b') - all_outputs_single = [] - all_outputs_list = [] - - def add_ops(): - all_outputs_single.append(a.Sum([a_in1, a_in1])) - all_outputs_single.append(a.Sum([a_in1, a_in1])) - all_outputs_single.append(b.Sum([b_in1, b_in1])) - all_outputs_single.append(b.Sum([b_in1, b_in1])) - all_outputs_list.append(a.Sum([a_in1, a_in1], outputs=2)) - all_outputs_list.append(a.Sum([a_in1, a_in1], outputs=2)) - all_outputs_list.append(b.Sum([b_in1, b_in1], outputs=2)) - all_outputs_list.append(b.Sum([b_in1, b_in1], outputs=2)) - - add_ops() - with core.NameScope('n1'): - add_ops() - - # Force reset of lookup tables - a.Proto().name - - with core.NameScope('n2'): - add_ops() - - all_outputs = [] - for s in all_outputs_single: - all_outputs.append(str(s)) - for l in all_outputs_list: - for o in l: - all_outputs.append(str(o)) - - for i, o1 in enumerate(all_outputs): - for j, o2 in enumerate(all_outputs): - if i != j: - self.assertNotEqual(str(o1), str(o2)) - - a._CheckLookupTables() - b._CheckLookupTables() - - -class TestAppendNet(test_util.TestCase): - - def test_external_inputs_merged_correctly(self): - netA = core.Net("A") - netA.Sum(["in1", "in2"], ["sum1"]) - self.assertTrue("in1" in netA.external_inputs) - - netB = core.Net("B") - netB.Sum(["in3", "in4"], ["in1"]) - netB.AppendNet(netA) - self.assertFalse("in1" in netB.external_inputs) - - def test_external_inputs_merged_correctlyB(self): - netA = core.Net("A") - netA.Sum(["in1", "in2"], ["sum1"]) - self.assertTrue("in1" in netA.external_inputs) - - netB = core.Net("B") - netB.Sum(["in3", "in4"], ["in1"]) - netA.AppendNet(netB) # note different order than in prev test - self.assertTrue("in1" in netA.external_inputs) - - -class TestExtractPredictorNet(test_util.TestCase): - - @unittest.skipIf('ImageInput' not in workspace.RegisteredOperators(), "Needs OpenCV") - def test_extract_simple(self): - from caffe2.python import brew - from caffe2.python.model_helper import ModelHelper, ExtractPredictorNet - - model = ModelHelper(name="test", arg_scope={'order': 'NCHW'}) - [data, label] = brew.image_input( - model, - "reader", ["xx/data", "label"], - is_test=1, - ) - cnv = brew.conv(model, data, 'cnv', 32, 32, 4) - a = brew.fc(model, cnv, 'a', 100, 200) - pred = brew.fc(model, a, 'pred', 200, 5) - brew.softmax(model, [pred, label], "softmax") - - (predict_net, export_blobs) = ExtractPredictorNet( - net_proto=model.net.Proto(), - input_blobs=["xx/data"], - output_blobs=["pred"], - renames={"xx/data": "image"}, - ) - export_blobs = set(export_blobs) - - ops = list(predict_net.Proto().op) - for op in ops: - self.assertFalse(op.type == "Softmax") - self.assertFalse("xx/data" in op.input) - - # Note: image input should not be included - 
self.assertEqual(ops[0].type, "Conv") - self.assertEqual(ops[1].type, "FC") - self.assertEqual(ops[2].type, "FC") - self.assertEqual(len(ops), 3) - - # test rename happened - self.assertEqual(ops[0].input[0], "image") - - # Check export blobs - self.assertTrue("image" not in export_blobs) - self.assertTrue("xx/data" not in export_blobs) - self.assertEqual(set([str(p) for p in model.params]), export_blobs) - - # Check external inputs/outputs - self.assertTrue("image" in predict_net.Proto().external_input) - self.assertEqual(set(["pred"]), set(predict_net.Proto().external_output)) - self.assertEqual( - set(predict_net.Proto().external_input) - - set([str(p) for p in model.params]), set(["image"]) - ) - - -class TestOperatorTraceback(test_util.TestCase): - def op_name_check(self, net, cf, line, func): - net.PopulateProtoWithFileName() - filename = getframeinfo(cf).filename - self.assertEqual(net.Proto().op[0].name, '{}:{}:{}'.format( - filename, line, func)) - - def test_operator_constructor_traceback(self): - net = core.Net("test") - a, b = net.AddExternalInput("a", "b") - net.Mul([a, b], "c"); cf = currentframe(); line = cf.f_lineno - func = cf.f_code.co_name - with self.assertRaises(Exception): - workspace.RunNetOnce(net) - with self.assertRaises(Exception): - workspace.CreateNet(net) - self.op_name_check(net, cf, line, func) - - def test_operator_runtime_traceback(self): - net = core.Net("test") - a = net.AddExternalInput("a") - workspace.blobs[a] = np.array([1, 2, 3], dtype=np.float32) - net.Split(a, ["b", "c"], axis=0); cf = currentframe(); line = cf.f_lineno - func = cf.f_code.co_name - with self.assertRaises(Exception): - workspace.RunNetOnce(net) - workspace.CreateNet(net) - with self.assertRaises(Exception): - workspace.RunNet(net) - self.op_name_check(net, cf, line, func) - - def test_c_workspace_constructor(self): - net = core.Net("test") - a, b = net.AddExternalInput("a", "b") - net.Mul([a, b], "c"); cf = currentframe(); line = cf.f_lineno - func = cf.f_code.co_name - ws = workspace.C.Workspace() - with self.assertRaises(Exception): - ws.run(net) - with self.assertRaises(Exception): - ws.create_net(net) - self.op_name_check(net, cf, line, func) - - def test_c_workspace_runtime(self): - net = core.Net("test") - a = net.AddExternalInput("a") - net.Split(a, ["b", "c"], axis=0); cf = currentframe(); line = cf.f_lineno - func = cf.f_code.co_name - ws = workspace.C.Workspace() - ws.create_blob(str(a)).feed(np.array([1, 2, 3], dtype=np.float32)) - ws.create_net(net) - with self.assertRaises(Exception): - ws.run(net) - self.op_name_check(net, cf, line, func) - - def test_async_exception_handling(self): - net = core.Net("test") - net.Proto().type = 'dag' # this runs operators on background threads - a = net.AddExternalInput("a") - net.Split(a, ["b", "c"], axis=0); cf = currentframe(); line = cf.f_lineno - func = cf.f_code.co_name - workspace.FeedBlob(a, np.array([1, 2, 3], dtype=np.float32)) - with self.assertRaises(Exception) as enforceNotMet: - workspace.RunNetOnce(net) - self.assertIn('enforce fail', str(enforceNotMet.exception)) - self.op_name_check(net, cf, line, func) - - -class TestCreatePlan(test_util.TestCase): - - def test_create_plan_from_proto_correctly(self): - from caffe2.python.net_builder import ops - with Node('trainer'), Task(name='my_task', num_instances=2) as task: - with ops.task_init(): - globl = ops.Const(0) - with ops.task_instance_init(): - local = ops.Const(0) - with ops.loop(100): - ops.Copy(globl, local) - with ops.task_instance_exit(): - ops.Add([globl, 
local], [globl]) - with ops.task_exit(): - ops.Mul([globl, globl], [globl]) - - plan = core.Plan(task.get_step()) - test_plan = core.Plan.create_from_proto(plan.Proto()) - - self.assertEqual(len(plan.Steps()), 1) - self.assertEqual(len(test_plan.Steps()), 1) - self.assertEqual(len(plan.Proto().network), 9) - self.assertEqual(len(test_plan.Proto().network), 9) - self.assertEqual(len(plan.Proto().execution_step), 1) - self.assertEqual(len(test_plan.Proto().execution_step), 1) - self.assertEqual(plan.Steps()[0].Name(), test_plan.Steps()[0].Name()) - self.assertEqual(len(plan.Nets()), len(test_plan.Nets())) - for idx in range(0, len(plan.Nets())): - # When we create a Net for test_plan, we will end up with a new Net - # name with a postfix. - net_1 = plan.Nets()[idx] - net_2 = test_plan.Nets()[idx] - trim_size = len(net_1.Name()) - self.assertEqual(net_1.Name(), net_2.Name()[:trim_size]) - - -class TestOpRegistryKey(test_util.TestCase): - def test_is_operator(self): - self.assertTrue(core.IsOperator('Relu')) - self.assertFalse(core.IsOperator('NOEXIST')) - - def test_is_operator_with_engine(self): - self.assertTrue(core.IsOperatorWithEngine('Relu', 'DEFAULT')) - self.assertFalse(core.IsOperatorWithEngine('Relu', 'NOEXIST')) - - -class TestDeviceOption(test_util.TestCase): - def test_check_equal_node_name(self): - opt1 = core.DeviceOption(0) - opt2 = core.DeviceOption(0) - self.assertTrue(core.device_option_equal(opt1, opt2)) - opt2.node_name = 'test' - self.assertTrue(core.device_option_equal(opt1, opt2)) - self.assertFalse(core.device_option_equal(opt1, opt2, ignore_node_name=False)) - opt1.node_name = 'test' - self.assertTrue(core.device_option_equal(opt1, opt2, ignore_node_name=False)) - - def test_check_equal_default_value(self): - opt1 = caffe2_pb2.DeviceOption() - opt2 = caffe2_pb2.DeviceOption() - opt1.device_type = 0 - self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_id = 5 - # opt1 is still on CPU, so the options should be equal - self.assertTrue(core.device_option_equal(opt1, opt2)) - opt2.device_type = 0 - self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_type = 1 - self.assertFalse(core.device_option_equal(opt1, opt2)) - - -class TestInferDeviceCpuOnly(test_util.TestCase): - def test_inject_copy(self): - ''' - Test injecting cross-device copies - this is a no-op on CPU-only devices. - ''' - send_node = 'node:0' - recv_node = 'node:1' - # Using placeholder ops for send/recv. Placeholder ops are - # decorator/fake ops that don't have operator schema. - placeholder_send = 'Placeholder:Dummy:Send' - placeholder_recv = 'Placeholder:Dummy:Recv' - - # init_net. - init_net = core.Net("init_net") - with core.DeviceScope(0, node_name=send_node): - init_net.XavierFill([], 'fc_w', shape=[10, 100]) - init_net.ConstantFill([], 'fc_b', shape=[10, ]) - - # train_net. - train_net = core.Net("train_net") - train_net.Proto().external_input.extend(['fc_w', 'fc_b']) - with core.DeviceScope(0, node_name=send_node): - op = core.CreateOperator( - placeholder_send, ["fc_w", 'fc_b'], [], - dst_node=recv_node) - train_net.Proto().op.extend([op]) - with core.DeviceScope(0, node_name=recv_node): - # Let's rename the recv blob i.e. fc_w -> fc_w_recv. - op = core.CreateOperator( - placeholder_recv, [], ['fc_w_recv', 'fc_b'], - src_node=send_node) - train_net.Proto().op.extend([op]) - train_net.FC(["data", 'fc_w_recv', 'fc_b'], "fc1") - - # Inject cross device copies.
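(Context for the injection call below: core.InjectCrossDeviceCopies rewrites a net so that every blob is consumed on the device where it lives, inserting CopyCPUToGPU/CopyGPUToCPU ops at device boundaries and returning the rewritten net plus a blob-to-device map that later calls can thread through. On a CPU-only net, as in this test, it should change nothing. A minimal GPU-flavored sketch, assuming a CUDA-enabled build:)

```python
from caffe2.python import core, workspace
from caffe2.proto import caffe2_pb2

gpu = caffe2_pb2.DeviceOption()
gpu.device_type = workspace.GpuDeviceType  # device_id defaults to 0

net = core.Net("example")
with core.DeviceScope(gpu):
    net.Relu("data", "relu1")  # "data" itself lives on CPU

new_net, blob_to_device = core.InjectCrossDeviceCopies(net)
# new_net should now start with a CopyCPUToGPU producing "data_gpu_0".
assert new_net.Proto().op[0].type == "CopyCPUToGPU"
```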
- init_net, x_dev_state = core.InjectCrossDeviceCopies( - init_net, - placeHolderOps=[placeholder_send, placeholder_recv]) - train_net, x_dev_state = core.InjectCrossDeviceCopies( - train_net, x_dev_state, - placeHolderOps=[placeholder_send, placeholder_recv]) - - # Verify: No Copy operators should be injected since it is CPU only. - op = train_net.Proto().op[0] - self.assertEqual(op.type, placeholder_send) - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.input[0], "fc_w") - self.assertEqual(op.input[1], "fc_b") - op = train_net.Proto().op[1] - self.assertEqual(op.type, placeholder_recv) - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.output[0], "fc_w_recv") - self.assertEqual(op.output[1], "fc_b") - op = train_net.Proto().op[2] - self.assertEqual(op.type, "FC") - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.input[1], "fc_w_recv") - self.assertEqual(op.input[2], "fc_b") - - -@unittest.skipIf(not workspace.has_gpu_support, 'No GPU support') -class TestInferDevice(test_util.TestCase): - - def setUp(self): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - self.gpu_option = device_option - self.cpu_option = caffe2_pb2.DeviceOption() - - def _test_op( - self, - op_name, - in_option, - out_option, - op_option=None, - inputs=None, - outputs=None - ): - op_option = self.gpu_option if not op_option else op_option - inputs = ["blob_1"] if not inputs else inputs - outputs = ["blob_2"] if not outputs else outputs - with core.DeviceScope(op_option): - op = core.CreateOperator(op_name, inputs, outputs) - input_dev, output_dev = core.InferOpBlobDevices(op) - if isinstance(in_option, list): - assert len(in_option) == len(input_dev), \ - 'Length of input device option should match ' \ - '{} vs. {}'.format(in_option, input_dev) - for in_dev, in_opt in zip(input_dev, in_option): - self.assertEqual(in_dev, in_opt) - else: - for in_dev in input_dev: - self.assertEqual(in_dev, in_option) - if isinstance(out_option, list): - assert len(out_option) == len(output_dev), \ - 'Length of output device option should match ' \ - '{} vs. 
{}'.format(out_option, output_dev) - for out_dev, out_opt in zip(output_dev, out_option): - self.assertEqual(out_dev, out_opt) - else: - for out_dev in output_dev: - self.assertEqual(out_dev, out_option) - - def test_infer_device(self): - self._test_op( - "FC", - self.gpu_option, - self.gpu_option, - op_option=self.gpu_option, - inputs=["data", "fc_w", "fc_b"], - outputs=["fc_1"] - ) - - def test_infer_device_split_by_lengths(self): - self._test_op( - "SplitByLengths", - [self.gpu_option, self.cpu_option], - self.gpu_option, - op_option=self.gpu_option, - inputs=["data", "fc_w"], - outputs=["fc_1"] - ) - - def test_infer_device_adam(self): - in_options = [self.gpu_option] * 6 - in_options[5] = self.cpu_option - out_options = [self.gpu_option] * 4 - self._test_op( - "Adam", - in_options, - out_options, - op_option=self.gpu_option, - inputs=["param", "moment_1", "moment_2", "grad", "lr", "iter"], - outputs=["output_param", "output_moment_1", "output_moment_2", - "output_grad"] - ) - - def test_infer_device_cross_device(self): - self._test_op("CopyGPUToCPU", self.gpu_option, self.cpu_option) - self._test_op("CopyCPUToGPU", self.cpu_option, self.gpu_option) - self._test_op("CopyFromCPUInput", self.cpu_option, self.gpu_option) - self._test_op( - "CopyFromCPUInput", - self.cpu_option, - self.cpu_option, - op_option=self.cpu_option - ) - - def test_device_inference_function(self): - # ConcatOp. - op_option = self.gpu_option - with core.DeviceScope(op_option): - op = core.CreateOperator( - 'Concat', - ['X_{}'.format(i) for i in range(4)], - ['concat_result', 'split_info'], - axis=1) - input_dev, output_dev = core.InferOpBlobDevices(op) - # 2nd output's type is CPU irrespective of Concat op's device option. - self.assertEqual(output_dev[1], self.cpu_option) - - # SplitOp. - op_option = self.gpu_option - with core.DeviceScope(op_option): - op = core.CreateOperator( - 'Split', - ['input', 'split'], - ['X_{}'.format(i) for i in range(4)], - axis=0) - input_dev, output_dev = core.InferOpBlobDevices(op) - # 2nd input's type is CPU irrespective of Split op's device option.
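(The helper above leans on core.InferOpBlobDevices, which consults each operator's registered device-inference function; that is also how the CPU-pinned auxiliary blobs checked here are detected. A distilled sketch of the Concat case:)

```python
from caffe2.python import core, workspace
from caffe2.proto import caffe2_pb2

gpu = caffe2_pb2.DeviceOption()
gpu.device_type = workspace.GpuDeviceType

with core.DeviceScope(gpu):
    op = core.CreateOperator("Concat", ["x0", "x1"], ["y", "split_info"], axis=1)

input_devs, output_devs = core.InferOpBlobDevices(op)
# split_info stays on CPU even though the Concat itself runs on GPU:
assert output_devs[1] == caffe2_pb2.DeviceOption()
```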
- self.assertEqual(input_dev[1], self.cpu_option) - - def test_inject_copy(self): - net = core.Net("test") - init_net = core.Net("init") - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) - bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) - - with core.DeviceScope(device_option): - net.FC(["data", weight, bias], "fc1") - - _, blob_to_device = core.InjectCrossDeviceCopies(init_net) - new_net, blob_to_device = core.InjectCrossDeviceCopies( - net, blob_to_device - ) - op = new_net._net.op[-1] - self.assertEqual(op.type, "FC") - self.assertEqual(op.input[0], "data_gpu_1") - self.assertEqual(op.input[1], "fc_w_gpu_1") - self.assertEqual(op.input[2], "fc_b_gpu_1") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") - self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") - self.assertNotEqual(blob_to_device["fc_w"], device_option) - - def test_cross_nets(self): - net = core.Net("test") - init_net = core.Net("init") - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) - bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) - const = init_net.ConstantFill([], 'const', shape=[], value=1.) - with core.DeviceScope(device_option): - const = init_net.Add([const, const], [const]) - fc_out = net.FC(["data", weight, bias], "fc1") - net.Add([fc_out, const], [fc_out]) - - data_remap = {'data': device_option} - nets, _ = core.InjectDeviceCopiesAmongNets( - [init_net, net], blob_to_device_init=data_remap - ) - op = nets[1]._net.op[0] - self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.output[0], "fc_w_gpu_1") - op = nets[1]._net.op[1] - self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.output[0], "fc_b_gpu_1") - op = nets[1]._net.op[2] - self.assertEqual(op.type, "FC") - self.assertEqual(op.input[0], "data") - self.assertEqual(op.input[1], "fc_w_gpu_1") - self.assertEqual(op.input[2], "fc_b_gpu_1") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - op = nets[1]._net.op[3] - self.assertEqual(op.type, "Add") - self.assertEqual(op.input[0], "fc1") - self.assertEqual(op.input[1], "const_gpu_1") - # check that moved blob is in input to the new net - for c in ["data", "fc_w", "fc_b", "const_gpu_1"]: - self.assertTrue(c in nets[1]._net.external_input) - """ -For reference, net.Proto() should be like: -name: "" -op { - input: "fc_w" - output: "fc_w_gpu_1" - name: "" - type: "CopyCPUToGPU" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "fc_b" - output: "fc_b_gpu_1" - name: "" - type: "CopyCPUToGPU" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "data" - input: "fc_w_gpu_1" - input: "fc_b_gpu_1" - output: "fc1" - name: "" - type: "FC" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "fc1" - input: "const_gpu_1" - output: "fc1" - name: "" - type: "Add" - device_option { - device_type: 1 - 
device_id: 1 - } -} -external_input: "data" -external_input: "fc_w" -external_input: "fc_b" -external_input: "const" -external_input: "const_gpu_1" -""" - - def test_cross_nets_no_change(self): - net = core.Net("test") - init_net = core.Net("init") - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - - with core.DeviceScope(device_option): - weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) - bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) - net.FC(["data", weight, bias], "fc1") - - data_remap = {'data': device_option} - nets = core.InjectDeviceCopiesAmongNetsWithoutB2D( - [init_net, net], blob_to_device_init=data_remap - ) - op = nets[1]._net.op[0] - self.assertEqual(op.type, "FC") - self.assertEqual(op.input[0], "data") - self.assertEqual(op.input[1], "fc_w") - self.assertEqual(op.input[2], "fc_b") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - """ -For reference, net.Proto() should be like: -name: "" -op { - input: "data" - input: "fc_w" - input: "fc_b" - output: "fc1" - name: "" - type: "FC" - device_option { - device_type: 1 - device_id: 1 - } -} -external_input: "data" -external_input: "fc_w" -external_input: "fc_b" -""" - - def test_inject_copy_multi_use(self): - net = core.Net("test") - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - - with core.DeviceScope(device_option): - net.Relu("data", "relu1") - net.Relu("data", "relu2") - with core.DeviceScope(device_option): - net.Relu("data", "relu3") - net.Relu("data", "relu4") - device_option.device_id = 0 - with core.DeviceScope(device_option): - net.Relu("data", "relu5") - device_option.device_id = 1 - with core.DeviceScope(device_option): - net.Relu("data", "relu6") - - new_net, _ = core.InjectCrossDeviceCopies(net) - op = new_net._net.op[0] - self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.output[0], "data_gpu_1") - op = new_net._net.op[1] - self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.output[0], "relu1") - op = new_net._net.op[2] - self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.output[0], "relu2") - op = new_net._net.op[3] - self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.input[0], "data_gpu_1") - self.assertEqual(op.output[0], "relu3") - op = new_net._net.op[4] - self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.output[0], "relu4") - op = new_net._net.op[5] - self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 0) - self.assertEqual(op.output[0], "data_gpu_0") - op = new_net._net.op[6] - self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 0) - self.assertEqual(op.input[0], "data_gpu_0") - self.assertEqual(op.output[0], "relu5") - op = new_net._net.op[7] - 
self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 1) - self.assertEqual(op.input[0], "data_gpu_1") - self.assertEqual(op.output[0], "relu6") - """ -For reference, net.Proto() should be like: -name: "" -op { - input: "data" - output: "data_gpu_1" - name: "" - type: "CopyCPUToGPU" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "data_gpu_1" - output: "relu1" - name: "" - type: "Relu" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "data" - output: "relu2" - name: "" - type: "Relu" -} -op { - input: "data_gpu_1" - output: "relu3" - name: "" - type: "Relu" - device_option { - device_type: 1 - device_id: 1 - } -} -op { - input: "data" - output: "relu4" - name: "" - type: "Relu" -} -op { - input: "data" - output: "data_gpu_0" - name: "" - type: "CopyCPUToGPU" - device_option { - device_type: 1 - device_id: 0 - } -} -op { - input: "data_gpu_0" - output: "relu5" - name: "" - type: "Relu" - device_option { - device_type: 1 - device_id: 0 - } -} -op { - input: "data_gpu_1" - output: "relu6" - name: "" - type: "Relu" - device_option { - device_type: 1 - device_id: 1 - } -} -external_input: "data" -""" - - def test_inject_copy_placeholder_ops(self): - ''' - Test inject cross device copies with placeholder ops. Placeholder ops - are decorator/fake ops that don't have operator schema. - ''' - # Create CPU and GPU devices on 2 nodes. - cpu_device = [] - gpu_device = [] - for i in range(0, 2): - cpu_device.append(caffe2_pb2.DeviceOption()) - cpu_device[i].node_name = 'node:' + str(i) - gpu_device.append(caffe2_pb2.DeviceOption()) - gpu_device[i].device_type = workspace.GpuDeviceType - gpu_device[i].device_id = 0 - gpu_device[i].node_name = 'node:' + str(i) - send_node = 'node:0' - recv_node = 'node:1' - placeholder_send = 'Placeholder:Dummy:Send' - placeholder_recv = 'Placeholder:Dummy:Recv' - - # init_net. - init_net = core.Net("init_net") - with core.DeviceScope(gpu_device[0]): - weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) - bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) - with core.DeviceScope(cpu_device[0]): - op = core.CreateOperator( - placeholder_send, [weight, bias], [], - dst_node=recv_node) - init_net._net.op.extend([op]) - - # train_net - train_net = core.Net("train_net") - with core.DeviceScope(cpu_device[1]): - # XXX. replace hardcoded op name. Move test to net_transforms. - op = core.CreateOperator( - placeholder_recv, [], [weight, bias], - src_node=send_node) - train_net._net.op.extend([op]) - train_net.FC(["data", weight, bias], "fc1") - - # Inject cross device copies. 
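(Why the placeHolderOps argument used below exists: the fake send/recv op types have no registered OperatorSchema, so device inference cannot introspect them; naming their types tells the injector to treat them as opaque and trust the surrounding device scopes instead of failing the schema lookup. A minimal sketch; the op type string is a made-up placeholder by construction:)

```python
from caffe2.python import core

net = core.Net("placeholder_demo")
op = core.CreateOperator("Placeholder:Dummy:Send", ["w"], [], dst_node="node:1")
net.Proto().op.extend([op])

# Without placeHolderOps, the injector would presumably trip over the
# missing schema for the fake op type.
fixed_net, blob_map = core.InjectCrossDeviceCopies(
    net, placeHolderOps=["Placeholder:Dummy:Send"]
)
```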
- init_net, x_dev_state = core.InjectCrossDeviceCopies( - init_net, - placeHolderOps=[placeholder_send, placeholder_recv]) - train_net, x_dev_state = core.InjectCrossDeviceCopies( - train_net, x_dev_state, - placeHolderOps=[placeholder_send, placeholder_recv]) - - # Verify (init_net) - op = init_net._net.op[2] - self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 0) - self.assertEqual(op.output[0], "fc_w_cpu") - op = init_net._net.op[3] - self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(op.device_option.device_id, 0) - self.assertEqual(op.output[0], "fc_b_cpu") - op = init_net._net.op[4] - self.assertEqual(op.type, placeholder_send) - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.input[0], "fc_w_cpu") - self.assertEqual(op.input[1], "fc_b_cpu") - # Verify (train_net) - op = train_net._net.op[0] - self.assertEqual(op.type, placeholder_recv) - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.output[0], "fc_w_cpu") - self.assertEqual(op.output[1], "fc_b_cpu") - op = train_net._net.op[3] - self.assertEqual(op.type, "FC") - self.assertEqual(op.device_option.device_type, 0) - self.assertEqual(op.input[1], "fc_w_cpu") - self.assertEqual(op.input[2], "fc_b_cpu") - - def test_blob_inplace(self): - net = core.Net("test") - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = 1 - - net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) - with core.DeviceScope(device_option): - net.Relu("param", "param_relu_no_sense") - net, _ = core.InjectCrossDeviceCopies(net) - op = net._net.op[1] - self.assertEqual(op.type, 'CopyCPUToGPU') - self.assertEqual(op.input[0], 'param') - self.assertEqual(op.output[0], 'param_gpu_1') - op = net._net.op[2] - self.assertEqual(op.input[0], 'param_gpu_1') - - net.Relu('nonsense_input', 'moment') - # should not raise inplace error - core.InjectCrossDeviceCopies(net) - with core.DeviceScope(device_option): - net.Relu('nonsense_input_gpu', 'moment') - with self.assertRaises(RuntimeError): - core.InjectCrossDeviceCopies(net) - - -class TestRerouteTensor(test_util.TestCase): - def test_reroute_tensor(self): - net = core.Net("reroute_tensor") - net.Conv(["input", "w", "b"], "conv1") - net.Relu(["conv1"], "conv1_relu") - new_op = core.CreateOperator("SpatialBN", - ["conv1", "scale", "bias", "mean", "var"], - ["conv1_bn", "mean", "var", "saved_mean", "saved_var"]) - # insert bn between conv and relu - net.reroute_tensor("conv1", new_op, [net.Proto().op[1]]) - self.assertEqual(new_op, net.Proto().op[1], "insertion failed") - self.assertEqual(net.Proto().op[2].input[0], "conv1_bn", "reroute failed") - - -class TestRunAllOnGPU(test_util.TestCase): - def test_rnn_run_on_gpu(self): - step_net = core.Net("step_net") - step_net.Conv(["input_1", "w", "b"], "conv1") - step_net.Relu(["conv1"], "input_1") - net = core.Net("to_run_on_gpu") - net.RecurrentNetwork(["input_1"], ["input_1"], step_net=step_net.Proto()) - net.Relu(["input_1"], "input_relu") - # check network structure before conversion - net_proto = net.Proto() - self.assertFalse(net_proto.HasField('device_option')) - self.assertTrue(net_proto.op[0].arg[0].name == 'step_net') - self.assertTrue(net_proto.op[0].arg[0].HasField('n')) - self.assertFalse(net_proto.op[0].arg[0].n.HasField('device_option')) - - 
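(What the RunAllOnGPU call below is expected to do: mutate the net proto in place, setting a net-level device_option and recursing into nets embedded in operator arguments, such as the RNN step net. A sketch of the observable effect, assuming a CUDA-enabled build:)

```python
from caffe2.python import core, workspace

net = core.Net("gpu_demo")
net.Relu(["x"], "y")
net.RunAllOnGPU(gpu_id=2, use_cudnn=True)

proto = net.Proto()
assert proto.device_option.device_type == workspace.GpuDeviceType
assert proto.device_option.device_id == 2
```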
net.RunAllOnGPU(gpu_id=3, use_cudnn=True) - # check that root net and rnn net got device_option attribute assigned - self.assertTrue(net_proto.HasField('device_option')) - self.assertEqual(net_proto.device_option.device_type, workspace.GpuDeviceType) - self.assertEqual(net_proto.device_option.device_id, 3) - self.assertTrue(net_proto.op[0].arg[0].n.HasField('device_option')) - - -class TestConstructionFromProto(test_util.TestCase): - def test_inplace_construction(self): - # just create some random net - n = core.Net('original') - a1 = n.AddExternalInput('a1') - a2 = n.AddExternalInput('a2') - b1, b2 = n.Concat([a1, a2], ['b1', 'b2'], axis=0) - c1 = n.Sum([b1, b1], ['c1']) - c2 = n.Sum([b2], ['c2']) - d = n.Sum([c1, c2], ['d']) - - proto = n.Proto() - n_copied = core.Net(proto) - n_moved = core.Net(proto, inplace=True) - self.assertTrue(n_moved.Proto() is proto) - self.assertTrue(n_copied.Proto() is not proto) - - proto.external_input.extend(['foo']) - self.assertEqual(len(n_moved.Proto().external_input), len(proto.external_input)) - self.assertEqual(len(n_copied.Proto().external_input), len(proto.external_input) - 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/crf.py b/caffe2/python/crf.py deleted file mode 100644 index e6c36a3c571e..000000000000 --- a/caffe2/python/crf.py +++ /dev/null @@ -1,313 +0,0 @@ -## @package crf -# Module caffe2.python.crf - - -import numpy as np -from caffe2.python import brew, core, model_helper, recurrent - - -""" -Due to a limitation in RecurrentNetworkOp, this layer only supports batch_size=1. -In order to support batch_size > 1, we will have to implement the CRFUnit -and its gradient in C++ and handle the different batches there. -""" - - -class CRFWithLoss: - def __init__(self, model, num_classes, transitions_blob=None): - self.model = model - self.num_classes = num_classes - self.num_classes_padded = num_classes + 2 # After adding BOS and EOS - if not transitions_blob: - transitions_blob = self.model.param_init_net.UniformFill( - [], - [core.ScopedBlobReference("crf_transitions")], - shape=[self.num_classes_padded, self.num_classes_padded], - min=-1.0, - max=1.0, - ) - self.transitions = transitions_blob - self.model.params.append(self.transitions) - - def crf_loss(self, predictions, labels, seq_lengths=None): - # Since the transitions matrix is a shared parameter, need to - # take a snapshot of it at the beginning since it can be updated - # in between the operators that use it when doing parallel updates - transitions_snapshot = self.model.net.Copy( - self.transitions, core.ScopedBlobReference("transitions_snapshot") - ) - # Compute best path unary score from the logits - path_unary_score = self._gather_entries_sum( - predictions, labels, self.num_classes - ) - # Append BOS and EOS entries to the predictions and labels - predictions = CRFWithLoss.pad_predictions( - predictions, self.model.param_init_net, self.model.net, self.num_classes - ) - labels = CRFWithLoss.pad_labels( - labels, self.model.param_init_net, self.model.net, self.num_classes - ) - # Compute best path binary scores from the transitions matrix - path_binary_score = self._path_binary_scores( - labels, transitions_snapshot, seq_lengths - ) - path_total_score = self.model.net.Add( - [path_binary_score, path_unary_score], - core.ScopedBlobReference("path_total"), - ) - # Compute the total score over all paths - zero_index = self.model.param_init_net.ConstantFill([], shape=[1], value=0) - initial_state = self.model.net.Gather( - [predictions, zero_index],
core.ScopedBlobReference("rnn_initial"), - dense_gradient=True, - ) - input_data, _ = self.model.net.RemovePadding( - [predictions], padding_width=1, end_padding_width=0, outputs=2 - ) - input_data = self.model.net.ExpandDims( - [input_data], core.ScopedBlobReference("rnn_input_data"), dims=[1] - ) - # Due to a bug in RecurrentNetworkGradientOp, we need to copy the - # transitions blob before sending it to the recurrent network - transitions_copy = self.model.net.Copy( - transitions_snapshot, core.ScopedBlobReference("transitions_copy") - ) - all_paths_scores = self._crf_forward( - input_data, initial_state, transitions_copy - ) - loss = self.model.net.Sub( - [all_paths_scores, path_total_score], core.ScopedBlobReference("crf_loss") - ) - return loss - - def _path_binary_scores(self, labels, transitions, seq_lengths=None): - column_ids, _ = self.model.net.RemovePadding( - [labels], outputs=2, padding_width=1, end_padding_width=0 - ) - row_ids, _ = self.model.net.RemovePadding( - [labels], outputs=2, padding_width=0, end_padding_width=1 - ) - # Since there is no multi-dimensional gather, I flatten the matrix to - # a 1-d vector and transform the ids to (row_ids * num_columns + - # column_ids) and do gather in 1-d - num_columns_blob = self.model.net.ConstantFill( - [row_ids], value=self.num_classes_padded - ) - flattened_ids = self.model.net.Mul([row_ids, num_columns_blob]) - flattened_ids = self.model.net.Add([flattened_ids, column_ids]) - flattened_transitions = self.model.net.FlattenToVec([transitions]) - entries = self.model.net.Gather( - [flattened_transitions, flattened_ids], dense_gradient=True - ) - return self.model.ReduceFrontSum(entries) - - def _gather_entries_sum(self, in_data, indices, index_size): - indices = self.model.net.Cast([indices], to="int64") - index_size_blob = self.model.param_init_net.ConstantFill( - [], shape=[1], value=index_size - ) - query_one_hot = self.model.net.OneHot([indices, index_size_blob]) - flattend_query = self.model.net.FlattenToVec(query_one_hot) - flattend_data = self.model.net.FlattenToVec(in_data) - query_scores = self.model.net.DotProduct([flattend_query, flattend_data]) - final_sum = self.model.net.ReduceFrontSum([query_scores]) - return final_sum - - def _crf_forward( - self, input_blob, initial_state, transitions_copy, seq_lengths=None - ): - # Build the RNN net and get the last timestep output - out_last = self.build_crf_net(input_blob, initial_state, transitions_copy) - out_last, _ = self.model.net.Reshape( - [out_last], outputs=2, shape=(self.num_classes_padded,) - ) - zero_segment_id = self.model.param_init_net.ConstantFill( - [], value=0, shape=[self.num_classes_padded], dtype=core.DataType.INT32 - ) - - # Compute the accumulated total score of all the paths - accum_score = self.model.net.SortedSegmentRangeLogSumExp( - [out_last, zero_segment_id] - ) - accum_score, _ = self.model.net.Reshape(accum_score, outputs=2, shape=()) - return accum_score - - def build_crf_net(self, input_blob, initial_state, transitions): - """ - Adds the crf_net recurrent operator to the model. - - model: model_helper.ModelHelper object new operators would be added - to - - input_blob: the input sequence in a format T x N x D - where T is sequence size, N - batch size and D - input dimension - ##Only supports batch-size 1## - - seq_lengths: blob containing sequence lengths (unused) - """ - - scope = "crf_net" - - def s(name): - "" - # We have to manually scope due to our internal/external blob - # relationships. 
- return "{}/{}".format(str(scope), str(name)) - - step_model = model_helper.ModelHelper(name="crf_step", param_model=self.model) - input_t, cell_t_prev, _ = step_model.net.AddExternalInputs( - core.ScopedBlobReference("input_t"), - core.ScopedBlobReference("cell_t_prev"), - transitions, - ) - zero_segment_id = step_model.param_init_net.ConstantFill( - [], - [s("zero_segment_id")], - value=0, - shape=[self.num_classes_padded], - dtype=core.DataType.INT32, - ) - - # A hack to bypass model cloning for test - step_model.param_init_net.AddExternalOutput(zero_segment_id) - """ the CRF step """ - # Do tile - prev_transpose = brew.transpose( - step_model, cell_t_prev, [s("prev_transpose")], axes=(0, 2, 1) - ) - prev_tiled = step_model.net.Tile( - prev_transpose, [s("prev_tiled")], tiles=self.num_classes_padded, axis=2 - ) - input_t_tiled = step_model.net.Tile( - input_t, [s("input_t_tiled")], tiles=self.num_classes_padded, axis=1 - ) - input_with_prev = step_model.net.Add( - [prev_tiled, input_t_tiled], [s("input_with_prev")] - ) - all_with_transitions = step_model.net.Add( - [input_with_prev, transitions], - [s("prev_with_transitions")], - broadcast=1, - use_grad_hack=1, - ) - all_with_transitions_reshaped, _ = step_model.net.Reshape( - all_with_transitions, - [s("all_with_transitions_reshaped"), s("all_with_transitions_orig")], - shape=(self.num_classes_padded, self.num_classes_padded), - ) - cell_t = step_model.net.SortedSegmentRangeLogSumExp( - [all_with_transitions_reshaped, zero_segment_id], [s("cell_t")] - ) - step_model.net.AddExternalOutputs(cell_t) - """ recurrent network """ - cell_input_blob = initial_state - out_all, out_last = recurrent.recurrent_net( - net=self.model.net, - cell_net=step_model.net, - inputs=[(input_t, input_blob)], - initial_cell_inputs=[(cell_t_prev, cell_input_blob)], - links={cell_t_prev: cell_t}, - scope=scope, - outputs_with_grads=(1,), - ) - return out_last - - def update_predictions(self, classes): - def crf_update_predictions_op(inputs, outputs): - # This operator will compute the best path of classes by performing - # Viterbi decoding and then updates the predictions to make the tag - # On the best path has the highest score among the others - predictions = inputs[0].data - transitions = inputs[1].data - predictions = inputs[0].data - predictions_shape = inputs[0].shape - outputs[0].reshape(predictions_shape) - - trellis = np.zeros(predictions_shape) - backpointers = np.zeros(predictions_shape, dtype=np.int32) - trellis[0] = predictions[0] - - for t in range(1, predictions_shape[0]): - v = np.expand_dims(trellis[t - 1], 1) + transitions - trellis[t] = predictions[t] + np.max(v, 0) - backpointers[t] = np.argmax(v, 0) - - viterbi = [np.argmax(trellis[-1])] - for bp in reversed(backpointers[1:]): - viterbi.append(bp[viterbi[-1]]) - viterbi.reverse() - - new_predictions = np.zeros(predictions_shape) - old_bests = [] - for i, w_predictions in enumerate(predictions): - # Get the current tag with the maximum score - new_predictions[i] = predictions[i] - old_best = np.argmax(w_predictions) - old_bests.append(old_best) - # Swap the scores of the current best tag and the tag on the - # Viterbi path - w_predictions[viterbi[i]], w_predictions[old_best] = ( - w_predictions[old_best], - w_predictions[viterbi[i]], - ) - new_predictions[i] = w_predictions - # Remove the BOS and EOS entries from the predictions matrix - orig_predictions = new_predictions[1:-1, 0:-2] - outputs[0].reshape(orig_predictions.shape) - outputs[0].data[...] 
= orig_predictions - - padded_classes = CRFWithLoss.pad_predictions( - classes, self.model.param_init_net, self.model.net, self.num_classes - ) - new_classes = self.model.net.Python(crf_update_predictions_op)( - [padded_classes, self.transitions], - core.ScopedBlobReference("post_crf_classes"), - ) - return new_classes - - @staticmethod - def pad_labels(labels, init_net, net, num_classes): - bos_i = num_classes - eos_i = num_classes + 1 - bos_i_b = init_net.ConstantFill([], shape=[1], value=bos_i) - eos_i_b = init_net.ConstantFill([], shape=[1], value=eos_i) - labels = net.Cast([labels], to="int64") - padded_labels, _ = net.Concat([bos_i_b, labels, eos_i_b], axis=0, outputs=2) - return padded_labels - - @staticmethod - def pad_predictions(predictions, init_net, net, num_classes): - # This function will introduce two labels for beginning of sequence - # and end of sequence; it will make the necessary updates to - # the predictions blob - - low_score = -1000.0 # An arbitrary very low number - b_scores = np.array([[low_score] * num_classes + [0, low_score]]).astype( - np.float32 - ) - - e_scores = np.array([[low_score] * num_classes + [low_score, 0]]).astype( - np.float32 - ) - - b_scores = init_net.GivenTensorFill( - [], "b_scores", shape=[1, num_classes + 2], values=b_scores - ) - e_scores = init_net.GivenTensorFill( - [], "e_scores", shape=[1, num_classes + 2], values=e_scores - ) - - zero_index = net.ConstantFill([], shape=[1], value=0) - length = net.Gather([net.Shape([predictions]), zero_index]) - length = net.Cast(length, to="int32") - t_range = net.LengthsRangeFill(length) - padding = net.ConstantFill([t_range], value=low_score) - padding = net.ExpandDims(padding, dims=[1]) - padded_predictions, _ = net.Concat( - [predictions, padding, padding], outputs=2, axis=1 - ) - padded_predictions_concat, _ = net.Concat( - [b_scores, padded_predictions, e_scores], outputs=2, axis=0 - ) - return padded_predictions_concat diff --git a/caffe2/python/crf_predict.py b/caffe2/python/crf_predict.py deleted file mode 100644 index 9bc0372c50c0..000000000000 --- a/caffe2/python/crf_predict.py +++ /dev/null @@ -1,33 +0,0 @@ - - -import numpy as np -from caffe2.python.crf import CRFWithLoss - - -def crf_update_predictions(model, crf_with_loss, classes): - return apply_crf( - model.param_init_net, - model.net, - crf_with_loss.transitions, - classes, - crf_with_loss.num_classes, - ) - - -def apply_crf(init_net, net, transitions, predictions, num_classes): - padded_classes = CRFWithLoss.pad_predictions( - predictions, init_net, net, num_classes - ) - bestPath = net.ViterbiPath([padded_classes, transitions]) - new_padded_classes = net.SwapBestPath([padded_classes, bestPath]) - # Revert the effect of pad_predictions by removing the last two rows and - # the last two columns - new_classes = net.RemovePadding( - [new_padded_classes], padding_width=1, end_padding_width=1 - ) - slice_starts = np.array([0, 0]).astype(np.int32) - slice_ends = np.array([-1, -3]).astype(np.int32) - slice_starts = net.GivenTensorIntFill([], shape=[2], values=slice_starts) - slice_ends = net.GivenTensorIntFill([], shape=[2], values=slice_ends) - new_classes = net.Slice([new_classes, slice_starts, slice_ends]) - return new_classes diff --git a/caffe2/python/crf_viterbi_test.py b/caffe2/python/crf_viterbi_test.py deleted file mode 100644 index 052bbbf4e6bf..000000000000 --- a/caffe2/python/crf_viterbi_test.py +++ /dev/null @@ -1,46 +0,0 @@ - - - - -from caffe2.python import workspace, crf - -from caffe2.python.cnn import CNNModelHelper
-from caffe2.python.crf_predict import crf_update_predictions -from caffe2.python.test_util import TestCase -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np - - -class TestCrfDecode(TestCase): - - @given(num_tags=st.integers(2, 4), num_words=st.integers(2, 15)) - @settings(deadline=2000) - def test_crf_viterbi(self, num_tags, num_words): - model = CNNModelHelper(name='external') - predictions = np.random.randn(num_words, num_tags).astype(np.float32) - transitions = np.random.uniform( - low=-1, high=1, size=(num_tags + 2, num_tags + 2) - ).astype(np.float32) - predictions_blob, transitions_blob = ( - model.net.AddExternalInputs('predictions', 'crf_transitions') - ) - workspace.FeedBlob(str(transitions_blob), transitions) - workspace.FeedBlob(str(predictions_blob), predictions) - crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob) - - updated_predictions = crf_update_predictions( - model, crf_layer, predictions_blob - ) - ref_predictions = crf_layer.update_predictions(predictions_blob) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - updated_predictions = workspace.FetchBlob(str(updated_predictions)) - ref_predictions = workspace.FetchBlob(str(ref_predictions)) - np.testing.assert_allclose( - updated_predictions, - ref_predictions, - atol=1e-4, rtol=1e-4, err_msg='Mismatch in CRF predictions' - ) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py deleted file mode 100644 index 0dfe4de0ea91..000000000000 --- a/caffe2/python/data_parallel_model.py +++ /dev/null @@ -1,2221 +0,0 @@ -## @package data_parallel_model -# Module caffe2.python.data_parallel_model - - - - -from collections import OrderedDict -import logging -import copy - -from multiprocessing import cpu_count - -from caffe2.python import \ - model_helper, dyndep, scope, workspace, core, memonger, utils -from caffe2.proto import caffe2_pb2 - -import numpy as np -import warnings - -dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops") - -# We only import nccl operators when the machine has GPUs -# Otherwise the binary can be compiled with CPU-only mode, and -# will not be able to find those modules -if workspace.NumGpuDevices() > 0: - dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops") - dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops_gpu") - -log = logging.getLogger("data_parallel_model") -log.setLevel(logging.INFO) - -_DEFAULT_TIMEOUT_SEC = 30 -_DEFAULT_BARRIER_NET_TIMEOUT_SEC = 300 - - -def Parallelize_GPU(*args, **kwargs): - kwargs['cpu_device'] = False - Parallelize(*args, **kwargs) - - -def Parallelize_CPU(*args, **kwargs): - kwargs['cpu_device'] = True - Parallelize(*args, **kwargs) - -def Parallelize_iDeep(*args, **kwargs): - kwargs['ideep'] = True - Parallelize(*args, **kwargs) - -def Parallelize( - model_helper_obj, - input_builder_fun, - forward_pass_builder_fun, - param_update_builder_fun=None, - optimizer_builder_fun=None, - post_sync_builder_fun=None, - pre_grad_net_transformer_fun=None, - net_transformer_fun=None, - devices=None, - rendezvous=None, - net_type='dag', - broadcast_computed_params=True, - optimize_gradient_memory=False, - dynamic_memory_management=False, - blobs_to_keep=None, - use_nccl=False, - max_concurrent_distributed_ops=16, - cpu_device=False, - ideep=False, - num_threads_per_device=4, - shared_model=False, - combine_spatial_bn=False, - barrier_net_timeout_sec=_DEFAULT_BARRIER_NET_TIMEOUT_SEC, -): - ''' - Function to create a model that can 
run on many GPUs or CPUs. - model_helper_obj: an object of ModelHelper - input_builder_fun: - Function that adds the input operators - Note: Remember to instantiate reader outside of this - function so all devices share same reader object. - Signature: input_builder_fun(model) - forward_pass_builder_fun: - Function to add the operators to the model. - Must return list of loss-blob references that - are used to build the gradient. Loss scale parameter - is passed, as you should scale the loss of your model - by 1.0 / the total number of devices. - Signature: forward_pass_builder_fun(model, loss_scale) - param_update_builder_fun: - Function that adds operators that are run after - gradient update, such as updating the weights and - weight decaying. This is called for each GPU separately. - Signature: param_update_builder_fun(model) - optimizer_builder_fun: - Alternative to param_update_builder_fun, allows one - to add an optimizer for the whole model. Called only - once, without name or devicescope. - net_transformer_fun: - Optional function to transform the network after the - network is built. It will be called once (NOT once per - GPU.) - Signature: - net_transformer_fun( - model, num_devices, device_prefix, device_type) - pre_grad_net_transformer_fun: - Optional function to transform the network similar to - net_transformer_fun, but runs before gradient ops - have been added. - Signature: pre_grad_net_transformer_fun(model) - post_sync_builder_fun: - Function applied after initial parameter sync has been - completed, such as keeping multi-precision parameters - in sync. - Signature: post_sync_builder_fun(model) - devices: List of GPU ids, such as [0, 1, 2, 3], - rendezvous: used for rendezvous in distributed computation, if None - then only one node is used. To create rendezvous, - use . - net_type: Network type - optimize_gradient_memory: whether to apply 'memonger' to share blobs - shared_model: (only for CPU) uses the same parameters on each device - in gradient computation to reduce memory footprint. - dynamic_memory_management: Whether to apply dynamic memory optimization - by freeing unused blobs. The underlying (de)allocation - uses cached allocator. For GPU training PLEASE MAKE SURE - caffe2_cuda_memory_pool is set. - blobs_to_keep : A list of blob names to keep and don't free during - dynamic memory optimization (for example loss blob). - cpu_device Use CPU instead of GPU. - ideep Use ideep. - combine_spatial_bn: - When set to True, applies batch normalization across - all devices within the node. If False, batch - normalization will be done separately for each device. - This option is currently only supported on the CPU. - barrier_net_timeout_sec: - The timeout in seconds of the barrier net, which is run - to synchronize shards before a training epoch starts. - Defaults to 300 seconds.
- ''' - assert scope.CurrentDeviceScope() is None \ - or scope.CurrentDeviceScope().device_type == caffe2_pb2.CPU, \ - "Parallelize must be called without device-scope, \ - device scope was: {}".format(scope.CurrentDeviceScope()) - - if devices is None: - if not (cpu_device or ideep): - devices = list(range(0, workspace.NumCudaDevices())) - else: - devices = list(range(0, cpu_count())) - - if not (cpu_device or ideep): - for gpu in devices: - if gpu >= workspace.NumGpuDevices(): - log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumGpuDevices(), devices)) - break - model_helper_obj._device_type = workspace.GpuDeviceType - model_helper_obj._device_prefix = "gpu" - model_helper_obj._shared_model = False - device_name = "GPU" - assert shared_model is False, "Shared model only supported on CPU" - elif ideep: - model_helper_obj._device_type = caffe2_pb2.IDEEP - model_helper_obj._device_prefix = "ideep" - device_name = "IDEEP" - model_helper_obj._shared_model = shared_model - if shared_model and rendezvous is not None: - assert False, "Shared model only supported on single-node currently" - else: - model_helper_obj._device_type = caffe2_pb2.CPU - model_helper_obj._device_prefix = "cpu" - device_name = "CPU" - model_helper_obj._shared_model = shared_model - if shared_model and rendezvous is not None: - assert False, "Shared model only supported on single-node currently" - - log.info("Parallelizing model for devices: {}".format(devices)) - extra_workers = 8 if rendezvous is not None else 0 # best-guess - num_workers = len(devices) * num_threads_per_device + extra_workers - max_concurrent_distributed_ops =\ - min(max_concurrent_distributed_ops, num_workers - 1) - model_helper_obj.net.Proto().num_workers = num_workers - model_helper_obj.net.Proto().type = net_type - - # Store some information in the model -- a bit ugly - model_helper_obj._devices = devices - model_helper_obj._rendezvous = rendezvous - model_helper_obj._sync_barrier_net = None - - model_helper_obj._broadcast_context = None - model_helper_obj._grad_names = [] - - assert isinstance(model_helper_obj, model_helper.ModelHelper) - - # Keep track of params that were in the model before: they are not - data parallel, so we need to handle them separately - non_datapar_params = copy.copy(model_helper_obj.params) - - # Add input and model - log.info("Create input and model training operators") - - losses_by_gpu = {} - num_shards = 1 if rendezvous is None else rendezvous['num_shards'] - loss_scale = 1.0 / (len(devices) * num_shards) - - has_parameter_updates = param_update_builder_fun is not None or \ - optimizer_builder_fun is not None - assert not ( - param_update_builder_fun is not None and - optimizer_builder_fun is not None - ), 'Can only specify one of param_update_builder_fun, optimizer_builder_fun' - - # Check that a model that is used for validation/testing has - # init_params False, otherwise running the param init net will overwrite - # synchronized values by the training net - if not has_parameter_updates and model_helper_obj.init_params: - log.warning('') - log.warning("############# WARNING #############") - log.warning("Model {}/{} is used for testing/validation but".format( - model_helper_obj.name, model_helper_obj)) - log.warning("has init_params=True!") - log.warning("This can conflict with model training.") - log.warning("Please ensure model = ModelHelper(init_params=False)") - log.warning('####################################') - log.warning('') - # TODO: make into assert - - for device in devices: -
device_opt = core.DeviceOption(model_helper_obj._device_type, device) - with core.DeviceScope(device_opt): - with core.NameScope("{}_{}".format(model_helper_obj._device_prefix, - device)): - log.info("Model for {} : {}".format(device_name, device)) - input_builder_fun(model_helper_obj) - losses = forward_pass_builder_fun(model_helper_obj, loss_scale) - # Losses are not needed for test net - if has_parameter_updates: - assert isinstance(losses, list), \ - 'Model builder function must return list of loss blobs' - for loss in losses: - assert isinstance(loss, core.BlobReference), \ - 'Model builder func must return list of loss blobs' - - losses_by_gpu[device] = losses - _ValidateParams(model_helper_obj.params) - - # Create parameter map - model_helper_obj._device_grouped_blobs =\ - _GroupByDevice(model_helper_obj, devices, - model_helper_obj.params, non_datapar_params) - - # computed params - computed_params_grouped =\ - _GroupByDevice(model_helper_obj, devices, - model_helper_obj.GetComputedParams(''), []) - model_helper_obj._device_grouped_blobs.update(computed_params_grouped) - - model_helper_obj._param_names =\ - list(model_helper_obj._device_grouped_blobs.keys()) - model_helper_obj._computed_param_names =\ - list(computed_params_grouped.keys()) - - if pre_grad_net_transformer_fun: - pre_grad_net_transformer_fun(model_helper_obj) - - if has_parameter_updates: - log.info("Adding gradient operators") - _AddGradientOperators(devices, model_helper_obj, losses_by_gpu) - - if net_transformer_fun: - net_transformer_fun( - model_helper_obj, - len(devices), - model_helper_obj._device_prefix, - model_helper_obj._device_type) - - if not has_parameter_updates: - log.info("Parameter update function not defined --> only forward") - _InferBlobDevice(model_helper_obj) - return - - if combine_spatial_bn: - assert(has_parameter_updates), \ - 'combine_spatial_bn should only be used for train model' - _InterleaveOps(model_helper_obj) - if cpu_device: - _CPUInterDeviceBatchNormalization(model_helper_obj) - else: - _GPUInterDeviceBatchNormalization(model_helper_obj) - - _ValidateParams(model_helper_obj.params) - - # Group gradients by device and register to blob lookup - param_to_grad = model_helper_obj.param_to_grad - grads_ordered = [param_to_grad[p] for p in - model_helper_obj.params if p in param_to_grad] - non_datapar_grads = [param_to_grad[p] for p in non_datapar_params] - - gradients_grouped = _GroupByDevice( - model_helper_obj, - devices, - grads_ordered, - non_datapar_grads - ) - model_helper_obj._device_grouped_blobs.update(gradients_grouped) - model_helper_obj._grad_names = list(gradients_grouped.keys()) - model_helper_obj._losses_by_gpu = losses_by_gpu - - _InferBlobDevice(model_helper_obj) - - log.info("Add gradient all-reduces for SyncSGD") - if broadcast_computed_params: - _BroadcastComputedParams(devices, model_helper_obj, rendezvous, use_nccl) - - if len(model_helper_obj._grad_names) > 0: - # Gradients in reverse order - reverse_ordered_grads = _GetReverseOrderedGrads(model_helper_obj) - assert(len(reverse_ordered_grads) > 0) - _AllReduceBlobs( - reverse_ordered_grads, - devices, - model_helper_obj, - model_helper_obj.net, - rendezvous, - use_nccl, - max_concurrent_distributed_ops, - ) - else: - log.info("NOTE: Param builder function did not create any parameters.") - - log.info("Post-iteration operators for updating params") - num_shards = 1 if rendezvous is None else rendezvous['num_shards'] - - all_params = set(model_helper_obj.GetParams('')) - if shared_model: - 
_PruneParametersForSharing(model_helper_obj) - - if param_update_builder_fun is not None: - for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) - with core.DeviceScope(device_opt): - with core.NameScope( - "{}_{}".format(model_helper_obj._device_prefix, device) - ): - param_update_builder_fun(model_helper_obj) - else: - log.info("Calling optimizer builder function") - optimizer = optimizer_builder_fun(model_helper_obj) - model_helper_obj._optimizer = optimizer - - (sync_blobs, sync_names) = _ComputeBlobsToSync(model_helper_obj) - sync_blobs_grouped = _GroupByDevice( - model_helper_obj, - devices, - sync_blobs, - [], - ) - model_helper_obj._device_grouped_blobs.update(sync_blobs_grouped) - - _InferBlobDevice(model_helper_obj) - _AnalyzeOperators(model_helper_obj) - - # Configure dagnet to run with only one worker on the first iteration, - # to prevent concurrency problems with allocs and nccl. - arg = model_helper_obj.Proto().arg.add() - arg.name = "first_iter_only_one_worker" - arg.i = 1 - - # Add initial parameter syncs - log.info("Add initial parameter sync") - _SyncAllParams( - devices, - model_helper_obj, - model_helper_obj.param_init_net, - model_helper_obj.param_init_net, - rendezvous, - sync_names, - max_concurrent_distributed_ops=1 - ) - - # Handle any operations that need to be done after parameter sync - # i.e. making sure multi-precision copies of parameters are up-to-date - if post_sync_builder_fun is not None: - for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) - with core.DeviceScope(device_opt): - with core.NameScope( - "{}_{}".format(model_helper_obj._device_prefix, device) - ): - post_sync_builder_fun(model_helper_obj) - - assert not (optimize_gradient_memory and dynamic_memory_management), \ - """It is not advised to use gradient optimization ('memonger') - with dynamic memory management.""" - - if optimize_gradient_memory: - _OptimizeGradientMemorySimple(model_helper_obj, losses_by_gpu, devices) - - if dynamic_memory_management: - _AddDynamicMemoryOptimization(model_helper_obj, blobs_to_keep, devices) - - - model_helper_obj._data_parallel_model_init_nets = [ - model_helper_obj.param_init_net, - ] - - model_helper_obj._data_parallel_model_nets = [ - model_helper_obj.net - ] - _AddBarrierToModelNets(model_helper_obj, barrier_net_timeout_sec) - - if shared_model: - _RemapParameterBlobsForSharedModel(model_helper_obj, all_params) - - -def Parallelize_GPU_BMUF(*args, **kwargs): - kwargs['cpu_device'] = False - Parallelize_BMUF(*args, **kwargs) - - -def Parallelize_CPU_BMUF(*args, **kwargs): - kwargs['cpu_device'] = True - Parallelize_BMUF(*args, **kwargs) - - -def Parallelize_BMUF( - model_helper_obj, - input_builder_fun, - forward_pass_builder_fun, - param_update_builder_fun, - block_learning_rate=1.0, - block_momentum=None, - devices=None, - rendezvous=None, - net_type='dag', - master_device=None, - use_nccl=False, - nesterov=False, - optimize_gradient_memory=False, - reset_momentum_sgd=False, - warmup_iterations=None, - max_concurrent_distributed_ops=4, - add_blobs_to_sync=None, - num_threads_per_device=4, - cpu_device=False, - barrier_net_timeout_sec=_DEFAULT_BARRIER_NET_TIMEOUT_SEC, -): - ''' - Function to create a model that runs on many GPUs and creates a net for - parameter updates that can be run independently for a number of iterations, - and is then followed by another net that runs once to compute the final parameter - updates according to the block-wise model-update filtering rule described -
-
-
-def Parallelize_BMUF(
-    model_helper_obj,
-    input_builder_fun,
-    forward_pass_builder_fun,
-    param_update_builder_fun,
-    block_learning_rate=1.0,
-    block_momentum=None,
-    devices=None,
-    rendezvous=None,
-    net_type='dag',
-    master_device=None,
-    use_nccl=False,
-    nesterov=False,
-    optimize_gradient_memory=False,
-    reset_momentum_sgd=False,
-    warmup_iterations=None,
-    max_concurrent_distributed_ops=4,
-    add_blobs_to_sync=None,
-    num_threads_per_device=4,
-    cpu_device=False,
-    barrier_net_timeout_sec=_DEFAULT_BARRIER_NET_TIMEOUT_SEC,
-):
-    '''
-    Creates a model that runs on many GPUs, with a net for the local
-    parameter updates that can be run independently for a number of
-    iterations, followed by another net that runs once to compute the final
-    parameter updates according to the block-wise model-update filtering
-    rule described in: Scalable Training of Deep Learning Machines by
-    Incremental Block Training with Intra-block Parallel Optimization and
-    Blockwise Model-Update Filtering (ICASSP 2016).
-    '''
-    assert scope.CurrentDeviceScope() is None \
-        or scope.CurrentDeviceScope().device_type == caffe2_pb2.CPU, \
-        "Parallelize must be called without device-scope, \
-        device scope was: {}".format(scope.CurrentDeviceScope())
-
-    assert isinstance(model_helper_obj, model_helper.ModelHelper)
-
-    if devices is None:
-        devices = list(range(0, workspace.NumGpuDevices()))
-    if master_device is None:
-        master_device = devices[0]
-
-    if not cpu_device:
-        for gpu in devices:
-            if gpu >= workspace.NumGpuDevices():
-                log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumGpuDevices(), devices))
-                break
-        model_helper_obj._device_type = workspace.GpuDeviceType
-        model_helper_obj._device_prefix = "gpu"
-    else:
-        model_helper_obj._device_type = caffe2_pb2.CPU
-        model_helper_obj._device_prefix = "cpu"
-
-    model_helper_obj._devices = devices
-    model_helper_obj._rendezvous = rendezvous
-    model_helper_obj._sync_barrier_net = None
-    model_helper_obj._broadcast_context = None
-    model_helper_obj._shared_model = False
-    master_dev_opt = core.DeviceOption(model_helper_obj._device_type, master_device)
-
-    # question: rendezvous structure
-    num_shards = rendezvous['num_shards'] if rendezvous else 1
-    # num_devices is #devices across all machines
-    num_devices = len(devices) * num_shards
-    # num_workers is #threads to execute the DAG per shard
-    num_workers = num_threads_per_device * len(devices)
-    if rendezvous:
-        num_workers += 8
-
-    loss_scale = 1.0 / num_devices
-    if block_momentum is None:
-        block_momentum = 1.0 - 1.0 / num_devices
-
-    max_concurrent_distributed_ops = min(
-        max_concurrent_distributed_ops,
-        num_workers - 1
-    )
-
-    model_helper_obj.net.Proto().num_workers = num_workers
-    model_helper_obj.net.Proto().type = net_type
-
-    # A net for initializing global model parameters. It's called once in the
-    # same step as net parameters initialization.
-    model_helper_obj._global_model_init_net = core.Net('global_model_init')
-    model_helper_obj._global_model_init_net.Proto().type = net_type
-    model_helper_obj._global_model_init_net.Proto().num_workers = \
-        num_workers
-
-    # A net for computing final parameter updates. It will run once after
-    # running net (local model updates) for `num_local_iterations` times.
-    model_helper_obj._global_model_param_updates_net = core.Net('global_model')
-    model_helper_obj._global_model_param_updates_net.Proto().type = net_type
-    model_helper_obj._global_model_param_updates_net.Proto().num_workers = \
-        num_workers
-
-    def _v(param):
-        return "{}_v".format(param)
-
-    def _g(param):
-        return "{}_g".format(param)
-
-    def _v_prev(param):
-        return "{}_prev".format(param)
-
-    # Keep track of params that were in the model before: they are not
-    # data parallel, so we need to handle them separately
-    non_datapar_params = copy.copy(model_helper_obj.params)
-    model_helper_obj._losses_by_gpu = {}
-
-    def _InitializeModels(gpu_id):
-        input_builder_fun(model_helper_obj)
-        loss = forward_pass_builder_fun(model_helper_obj, loss_scale)
-        model_helper_obj._losses_by_gpu[gpu_id] = loss
-    _ForEachDevice(
-        devices,
-        _InitializeModels,
-        device_type=model_helper_obj._device_type,
-        device_prefix=model_helper_obj._device_prefix,
-        scoped=True
-    )
-    _ValidateParams(model_helper_obj.params)
-
-    model_helper_obj._device_grouped_blobs =\
-        _GroupByDevice(model_helper_obj, devices,
-                       model_helper_obj.params, non_datapar_params)
-
-    model_helper_obj._param_names =\
-        list(model_helper_obj._device_grouped_blobs.keys())
-
-    _AddGradientOperators(
-        devices, model_helper_obj, model_helper_obj._losses_by_gpu
-    )
-    _ValidateParams(model_helper_obj.params)
-
-    _InferBlobDevice(model_helper_obj)
-
-    def _InitializeParamUpdate(gpu_id):
-        param_update_builder_fun(model_helper_obj)
-    _ForEachDevice(
-        devices,
-        _InitializeParamUpdate,
-        device_type=model_helper_obj._device_type,
-        device_prefix=model_helper_obj._device_prefix,
-        scoped=True
-    )
-
-    model_parameter_names = list(
-        model_helper_obj._device_grouped_blobs.keys()
-    )
-    if warmup_iterations is not None:
-        model_helper_obj._warmup_iterations = warmup_iterations
-        # A net for broadcasting gpu-0 (master shard) parameters after
-        # running net for `warmup_iterations`.
-        model_helper_obj._warmup_broadcast = core.Net('warmup-broadcast')
-        model_helper_obj._warmup_broadcast.Proto().type = net_type
-        model_helper_obj._warmup_broadcast.Proto().num_workers = \
-            num_workers
-
-        _SyncAllParams(
-            devices,
-            model_helper_obj,
-            model_helper_obj.param_init_net,
-            model_helper_obj._warmup_broadcast,
-            rendezvous,
-            model_parameter_names,
-            max_concurrent_distributed_ops
-        )
-        for param_name in model_helper_obj._device_grouped_blobs.keys():
-            param = model_helper_obj._device_grouped_blobs[param_name][master_device]
-            with core.DeviceScope(master_dev_opt):
-                model_helper_obj._warmup_broadcast.Copy(param, _g(param))
-
-    # (Step-0) Initialize momentum parameters on master device.
-    for param_name in model_helper_obj._device_grouped_blobs.keys():
-        param = model_helper_obj._device_grouped_blobs[param_name][master_device]
-        with core.DeviceScope(master_dev_opt):
-            model_helper_obj._global_model_init_net.ConstantFill(
-                param, _v(param), value=0.0
-            )
-            model_helper_obj._global_model_init_net.Copy(param, _g(param))
-            if nesterov:
-                model_helper_obj._global_model_init_net.ConstantFill(
-                    param, _v_prev(param), value=0.0
-                )
-
-    # (Step-1) Update models for num_local_iterations.
-
-    # (Step-2) Compute post-local-updates average of the params.
-    # Sum model params across GPUs and store results in param_avg blob.
-    _AllReduceBlobs(
-        model_parameter_names,
-        devices,
-        model_helper_obj,
-        model_helper_obj._global_model_param_updates_net,
-        rendezvous,
-        use_nccl,
-        max_concurrent_distributed_ops
-    )
-
-    # (Step-3) Update momentum params:
-    # param_v = block_momentum * param_v
-    #         + block_learning_rate * (param_avg - param)
-    # if nesterov momentum:
-    #     param = param + param_v
-    #           - block_momentum * (param_v - param_v_prev)
-    #     param_v_prev = param_v
-    # else:
-    #     param = param + param_v
-    for param_name in model_parameter_names:
-        param = model_helper_obj._device_grouped_blobs[param_name][master_device]
-        with core.DeviceScope(master_dev_opt):
-            # TODO(ataei): Stop building the graph here to get model average?
-            model_helper_obj._global_model_param_updates_net.Scale(
-                param, param, scale=1.0 / num_devices
-            )
-            model_helper_obj._global_model_param_updates_net.Sub(
-                [param, _g(param)], param
-            )
-            model_helper_obj._global_model_param_updates_net.Scale(
-                param, param, scale=block_learning_rate
-            )
-            model_helper_obj._global_model_param_updates_net.Scale(
-                _v(param), _v(param), scale=block_momentum
-            )
-            model_helper_obj._global_model_param_updates_net.Add(
-                [_v(param), param], _v(param)
-            )
-            model_helper_obj._global_model_param_updates_net.Add(
-                [_g(param), _v(param)], _g(param)
-            )
-            if nesterov:
-                model_helper_obj._global_model_param_updates_net.Sub(
-                    [_v(param), _v_prev(param)], _v_prev(param)
-                )
-                model_helper_obj._global_model_param_updates_net.Scale(
-                    _v_prev(param), _v_prev(param), scale=block_momentum
-                )
-                model_helper_obj._global_model_param_updates_net.Sub(
-                    [_g(param), _v_prev(param)], _g(param)
-                )
-                model_helper_obj._global_model_param_updates_net.Copy(
-                    _v(param), _v_prev(param)
-                )
-            model_helper_obj._global_model_param_updates_net.Copy(
-                _g(param), param
-            )
-
-
-    _SyncAllParams(
-        devices,
-        model_helper_obj,
-        model_helper_obj.param_init_net,
-        model_helper_obj._global_model_param_updates_net,
-        rendezvous,
-        model_parameter_names,
-        max_concurrent_distributed_ops
-    )
-
-    # Add additional syncs
-    if add_blobs_to_sync is not None:
-        AddBlobSync(
-            model_helper_obj,
-            add_blobs_to_sync,
-            net=model_helper_obj._global_model_param_updates_net)
-
-    # Reset momentum-SGD parameters
-    if reset_momentum_sgd:
-        momentum_ops = [op for op in model_helper_obj.net.Proto().op
-                        if op.type == 'MomentumSGDUpdate']
-        for op in momentum_ops:
-            momentum_blob = op.input[1]
-            with core.DeviceScope(op.device_option):
-                model_helper_obj._global_model_param_updates_net.ConstantFill(
-                    [momentum_blob], momentum_blob, value=0.0
-                )
-
-    if optimize_gradient_memory:
-        _OptimizeGradientMemorySimple(
-            model_helper_obj, model_helper_obj._losses_by_gpu, devices
-        )
-
-    model_helper_obj._data_parallel_model_init_nets = [
-        model_helper_obj.param_init_net,
-        model_helper_obj._global_model_init_net
-    ]
-
-    model_helper_obj._data_parallel_model_nets = [
-        model_helper_obj.net,
-        (model_helper_obj._global_model_param_updates_net, 1)
-    ]
-    _AddBarrierToModelNets(model_helper_obj, barrier_net_timeout_sec)
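As an illustrative sketch (not part of this diff), the Step-3 rule built above can be written for a single parameter in NumPy. Here param_avg stands for the Step-2 cross-device average, and all names are hypothetical rather than the real blob names; with 8 total devices the default block momentum is 1 - 1/8 = 0.875:

    import numpy as np

    def bmuf_update(param_g, param_v, param_avg, block_lr, block_momentum,
                    nesterov=False, param_v_prev=None):
        # param_v = m * param_v + lr * (param_avg - param_g)
        param_v = block_momentum * param_v + block_lr * (param_avg - param_g)
        if nesterov:
            # param = param_g + param_v - m * (param_v - param_v_prev)
            param = param_g + param_v - block_momentum * (param_v - param_v_prev)
            param_v_prev = param_v.copy()
        else:
            param = param_g + param_v
        return param, param_v, param_v_prev

    # First global step: momentum starts at zero, so param moves to the average.
    p, v, _ = bmuf_update(np.ones(3), np.zeros(3), np.full(3, 1.2),
                          block_lr=1.0, block_momentum=0.875)
    assert np.allclose(p, 1.2) and np.allclose(v, 0.2)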
-
-def CreateNet(model, overwrite=False):
-    for net_iters in model._data_parallel_model_nets:
-        if isinstance(net_iters, tuple):
-            workspace.CreateNet(net_iters[0], overwrite=overwrite)
-        else:
-            workspace.CreateNet(net_iters, overwrite=overwrite)
-
-
-def RunInitNet(model):
-    for init_net in model._data_parallel_model_init_nets:
-        workspace.RunNetOnce(init_net)
-    CreateNet(model)
-
-
-def RunWarmup(model):
-    workspace.RunNet(model.net, model._warmup_iterations)
-    workspace.RunNetOnce(model._warmup_broadcast)
-
-
-def RunNet(model, num_iterations):
-    for net_iter in model._data_parallel_model_nets:
-        if isinstance(net_iter, tuple):
-            workspace.RunNet(net_iter[0].Proto().name, net_iter[1])
-        else:
-            workspace.RunNet(net_iter, num_iterations)
-
-
-def _AddBarrierToModelNets(model, barrier_net_timeout_sec):
-    if model._rendezvous is not None and model._rendezvous['engine'] == 'GLOO':
-        # Synchronize DPM at the start of each epoch. This allows shards that
-        # start an epoch sooner to wait for slower shards. Without this,
-        # shards that are faster than others will begin training the next epoch
-        # while stragglers are blocked on IO, and may timeout after 30 seconds
-        # (_DEFAULT_TIMEOUT_SEC).
-        # We pass in model.param_init_net so that the barrier net can be run as
-        # part of the param_init_net.
-
-        model._barrier_init_net = core.Net("barrier_init_net")
-
-        model._barrier_net = _CreateBarrierNet(model, model._barrier_init_net,
-                                               "pre_training", barrier_net_timeout_sec)
-
-        model._data_parallel_model_init_nets.insert(0, model._barrier_init_net)
-
-        model._data_parallel_model_nets.insert(0, model._barrier_net)
-
-
-def _CreateBarrierNet(model, init_net, name_prefix, timeout_sec):
-    log.info("Creating barrier net")
-    assert model._rendezvous['engine'] == 'GLOO', "Engine does not support barrier"
-    comm_world = _CreateOrCloneCommonWorld(
-        init_net,
-        name_prefix + "_barrier_cw",
-        rendezvous=model._rendezvous,
-        timeout_sec=timeout_sec,
-    )
-    barrier_net = core.Net(name_prefix + "_barrier_net")
-    barrier_net.Barrier(
-        inputs=[comm_world],
-        outputs=[],
-        engine=model._rendezvous['engine'],
-    )
-    return barrier_net
-
-
-# DEPRECATED: See warnings below.
-def Synchronize(model, timeout_sec=_DEFAULT_BARRIER_NET_TIMEOUT_SEC):
-    warnings.warn("The Synchronize API has been deprecated. We now have a "
-        "barrier net which runs before training to ensure all hosts wait "
-        "before training starts. The default timeout for the barrier is "
-        "300s and it can be overridden using the barrier_net_timeout_sec "
-        "parameter when calling Parallelize.",
-        category=DeprecationWarning, stacklevel=2)
-    if model._rendezvous is None or model._rendezvous['num_shards'] <= 1:
-        # Single host case
-        return
-
-    if model._sync_barrier_net is None:
-        barrier_init_net = core.Net("sync_barrier_init_net")
-        model._sync_barrier_net = _CreateBarrierNet(
-            model, barrier_init_net, "sync", timeout_sec)
-        workspace.RunNetOnce(barrier_init_net)
-        workspace.CreateNet(model._sync_barrier_net)
-        model._sync_barrier_net_timeout = timeout_sec
-    assert model._sync_barrier_net_timeout == timeout_sec, \
-        "Must use fixed timeout, {} != {}".format(
-            model._sync_barrier_net_timeout, timeout_sec
-        )
-    log.info("Synchronize run barrier net.")
-    workspace.RunNet(model._sync_barrier_net)
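The run helpers above are meant to be called in a fixed order. A hedged sketch of a driver built on them, assuming `model` came from one of the Parallelize entry points; the `train` wrapper itself is hypothetical:

    from caffe2.python import data_parallel_model

    def train(model, num_epochs, iters_per_epoch):
        # Runs every init net once (including the barrier init net, if any)
        # and instantiates the main nets.
        data_parallel_model.RunInitNet(model)
        # With warmup_iterations set (BMUF), run them and broadcast from master.
        if getattr(model, "_warmup_iterations", None):
            data_parallel_model.RunWarmup(model)
        for _ in range(num_epochs):
            # For BMUF models each call runs `iters_per_epoch` local
            # iterations, then one pass of the global update net.
            data_parallel_model.RunNet(model, iters_per_epoch)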
-
-
-def ConvertNetForDevice(net, device=None):
-    '''
-    Converts all blobs in the net to have namescope gpu_X, and correct
-    device scope. You can use this to enable AppendNet with a
-    forward_pass_builder_fun:
-
-    def builder_fun(model):
-        ...
-        model.net.AppendNet(
-            data_parallel_model.ConvertNetForDevice(othermodel.net))
-        model.param_init_net.AppendNet(
-            data_parallel_model.ConvertNetForDevice(othermodel.param_init_net))
-    '''
-    mnet = copy.deepcopy(net)
-
-    if device is None:
-        device = scope.CurrentDeviceScope()
-    if core.IsGPUDeviceType(device.device_type):
-        device_prefix = "gpu"
-    elif device.device_type == caffe2_pb2.IDEEP:
-        device_prefix = "ideep"
-    else:
-        device_prefix = "cpu"
-
-    namescope = "{}_{}/".format(device_prefix, device.device_id)
-    for op in mnet.Proto().op:
-        if "RecurrentNetwork" in op.type:
-            raise NotImplementedError("RecurrentNetwork conversion not yet supported")
-        for i, inputb in enumerate(op.input):
-            op.input[i] = namescope + inputb
-        for i, outputb in enumerate(op.output):
-            op.output[i] = namescope + outputb
-        for i, blob in enumerate(op.control_input):
-            op.control_input[i] = namescope + blob
-        op.device_option.CopyFrom(device)
-    for i, einp in enumerate(mnet.Proto().external_input):
-        mnet.Proto().external_input[i] = namescope + einp
-    for i, eoutp in enumerate(mnet.Proto().external_output):
-        mnet.Proto().external_output[i] = namescope + eoutp
-    return mnet
-
-
-def _ForEachDevice(devices, f, device_type, device_prefix, scoped=False,
-                   *args, **kwargs):
-    for device in devices:
-        device_opt = core.DeviceOption(device_type, device)
-        with core.DeviceScope(device_opt):
-            if scoped:
-                with core.NameScope("{}_{}".format(device_prefix, device)):
-                    f(device, *args, **kwargs)
-            else:
-                f(device, *args, **kwargs)
-
-
-def _AddGradientOperators(devices, model, losses_by_gpu):
-    def create_grad(lossp):
-        return model.ConstantFill(lossp, str(lossp) + "_grad", value=1.0)
-
-    loss_grad = {}
-    # Explicitly need to create gradients on each GPU
-    for gpu_id in devices:
-        device = core.DeviceOption(model._device_type, gpu_id)
-        with core.DeviceScope(device):
-            for l in losses_by_gpu[gpu_id]:
-                lg = create_grad(l)
-                loss_grad[str(l)] = str(lg)
-
-    model.AddGradientOperators(loss_grad)
-
-
-def ExtractPredictorNet(model, inputs, outputs, device):
-    '''
-    Returns (net, params) that can be exported to be used as a prediction
-    net.
-    '''
-    master_device = model._devices[0]
-    prefix = "{}_{}/".format(model._device_prefix, master_device)
-    prefix_inputs = [prefix + str(b) for b in inputs]
-    prefix_outputs = [prefix + str(b) for b in outputs]
-    (predictor_net, export_blobs) = model_helper.ExtractPredictorNet(
-        net_proto=model.net.Proto(),
-        input_blobs=prefix_inputs,
-        output_blobs=prefix_outputs,
-        device=device,
-        renames={
-            a: b
-            for (a, b) in zip(prefix_inputs + prefix_outputs, inputs + outputs)
-        },
-    )
-
-    return (predictor_net, export_blobs)
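Stripped to its essence, ConvertNetForDevice prefixes every blob with a per-device namescope. As plain string manipulation (a sketch with hypothetical blob names, not the real proto rewrite):

    def to_device_namescope(blob_names, device_prefix="gpu", device_id=0):
        # Mirrors the renaming rule: "fc_w" -> "gpu_0/fc_w".
        namescope = "{}_{}/".format(device_prefix, device_id)
        return [namescope + b for b in blob_names]

    assert to_device_namescope(["data", "fc_w"]) == ["gpu_0/data", "gpu_0/fc_w"]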
-
-
-def GetCheckpointParams(model):
-    '''
-    Returns a set of blobs that are needed for a complete checkpoint.
-    They are blobs for the first gpu and iteration blobs.
-    '''
-    (all_blobs, _) = _ComputeBlobsToSync(model)
-    first_gpu_blobs = {
-        b
-        for b in all_blobs
-        if str(b)
-        .startswith("{}_{}/".format(model._device_prefix, model._devices[0]))
-    }
-
-    # Add iteration blobs that do not have namescope separately, since
-    # it is important to checkpoint the iteration counter
-    iteration_blobs = set()
-    for op in model.net.Proto().op:
-        if op.type == 'Iter' or op.type == 'AtomicIter':
-            if not op.output[0].startswith("{}_".format(model._device_prefix)):
-                iteration_blobs.add(op.output[0])
-
-    return first_gpu_blobs.union(iteration_blobs)
-
-
-def FinalizeAfterCheckpoint(model, blobs=None, cpu_mode=False):
-    '''
-    This function should be called after loading parameters from a
-    checkpoint / initial parameters file.
-    '''
-
-    if not hasattr(model, "_checkpoint_net"):
-        if blobs is None:
-            (_, uniq_blob_names) = _ComputeBlobsToSync(model)
-        else:
-            uniq_blob_names = [stripBlobName(p) for p in blobs]
-
-        # Synchronize to the blob lookup map, as the provided
-        # blobs might have non-parameters, such as momentum blobs.
-        log.info("Creating checkpoint synchronization net")
-        devices = model.GetDevices()
-        for name in uniq_blob_names:
-            if name not in model._device_grouped_blobs:
-                grouped = {
-                    d:
-                    core.BlobReference("{}_{}{}{}".format(
-                        model._device_prefix,
-                        d,
-                        scope._NAMESCOPE_SEPARATOR,
-                        name)
-                    ) for d in devices}
-                model._device_grouped_blobs[name] = grouped
-
-        model._checkpoint_net = core.Net("checkpoint_sync_net")
-        if not cpu_mode:
-            model._checkpoint_net.RunAllOnGPU()
-
-        checkpoint_init_net = None
-        if (model._rendezvous is not None and model._rendezvous['num_shards'] > 1):
-            checkpoint_init_net = core.Net("checkpoint_init_net")
-            if not cpu_mode:
-                checkpoint_init_net.RunAllOnGPU()
-
-        _SyncAllParams(
-            devices,
-            model,
-            checkpoint_init_net,
-            model._checkpoint_net,
-            model._rendezvous,
-            uniq_blob_names,
-            max_concurrent_distributed_ops=1
-        )
-        if (checkpoint_init_net):
-            workspace.RunNetOnce(checkpoint_init_net)
-
-        workspace.CreateNet(model._checkpoint_net)
-
-    # Run the sync
-    log.info("Run checkpoint net")
-    workspace.RunNet(model._checkpoint_net.Proto().name)
-
-
-def GetLearningRateBlobNames(model):
-    '''
-    Returns a list of learning rate blob names used in the optimizer.
-    '''
-    if model._optimizer is not None:
-        if model._device_type == caffe2_pb2.CPU or model._device_type == caffe2_pb2.IDEEP:
-            return [model._optimizer.get_cpu_blob_name('lr')]
-        elif core.IsGPUDeviceType(model._device_type):
-            return [model._optimizer.get_gpu_blob_name('lr', gpu, '')
-                    for gpu in model._devices]
-        else:
-            raise Exception(
-                "Unsupported device type: {}".format(model._device_type)
-            )
-    else:
-        lr_blob_names = []
-        for op in model.net.Proto().op:
-            if op.type == "LearningRate":
-                lr_blob_names.append(op.output[0])
-        return lr_blob_names
-
-
-def _Broadcast(devices, model, net, param, use_nccl=False):
-    # Copy params from gpu_0 to other
-    master_dev = devices[0]
-
-    if use_nccl:
-        if _IsGPUBlob(model, param):
-            master_device_opt = core.DeviceOption(model._device_type, master_dev)
-            with core.DeviceScope(master_device_opt):
-                # Note that the root is the root _rank_ and not the root
-                # _device_. Thus we always use root=0, regardless of the
-                # devices used.
-                net.NCCLBroadcast(
-                    list(model._device_grouped_blobs[param].values()),
-                    list(model._device_grouped_blobs[param].values()),
-                    root=0,
-                )
-            return
-
-    for dev_idx in devices[1:]:
-        if _IsGPUBlob(model, param):
-            device_opt = core.DeviceOption(workspace.GpuDeviceType, dev_idx)
-        else:
-            device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) if _IsIDEEPBlob(model, param) else \
-                core.DeviceOption(caffe2_pb2.CPU, 0)
-        with core.DeviceScope(device_opt):
-            net.Copy(
-                model._device_grouped_blobs[param][master_dev],
-                model._device_grouped_blobs[param][dev_idx]
-            )
-
-
-def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None):
-    blobs_group = list(model._device_grouped_blobs[param].values())
-    if model._device_type == caffe2_pb2.CUDA and use_nccl:
-        # TODO: for _shared_model, do only NCCLReduce
-        model.NCCLAllreduce(
-            blobs_group, blobs_group, control_input=control_input
-        )
-        return
-
-    if model._device_type == workspace.GpuDeviceType:
-        p2p_access_pattern = workspace.GetGpuPeerAccessPattern()
-    else:
-        p2p_access_pattern = None
-
-    def sumN(*dev_indices):
-        """Create a Sum op for 2 or more blobs on different devices.
-        Saves the result on the first device.
-
-        Args:
-        dev_indices -- a list of device indices, which can be translated into
-                       CUDA identifiers with model._devices
-        """
-        devices = [model._devices[idx] for idx in dev_indices]
-        blobs = [blobs_group[idx] for idx in dev_indices]
-        device_opt = core.DeviceOption(model._device_type, devices[0])
-        with core.DeviceScope(device_opt):
-            for i, peer in enumerate(devices):
-                if i == 0:
-                    continue  # Skip the first device
-                if p2p_access_pattern is not None and p2p_access_pattern.size and not p2p_access_pattern[
-                    devices[0], peer
-                ]:
-                    # Copy from peer to d0
-                    blobs[i] = model.Copy(
-                        blobs[i],
-                        'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer)
-                    )
-            net.Sum(blobs, [blobs[0]], name='dpm')
-
-    if len(devices) == 16:
-        # Special tree reduction for 16 gpus, TODO generalize like in muji.py
-        for j in range(8):
-            sumN(j * 2, j * 2 + 1)
-        for j in range(4):
-            sumN(j * 4, j * 4 + 2)
-        for j in range(2):
-            sumN(j * 8, j * 8 + 4)
-        sumN(0, 8)
-    elif len(devices) == 8:
-        for j in range(4):
-            sumN(j * 2, j * 2 + 1)
-        for j in range(2):
-            sumN(j * 4, j * 4 + 2)
-        sumN(0, 4)
-    elif len(devices) == 4:
-        sumN(0, 1)
-        sumN(2, 3)
-        sumN(0, 2)
-    else:
-        sumN(*range(len(devices)))
-    # TODO: for _shared_model, no need to broadcast
-    _Broadcast(devices, model, net, param)
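The sumN schedules above form a binary reduction tree. Simulated on plain floats for the 8-device case (a sketch of the schedule, not the real op graph):

    vals = [1.0] * 8  # one gradient copy per device

    def sum_n(*idx):
        # Sum the listed copies; the result lands on the first listed device.
        vals[idx[0]] = sum(vals[i] for i in idx)

    for j in range(4):
        sum_n(j * 2, j * 2 + 1)  # pairs: (0,1) (2,3) (4,5) (6,7)
    for j in range(2):
        sum_n(j * 4, j * 4 + 2)  # quads: (0,2) (4,6)
    sum_n(0, 4)                  # full sum ends up on device 0
    assert vals[0] == 8.0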
-
-
-def _SyncAllParams(
-    devices,
-    model,
-    init_net,
-    net,
-    rendezvous,
-    unique_param_names,
-    max_concurrent_distributed_ops=4
-):
-    if rendezvous is None or rendezvous['num_shards'] <= 1:
-        _SyncAllParamsSingleHost(devices, model, net, unique_param_names)
-    else:
-        _SyncAllParamsDistributed(
-            devices,
-            model,
-            init_net,
-            net,
-            rendezvous,
-            unique_param_names,
-            max_concurrent_distributed_ops
-        )
-
-
-def AddBlobSync(model, blobs, net=None):
-    '''
-    Sync a blob across devices and hosts
-    '''
-    if len(blobs) == 0:
-        return
-    net = model.net if net is None else net
-    for b in blobs:
-        assert not b.startswith(model._device_prefix), \
-            "Provide unprefixed blob name: {}".format(b)
-        model._device_grouped_blobs[b] = {
-            d: core.BlobReference("{}_{}/{}".format(model._device_prefix, d, b))
-            for d in model._devices
-        }
-
-    _SyncAllParams(
-        model._devices,
-        model,
-        model.param_init_net,
-        net,
-        model._rendezvous,
-        set(blobs))
-
-
-def AddDistributedBlobSync(model, blobs):
-    '''
-    Sync blobs across machines (but not across devices)
-    '''
-    if model._rendezvous is None:
-        return
-    synth_name = "_".join([str(b) for b in blobs])
-    comm_world = _CreateOrCloneCommonWorld(
-        model.param_init_net,
-        "blob_sync_cw_" + synth_name,
-        rendezvous=model._rendezvous,
-    )
-
-    model.net.Allreduce(
-        inputs=[comm_world] + blobs,
-        outputs=blobs,
-        engine=model._rendezvous['engine'],
-    )
-
-
-def _SyncAllParamsDistributed(
-    devices,
-    model,
-    init_net,
-    net,
-    rendezvous,
-    unique_param_names,
-    max_concurrent_distributed_ops
-):
-    assert rendezvous['num_shards'] > 1
-
-    gpu_device_opt = core.DeviceOption(model._device_type, devices[0])
-    cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU)
-    ideep_device_opt = core.DeviceOption(caffe2_pb2.IDEEP)
-
-    if model._broadcast_context is None:
-        model._broadcast_context = CollectivesConcurrencyControl(
-            "broadcast",
-            max_concurrent_distributed_ops,
-            init_net,
-            rendezvous
-        )
-    context = model._broadcast_context
-
-    for param_name in sorted(unique_param_names):
-        master_param = model._device_grouped_blobs[param_name][devices[0]]
-        params_group = list(model._device_grouped_blobs[param_name].values())
-
-        def broadcast(params):
-            comm_world, control_input = context.get_control_and_context(params)
-            net.Broadcast(
-                inputs=[comm_world] + params,
-                outputs=params,
-                name=param_name,
-                engine=rendezvous['engine'],
-                control_input=control_input
-            )
-
-        device_opt = gpu_device_opt if _IsGPUBlob(
-            model, param_name
-        ) else ideep_device_opt if _IsIDEEPBlob(model, param_name) else cpu_device_opt
-
-        if rendezvous['engine'] == 'GLOO':
-            with core.DeviceScope(device_opt):
-                broadcast(params_group)
-        else:
-            # Copy between GPU and CPU
-            with core.DeviceScope(device_opt):
-                param_cpu = net.CopyGPUToCPU(
-                    master_param,
-                    str(master_param) + "cpu"
-                )
-            with core.DeviceScope(cpu_device_opt):
-                broadcast([param_cpu])
-            with core.DeviceScope(device_opt):
-                net.CopyCPUToGPU(param_cpu, master_param)
-
-        # Broadcast locally
-        _Broadcast(devices, model, net, param_name)
-
-
-def _SyncAllParamsSingleHost(devices, model, net, unique_param_names):
-    for param in unique_param_names:
-        _Broadcast(devices, model, net, param)
-
-
-def _AllReduceBlobs(blob_names, devices, model, net, rendezvous, use_nccl,
-                    max_concurrent_distributed_ops):
-    if rendezvous is None or rendezvous['num_shards'] <= 1:
-        _AllReduceBlobsSingleHost(
-            blob_names,
-            devices,
-            model,
-            net,
-            use_nccl
-        )
-    else:
-        _AllReduceBlobsDistributed(
-            blob_names,
-            devices,
-            model,
-            net,
-            rendezvous,
-            max_concurrent_distributed_ops,
-        )
-
-
-def _PruneParametersForSharing(model):
-    assert model._shared_model
-    master_prefix = "{}_{}/".format(model._device_prefix, model._devices[0])
-
-    # Remove non-master parameters so that they will not receive parameter
-    # update operators.
-    model.params = model.GetParams(master_prefix)
-    paramset = set(model.params)
-
-    model.param_to_grad = {
-        p: model.param_to_grad[p]
-        for p in model.param_to_grad if p in paramset
-    }
-    model.weights = [w for w in model.weights if w in paramset]
-    model.biases = [w for w in model.biases if w in paramset]
-
-
-def _RemapParameterBlobsForSharedModel(model, all_params):
-    assert model._shared_model
-    master_prefix = "{}_{}/".format(
-        model._device_prefix, model._devices[0])
-    log.info("Remapping param blobs to master -> {}".format(master_prefix))
-    master_params = set(model.GetParams())
-
-    # Remove all but master params
-    def modify_ops(net):
-        ops = []
-        for op in net.Proto().op:
-            delete_op = False
-            # Delete ops that output non-master version of parameter
-            for outp in op.output:
-                if outp in all_params and outp not in master_params:
-                    delete_op = True
-                    log.debug("Delete b/c {}: {}".format(outp, str(op)))
-                    break
-            if delete_op:
-                continue
-            # Remap inputs to point to the master param
-            for j, inp in enumerate(op.input):
-                if inp in all_params and inp not in master_params:
-                    op.input[j] = master_prefix + stripBlobName(inp)
-            ops.append(op)
-        del net.Proto().op[:]
-        net.Proto().op.extend(ops)
-
-    modify_ops(model.param_init_net)
-    modify_ops(model.net)
-
-
-class CollectivesConcurrencyControl:
-    """
-    Creates common worlds (up to max_concurrent_context) and manages the
-    sequential execution of collectives that share the same context with
-    cyclic control inputs.
-    """
-    def __init__(
-        self,
-        name,
-        max_concurrent_context,
-        param_init_net,
-        rendezvous
-    ):
-        self.name = name
-        self.param_init_net = param_init_net
-        self.max_concurrent_context = max_concurrent_context
-        self.counter = 0
-        self.common_worlds = []
-        self.control_inputs = []
-        self.rendezvous = rendezvous
-
-    def get_control_and_context(self, control_output_blob):
-        common_world, control_input = [None, None]
-        current_slot = self.counter % self.max_concurrent_context
-        if len(self.common_worlds) < self.max_concurrent_context:
-            common_world = _CreateOrCloneCommonWorld(
-                self.param_init_net,
-                "{}_{}_cw".format(self.name, current_slot),
-                rendezvous=self.rendezvous,
-            )
-            self.common_worlds.append(common_world)
-            self.control_inputs.append(control_output_blob)
-        else:
-            common_world = self.common_worlds[current_slot]
-            control_input = self.control_inputs[current_slot]
-            self.control_inputs[current_slot] = control_output_blob
-        self.counter += 1
-        return common_world, control_input
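A small sketch (not part of this diff) of the slot cycling that get_control_and_context implements, with common worlds replaced by strings and all names hypothetical:

    class RoundRobinSlots:
        def __init__(self, max_slots):
            self.max_slots = max_slots
            self.worlds, self.controls, self.counter = [], [], 0

        def get(self, control_output):
            slot = self.counter % self.max_slots
            self.counter += 1
            if len(self.worlds) < self.max_slots:
                # First use of this slot: new common world, no dependency yet.
                self.worlds.append("cw_{}".format(slot))
                self.controls.append(control_output)
                return self.worlds[slot], None
            # Slot reuse: serialize behind the previous user of this slot.
            control_input = self.controls[slot]
            self.controls[slot] = control_output
            return self.worlds[slot], control_input

    rr = RoundRobinSlots(2)
    assert rr.get("op0") == ("cw_0", None)
    assert rr.get("op1") == ("cw_1", None)
    assert rr.get("op2") == ("cw_0", "op0")  # third collective waits on op0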
-
-
-def _AllReduceBlobsDistributed(
-    blob_names,
-    devices,
-    model,
-    net,
-    rendezvous,
-    max_concurrent_distributed_ops,
-):
-    num_workers = model.net.Proto().num_workers
-    assert num_workers > 1, "Please specify more than 1 worker"
-    all_reduce_engine = rendezvous['engine']
-
-    master_device_opt = core.DeviceOption(model._device_type, devices[0])
-
-    reducing_device_opt = master_device_opt
-
-    context = CollectivesConcurrencyControl(
-        "allreduce",
-        max_concurrent_distributed_ops,
-        model.param_init_net,
-        rendezvous
-    )
-
-    nccl_control_blob = None
-
-    for blob_name in blob_names:
-        master_blob = model._device_grouped_blobs[blob_name][devices[0]]
-        blobs_group = list(model._device_grouped_blobs[blob_name].values())
-
-        assert master_blob in blobs_group
-
-        # Remark: NCCLReduce does not support in-place modifications
-        # so we need a temporary blob
-        reduced_blob = str(master_blob) + "_red"
-
-        def allreduce(blobs, **kwargs):
-            with core.DeviceScope(reducing_device_opt):
-                comm_world, control_input = \
-                    context.get_control_and_context(blobs[0])
-                net.Allreduce(
-                    inputs=[comm_world] + blobs,
-                    outputs=blobs,
-                    name=blob_name,
-                    engine=all_reduce_engine,
-                    control_input=control_input,
-                    **kwargs
-                )
-
-        if rendezvous['engine'] == 'GLOO':
-            # With Gloo cross GPU and cross machine allreduce
-            # can be executed in a single operation.
-            # Try to use GPUDirect if transport == ibverbs.
-            allreduce(
-                blobs_group,
-                gpu_direct=(rendezvous.get("transport", None) == "ibverbs"),
-            )
-        else:
-            # Step 1: sum blobs from local GPUs to master GPU
-            with core.DeviceScope(master_device_opt):
-                model.ConstantFill(master_blob, reduced_blob, value=0.0)
-
-                # Temp fix since NCCLReduce does not work
-                net.NCCLAllreduce(
-                    blobs_group,
-                    blobs_group,
-                    control_input=nccl_control_blob,
-                )
-                nccl_control_blob = blobs_group[0]
-                net.Copy(master_blob, reduced_blob)
-
-            # Step 2: allreduce between all hosts, between master GPUs
-            allreduce([reduced_blob])
-
-            with core.DeviceScope(master_device_opt):
-                net.Copy(reduced_blob, master_blob)
-
-        # Step 3: broadcast locally
-        _Broadcast(devices, model, net, blob_name)
-
-
-def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl):
-    """Performs NCCL AllReduce to distribute blobs to all the GPUs."""
-
-    if len(devices) == 1:
-        return
-
-    # Now we need to Allreduce blobs on all the GPUs.
-    # Pick GPU #0 as a master GPU.
-    master_device_opt = core.DeviceOption(model._device_type, devices[0])
-    last_out = None
-    concatenated_idx = set()
-
-    for blob_name in blob_names:
-        # Group by blob_name for reduce.
-        blobs_group = list(model._device_grouped_blobs[blob_name].values())
-        if len(blobs_group) == 1:
-            # Non-reducible
-            continue
-        assert len(blobs_group) == len(devices), \
-            "Each GPU from {}, should have a copy of {}.".format(
-                devices, blob_name)
-
-        if _IsGPUBlob(model, blob_name):
-            with core.DeviceScope(master_device_opt):
-                if not isinstance(blobs_group[0], core.GradientSlice):
-                    _AllReduce(
-                        devices, model, net, blob_name, use_nccl, last_out
-                    )
-                    # last_out is used to serialize the execution of nccls
-                    last_out = blobs_group[0]
-
-                else:
-                    # Sparse gradients: all-gather for indices and values
-                    master_ns = "{}_{}".format(model._device_prefix, devices[0])
-                    '''
-                    Skip if we have already copied concatenated indices
-                    to the indices of GradientSlice. This happens when two
-                    or more grad blobs are gathered with the same indices
-                    blob.
-                    '''
-                    skip_idx_concat = False
-                    for g in blobs_group:
-                        if g.indices in concatenated_idx:
-                            skip_idx_concat = True
-
-                    if not skip_idx_concat:
-                        grad_idx_concat, _ = net.Concat(
-                            [g.indices for g in blobs_group],
-                            ["{}/{}_index_concat".format(master_ns, blob_name),
-                             "{}/{}_index_splitinfo".format(master_ns, blob_name)],
-                            axis=0,
-                            name="note:data_parallel_model")
-
-                        for gpu, g in model._device_grouped_blobs[blob_name].items():
-                            device_opt = core.DeviceOption(model._device_type, gpu)
-                            with core.DeviceScope(device_opt):
-                                model.Copy(grad_idx_concat, g.indices)
-                                concatenated_idx.add(g.indices)
-
-                    grad_val_concat, _ = net.Concat(
-                        [g.values for g in blobs_group],
-                        ["{}/{}_val_concat".format(master_ns, blob_name),
-                         "{}/{}_val_splitinfo".format(master_ns, blob_name)],
-                        axis=0, name="note:data_parallel_model")
-
-                    for gpu, g in model._device_grouped_blobs[blob_name].items():
-                        device_opt = core.DeviceOption(model._device_type, gpu)
-                        with core.DeviceScope(device_opt):
-                            model.Copy(grad_val_concat, g.values)
-
-        elif _IsIDEEPBlob(model, blob_name):
-            assert not isinstance(blobs_group[0], core.GradientSlice), \
-                "Synchronizing gradient slices not supported"
-            with core.DeviceScope(core.DeviceOption(caffe2_pb2.IDEEP)):
-                net.Sum(blobs_group, [blobs_group[0]])
-                if not model._shared_model:
-                    _Broadcast(devices, model, net, blob_name)
-
-        else:
-            assert not isinstance(blobs_group[0], core.GradientSlice), \
-                "Synchronizing gradient slices not supported"
-            with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)):
-                # Poor man's allreduce
-                net.Sum(blobs_group, [blobs_group[0]])
-            if not model._shared_model:
-                _Broadcast(devices, model, net, blob_name)
-
-
-def _BroadcastComputedParams(devices, model, rendezvous, use_nccl=False):
-    if rendezvous is None:
-        _BroadcastComputedParamsSingleHost(devices, model, use_nccl)
-    else:
-        _BroadcastComputedParamsDistributed(devices, model, rendezvous, use_nccl)
-
-
-def _BroadcastComputedParamsDistributed(
-    devices,
-    model,
-    rendezvous,
-    use_nccl=False
-):
-    _BroadcastComputedParamsSingleHost(devices, model, use_nccl)
-    log.warn("Distributed broadcast of computed params is not implemented yet")
-
-
-def _BroadcastComputedParamsSingleHost(devices, model, use_nccl=False):
-    '''
-    Average computed params over all devices
-    '''
-    if len(devices) == 1:
-        return
-
-    for param_name in model._computed_param_names:
-        # Copy from master to others -- averaging would be perhaps better,
-        # but currently NCCLAllReduce is too prone to deadlock
-        _Broadcast(devices, model, model.net, param_name, use_nccl)
-
-
-def _GetReverseOrderedGrads(model):
-    '''
-    Returns the gradients in reverse order (namespace stripped),
-    for the optimal synchronization order.
-    '''
-    return list(reversed(model._grad_names))
-
-
-# A helper function to extract a parameter's name
-def stripBlobName(param):
-    # Format is "a/b/c/d" -> "b/c/d"
-    if isinstance(param, core.GradientSlice):
-        return stripBlobName(param.indices) + ":" + stripBlobName(param.values)
-    else:
-        name = str(param)
-        return name[name.index(scope._NAMESCOPE_SEPARATOR) + 1:]
-
-
-def _AnalyzeOperators(model):
-    '''
-    Look at all the operators and check that they do not cross device scopes
-    '''
-    for op in model.Proto().op:
-        if "NCCL" in op.type or "Copy" in op.type or "Concat" in op.type:
-            continue
-        if "Sum" == op.type and op.name == "dpm":
-            continue
-        if "Allreduce" in op.type and "GLOO" in op.engine:
-            continue
-
-        op_dev = op.device_option
-        op_gpu = op_dev.device_id
-
-        # This avoids failing on operators that are only for CPU
-        if not core.IsGPUDeviceType(op_dev.device_type):
-            continue
-
-        namescope = "{}_{}/".format(model._device_prefix, op_gpu)
-        for inp in list(op.input) + list(op.output):
-            if inp.startswith("{}_".format(model._device_prefix)
-                              ) and not inp.startswith(namescope):
-                raise Exception(
-                    "Blob {} of op {}, should have namescope {}. Op: {}".format(
-                        inp,
-                        op.type,
-                        "{}_{}/".format(model._device_prefix, op_gpu),
-                        str(op),
-                    )
-                )
-
-
-def _InferBlobDevice(model):
-    '''
-    Assign blob to device option based on the operator outputting it
-    '''
-    mapping = {}
-
-    def map_ops(proto):
-        for op in proto.op:
-            device_option = op.device_option
-            if op.type == "Iter":
-                # Hack for Iters which have blob in CPU context
-                device_option = caffe2_pb2.DeviceOption()
-                device_option.device_type = caffe2_pb2.CPU
-            for b in list(op.input) + list(op.output):
-                if b not in mapping:
-                    mapping[b] = device_option
-            if op.type.startswith('RecurrentNetwork'):
-                step_args = [a for a in op.arg if a.name.endswith("step_net")]
-                for step_arg in step_args:
-                    map_ops(step_arg.n)
-    map_ops(model.param_init_net.Proto())
-    map_ops(model.net.Proto())
-    model._blob_to_device = mapping
-
-def _IsIDEEPBlob(model, blob_name):
-    if blob_name in model._blob_to_device:
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.IDEEP
-    else:
-        blob_name = "{}_{}/{}".format(
-            model._device_prefix, model._devices[0], blob_name
-        )
-        if blob_name not in model._blob_to_device:
-            return model._device_type == caffe2_pb2.IDEEP
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.IDEEP
-
-def _IsGPUBlob(model, blob_name):
-    if blob_name in model._blob_to_device:
-        return core.IsGPUDeviceType(model._blob_to_device[blob_name].device_type)
-    else:
-        blob_name = "{}_{}/{}".format(
-            model._device_prefix, model._devices[0], blob_name
-        )
-        if blob_name not in model._blob_to_device:
-            return core.IsGPUDeviceType(model._device_type)
-        return core.IsGPUDeviceType(model._blob_to_device[blob_name].device_type)
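stripBlobName drops only the first namescope component, i.e. the per-device prefix. For example (a plain-Python restatement with hypothetical names):

    def strip_blob_name(name, sep="/"):
        # "gpu_0/fc_w" -> "fc_w"; nested scopes keep their remainder.
        return name[name.index(sep) + 1:]

    assert strip_blob_name("gpu_0/fc_w") == "fc_w"
    assert strip_blob_name("gpu_3/conv1/w") == "conv1/w"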
-
-
-def _GroupByDevice(model, devices, params, non_data_params):
-    '''
-    Groups blobs by device, returning a map of [blobname] = {0: BlobRef, 1: ..}.
-    Returns an ordered dictionary, ensuring the original order.
-    '''
-    grouped = OrderedDict()
-    # Only consider params that were created to be "data parallel"
-    params = params[len(non_data_params):]
-
-    for _i, p in enumerate(params):
-        assert isinstance(p, core.BlobReference) or \
-            isinstance(p, core.GradientSlice), \
-            "Param {} is not BlobReference or GradientSlice".format(p)
-
-        name = stripBlobName(p)
-        gpuid = None
-
-        if isinstance(p, core.BlobReference):
-            gpuid = int(p.GetNameScope().split("_")[1].split("/")[0])
-            assert "{}_{}/".format(model._device_prefix, gpuid) in p.GetNameScope(),\
-                "Param {} expected to have namescope '{}_{}'".format(str(p), model._device_prefix, gpuid)
-        else:
-            gpuid = int(p.indices.GetNameScope().split("_")[1].split("/")[0])
-            assert "{}_{}/".format(model._device_prefix, gpuid) in p.indices.GetNameScope(),\
-                "Indices {} expected to have namescope '{}_{}'".format(str(p), model._device_prefix, gpuid)
-            assert "{}_{}/".format(model._device_prefix, gpuid) in p.values.GetNameScope(),\
-                "Values {} expected to have namescope '{}_{}'".format(str(p), model._device_prefix, gpuid)
-
-        if name not in grouped:
-            grouped[name] = {}
-        grouped[name][gpuid] = p
-
-    return grouped
-
-
-def _ValidateParams(params):
-    set_params = set(params)
-    if len(params) > len(set_params):
-        dupes = []
-        sp = sorted(params)
-        for j, p in enumerate(sp):
-            if j > 0 and sp[j - 1] == p:
-                dupes.append(p)
-
-        assert len(params) == len(set_params), \
-            "Duplicate entries in params: {}".format(dupes)
-
-
-def _ComputeBlobsToSync(model):
-    '''
-    We sync all blobs that are generated by param init net and
-    are 'data parallel', i.e. assigned to a device
-    '''
-    sync_names = set()
-
-    # We don't sync params if the model is shared
-    if model._shared_model:
-        blobs_to_sync = [str(p) for p in model.GetComputedParams('')]
-        sync_names = [stripBlobName(p) for p in blobs_to_sync]
-    else:
-        blobs_to_sync = []
-
-        for op in model.param_init_net.Proto().op:
-            dp_outputs = [
-                o for o in op.output
-                if o.startswith("{}_".format(model._device_prefix))
-            ]
-            sync_names.update([stripBlobName(o) for o in dp_outputs])
-            blobs_to_sync.extend(dp_outputs)
-
-        # Sanity check
-        diff = set(model._param_names) - sync_names
-        assert diff == set(), \
-            "Some params not instantiated in param init net: {}".format(diff)
-
-    # Remove duplicates and sort
-    prefixlen = len(model._device_prefix) + 1
-
-    def extract_sort_key(b):
-        # Sort first based on device id, and then by whole string
-        deviceid = int(b[prefixlen:b.index(scope._NAMESCOPE_SEPARATOR)])
-        return (deviceid, b)
-
-    blobs_to_sync = sorted(
-        list(set(blobs_to_sync)),
-        key=extract_sort_key)
-
-    blobs_to_sync = [core.BlobReference(b) for b in blobs_to_sync]
-    return (blobs_to_sync, sync_names)
-
-
-def _OptimizeGradientMemorySimple(model, losses_by_gpu, devices):
-    log.warning("------- DEPRECATED API, please use " +
-                "data_parallel_model.OptimizeGradientMemory() ----- ")
-    for device in devices:
-        namescope = "{}_{}/".format(model._device_prefix, device)
-        model.net._net = memonger.share_grad_blobs(
-            model.net,
-            losses_by_gpu[device],
-            set(model.param_to_grad.values()),
-            namescope,
-            share_activations=False,
-        )
-
-
-def _AddDynamicMemoryOptimization(model, blobs_to_keep, devices):
-    blobs_to_keep_all_devices = set()
-    if blobs_to_keep is not None:
-        for device in devices:
-            for blob_name in blobs_to_keep:
-                blobs_to_keep_all_devices.add(
-                    "{}_{}/{}".format(model._device_prefix, device, blob_name)
-                )
-
-    if model._rendezvous is not None:
-        # GLOO operators expect the tensor addresses to remain the same over
-        # iterations, so we need to remove param grads from the dynamic memory
-        # management.
-        blobs_to_keep_all_devices.update(
-            [str(b) for b in model.param_to_grad.values()]
-        )
-
-    model.net._net = memonger.release_blobs_when_used(
-        model.net.Proto(),
-        blobs_to_keep_all_devices
-    )
-
-
-def OptimizeGradientMemory(model,
-                           input_shapes,
-                           excluded_blobs,
-                           recycle_activations):
-    """
-    Optimize memory usage of the backward pass by recycling blobs for gradient
-    inputs that have been 'used'.
-    input_shapes: dict of blob name to shape for the inputs of the model.
-                  Pass empty dictionary if not known.
-    excluded_blobs: list of blobs that cannot be recycled. These are blobs
-                    that you will access externally.
-    recycle_activations: whether to also recycle forward pass activations.
-    """
-    if input_shapes is not None:
-        input_shapes_all_devices = {}
-        for b, shp in input_shapes.items():
-            for d in model._devices:
-                input_shapes_all_devices["{}_{}/{}".
-                    format(model._device_prefix, d, b)] = shp
-
-        (shapes, types) = workspace.InferShapesAndTypes(
-            [model.param_init_net, model.net],
-            input_shapes_all_devices,
-        )
-    else:
-        shapes = None
-
-    for device in model._devices:
-        namescope = "{}_{}/".format(model._device_prefix, device)
-        excluded_blobs_by_device = set(namescope + b for b in excluded_blobs)
-        model.net._net = memonger.share_grad_blobs(
-            model.net,
-            model._losses_by_gpu[device],
-            set(model.param_to_grad.values()),
-            namescope,
-            dont_share_blobs=excluded_blobs_by_device,
-            share_activations=recycle_activations,
-            blob_shapes=shapes,
-        )
-
-
-def _CreateOrCloneCommonWorld(
-        net,
-        common_world_blob,
-        rendezvous,
-        name=None,
-        timeout_sec=None):
-
-    if timeout_sec is None:
-        timeout_sec = _DEFAULT_TIMEOUT_SEC
-
-    timeout_ms = timeout_sec * 1000
-
-    # Check if there is an existing CreateCommonWorld
-    # with the same timeout we're looking for. If so,
-    # we can clone it instead of creating a new one.
-    existing = None
-    for op in net.Proto().op:
-        if op.type != "CreateCommonWorld":
-            continue
-
-        # Find common world timeout
-        op_timeout_ms = -1
-        for arg in op.arg:
-            if arg.name == 'timeout_ms':
-                op_timeout_ms = arg.i
-                break
-        if op_timeout_ms != timeout_ms:
-            continue
-
-        # This common world was created with the same timeout we're
-        # looking for, so we can clone it
-        existing = op.output[0]
-        break
-
-    if name is None:
-        name = "{}_op".format(common_world_blob)
-
-    if existing is not None:
-        comm_world = net.CloneCommonWorld(
-            [existing],
-            common_world_blob,
-            name=name,
-            engine=rendezvous['engine'],
-        )
-    else:
-        kwargs = dict()
-        if 'transport' in rendezvous:
-            kwargs['transport'] = rendezvous['transport']
-        if 'interface' in rendezvous:
-            kwargs['interface'] = rendezvous['interface']
-        if 'mpi_rendezvous' in rendezvous:
-            kwargs['mpi_rendezvous'] = rendezvous['mpi_rendezvous']
-        comm_world = net.CreateCommonWorld(
-            rendezvous['kv_handler'] or [],
-            common_world_blob,
-            name=name,
-            size=rendezvous['num_shards'],
-            rank=rendezvous['shard_id'],
-            engine=rendezvous['engine'],
-            timeout_ms=timeout_ms,
-            **kwargs
-        )
-
-    return comm_world
-
-
-def _RunComparison(model, blob_name, device=None):
-    if device is None:
-        device = model._blob_to_device[blob_name]
-    with core.DeviceScope(device):
-        rendezvous = model._rendezvous
-        if rendezvous is None or rendezvous['num_shards'] == 1:
-            return True
-
-        test_data_arr = np.zeros(rendezvous['num_shards']).astype(np.float32)
-        test_data_arr[rendezvous['shard_id']] = 1
-        workspace.FeedBlob("compare_arr", test_data_arr)
-
-        comparison_net = core.Net("allcompare_net")
-
-        kwargs = dict()
-        if 'mpi_rendezvous' in rendezvous:
-            kwargs['mpi_rendezvous'] = rendezvous['mpi_rendezvous']
-        comm_world = comparison_net.CreateCommonWorld(
-            rendezvous['kv_handler'] or [],
-            "initial_sync",
-            name=model.net.Proto().name + ".cw_master_select",
-            size=rendezvous['num_shards'],
-            rank=rendezvous['shard_id'],
-            engine=rendezvous['engine'],
-            **kwargs
-        )
-
-        blob_name_checksum = blob_name + "_checksum"
-        comparison_net.SumSqrElements(
-            [blob_name], [blob_name_checksum], average=False
-        )
-
-        blob_name_gather = blob_name + "_gather"
-        comparison_net.Mul(
-            inputs=["compare_arr", blob_name_checksum],
-            outputs=blob_name_gather,
-            broadcast=1
-        )
-
-        comparison_net.Allreduce(
-            inputs=[comm_world, blob_name_gather],
-            outputs=[blob_name_gather],
-            engine=rendezvous['engine'],
-        )
-
-        workspace.RunNetOnce(comparison_net)
-        gather_arr = workspace.FetchBlob(blob_name_gather)
-
-        baseline = gather_arr[0]
-        for i in range(rendezvous['num_shards']):
-            assert gather_arr[i] == baseline, \
-                "allcompare failed on shard {}.".format(rendezvous['shard_id'])
-
-        return True
-
-
-def _InterleaveOps(model):
-    '''
-    Data Parallel Model creates a net with ops in one device grouped together.
-    This will interleave the ops so that each op for each device is next
-    to each other in the net. Kind of like combining decks of cards. This
-    ensures that progress is made along the critical path roughly concurrently
-    for each device, which is important due to the extra intra-node
-    synchronization required for multi-device batch normalization.
-    '''
-    orig_ops = list(model.net.Proto().op)
-    num_devices = len(model._devices)
-    num_ops_per_dev = len(orig_ops) // num_devices
-    assert num_devices * num_ops_per_dev == len(orig_ops), \
-        'Number of ops per device in original net is not uniform'
-    new_ops = []
-    ops = {d: [] for d in range(num_devices)}
-    for op in orig_ops:
-        ops[op.device_option.device_id].append(op)
-
-    for j in range(num_ops_per_dev):
-        tp = None
-        for d in model._devices:
-            if tp is None:
-                tp = ops[d][j].type
-            new_ops.append(ops[d][j])
-            # Sanity
-            assert ops[d][j].type == tp, \
-                "Type mismatch {} / {}".format(tp, ops[d][j].type)
-
-    del model.net.Proto().op[:]
-    model.net.Proto().op.extend(new_ops)
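The card-shuffle interleave can be pictured on lists of strings (a sketch with hypothetical op names, not real operator protos):

    ops_by_device = {0: ["conv_0", "relu_0"], 1: ["conv_1", "relu_1"]}
    interleaved = [ops_by_device[d][j]
                   for j in range(2)   # j: op position within a device
                   for d in (0, 1)]    # d: device id
    assert interleaved == ["conv_0", "conv_1", "relu_0", "relu_1"]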
-
-
-def _CPUInterDeviceBatchNormalization(model):
-    orig_ops = list(model.net.Proto().op)
-    new_ops = []
-    num_devices = len(model._devices)
-    batch_norm_ops = []
-    injected_ops = []
-
-    spatial_bn_phase = False
-    sums_blobs = []
-    sumsq_blobs = []
-    name = []
-    input_blob_name = None
-
-    spatial_bn_gradient_phase = False
-    scale_grad_blobs = []
-    bias_grad_blobs = []
-
-    def _cpuReduce(param, input_blobs, destination_blobs):
-        """
-        Reduces results from multiple cpus and distributes the results back
-        to each device. This is done by copying values to cpu_0 and summing
-        them. The cpu_0 result is then copied back to each of the devices.
-
-        param: the name of the data (blobs) to reduce
-        input_blobs: the list of blobs to reduce
-        destination_blobs: list of blobs to copy the result to
-        """
-        added_ops = []
-        result_blob = "cpu_0/" + param + "_combined"
-        added_ops.append(core.CreateOperator("Sum", input_blobs, result_blob))
-        for blob in destination_blobs:
-            added_ops.append(core.CreateOperator("Copy", result_blob, blob))
-        return added_ops
-
-    for op in orig_ops:
-        if op.type != 'SpatialBN' and op.type != 'SpatialBNGradient':
-            if spatial_bn_phase:
-                new_ops.extend(injected_ops)
-                new_ops.append(
-                    core.CreateOperator("Sum",
-                                        sums_blobs,
-                                        input_blob_name + "_sums_combined"))
-                new_ops.append(
-                    core.CreateOperator("Sum",
-                                        sumsq_blobs,
-                                        input_blob_name + "_sumsq_combined"))
-                new_ops.extend(batch_norm_ops)
-                injected_ops = []
-                batch_norm_ops = []
-                sums_blobs = []
-                sumsq_blobs = []
-                spatial_bn_phase = False
-                input_blob_name = None
-            elif spatial_bn_gradient_phase:
-                new_ops.extend(injected_ops)
-                new_ops.extend(_cpuReduce(
-                    stripBlobName(scale_grad_blobs[0]),
-                    scale_grad_blobs,
-                    scale_grad_blobs))
-                new_ops.extend(_cpuReduce(
-                    stripBlobName(bias_grad_blobs[0]),
-                    bias_grad_blobs,
-                    bias_grad_blobs))
-                new_ops.extend(batch_norm_ops)
-                injected_ops = []
-                batch_norm_ops = []
-                scale_grad_blobs = []
-                bias_grad_blobs = []
-                spatial_bn_gradient_phase = False
-            new_ops.append(op)
-        elif op.type == 'SpatialBN':
-            spatial_bn_phase = True
-            if input_blob_name is None:
-                input_blob_name = op.input[0]
-            name = op.input[0]
-            injected_ops.append(
-                core.CreateOperator(
-                    "ChannelStats",
-                    name,
-                    [name + "_sums", name + "_sumsq"]))
-            sums_blobs.append(name + "_sums")
-            sumsq_blobs.append(name + "_sumsq")
-            op.input.append(input_blob_name + "_sums_combined")
-            op.input.append(input_blob_name + "_sumsq_combined")
-            op.arg.extend([utils.MakeArgument("num_batches", num_devices)])
-            batch_norm_ops.append(op)
-        elif op.type == 'SpatialBNGradient':
-            spatial_bn_gradient_phase = True
-            injected_ops.append(
-                core.CreateOperator("ChannelBackpropStats",
-                                    [op.input[0], op.input[3], op.input[4],
-                                     op.input[2]],
-                                    [op.output[1], op.output[2]]))
-            scale_grad_blobs.append(op.output[1])
-            bias_grad_blobs.append(op.output[2])
-            op.arg.extend([utils.MakeArgument("num_batches", num_devices)])
-            op.input.extend([op.output[1], op.output[2]])
-            batch_norm_ops.append(op)
-
-    assert not spatial_bn_phase, \
-        "Net modification for cpu inter-device batch normalization failed"
-    del model.net.Proto().op[:]
-    model.net.Proto().op.extend(new_ops)
-
-
-def _GPUInterDeviceBatchNormalization(model):
-    orig_ops = list(model.net.Proto().op)
-    new_ops = []
-    num_devices = len(model._devices)
-    batch_norm_ops = []
-    injected_ops = []
-
-    spatial_bn_phase = False
-    sums_blobs = []
-    sumsq_blobs = []
-    name = []
-    input_blob_name = None
-
-    spatial_bn_gradient_phase = False
-    scale_grad_blobs = []
-    bias_grad_blobs = []
-    master_device = "cpu_0"
-    master_device_option = core.DeviceOption(caffe2_pb2.CPU)
-
-    def _gpuReduce(param, num_devices, master_device, result_blobs=None):
-        """
-        Reduces results from multiple gpus and distributes the results back
-        to each device. This is done by copying values to the master device
-        and summing them. The master device result is then copied back to
-        each of the devices.
-
-        param: the name of the data (blobs) to reduce
-        num_devices: the number of devices
-        master_device: the device to copy/compute values on
-        result_blobs: optional list of result blobs to copy to
-        """
-        added_ops = []
-        source_blobs = []
-        destination_blobs = []
-        if result_blobs is None:
-            result_blobs = [
-                "gpu_{}/{}_combined".format(i, param) for i in range(num_devices)
-            ]
-        for i in range(num_devices):
-            device_option = core.DeviceOption(model._device_type, i)
-            source_blobs.append("gpu_{}/{}".format(i, param))
-            destination_blobs.append(
-                "{}/{}_gpu_{}_copy".format(master_device, param, i))
-            added_ops.append(
-                core.CreateOperator(
-                    "CopyGPUToCPU",
-                    source_blobs[i],
-                    destination_blobs[i],
-                    device_option=device_option))
-        added_ops.append(
-            core.CreateOperator(
-                "Sum",
-                destination_blobs,
-                "{}/{}_combined".format(master_device, param),
-                device_option=master_device_option))
-        for i in range(num_devices):
-            device_option = core.DeviceOption(model._device_type, i)
-            added_ops.append(
-                core.CreateOperator(
-                    "CopyCPUToGPU",
-                    "{}/{}_combined".format(master_device, param),
-                    result_blobs[i],
-                    device_option=device_option))
-        return added_ops
-
-    for op in orig_ops:
-        if op.type != 'SpatialBN' and op.type != 'SpatialBNGradient':
-            if spatial_bn_phase:
-                new_ops.extend(injected_ops)
-                new_ops.extend(_gpuReduce(
-                    stripBlobName(input_blob_name) + "_sums",
-                    num_devices,
-                    master_device,
-                ))
-                new_ops.extend(_gpuReduce(
-                    stripBlobName(input_blob_name) + "_sumsq",
-                    num_devices,
-                    master_device,
-                ))
-                new_ops.extend(batch_norm_ops)
-                injected_ops = []
-                batch_norm_ops = []
-                sums_blobs = []
-                sumsq_blobs = []
-                spatial_bn_phase = False
-                input_blob_name = None
-            elif spatial_bn_gradient_phase:
-                new_ops.extend(injected_ops)
-                new_ops.extend(_gpuReduce(
-                    stripBlobName(scale_grad_blobs[0]),
-                    num_devices,
-                    master_device,
-                    scale_grad_blobs,
-                ))
-                new_ops.extend(_gpuReduce(
-                    stripBlobName(bias_grad_blobs[0]),
-                    num_devices,
-                    master_device,
-                    bias_grad_blobs,
-                ))
-                new_ops.extend(batch_norm_ops)
-                injected_ops = []
-                batch_norm_ops = []
-                scale_grad_blobs = []
-                bias_grad_blobs = []
-                spatial_bn_gradient_phase = False
-            new_ops.append(op)
-        elif op.type == 'SpatialBN':
-            spatial_bn_phase = True
-            if input_blob_name is None:
-                input_blob_name = op.input[0]
-            name = op.input[0]
-            device_option = core.DeviceOption(
-                model._device_type,
-                op.device_option.device_id,
-            )
-            injected_ops.append(
-                core.CreateOperator(
"ChannelStats", - name, - [name + "_sums", name + "_sumsq"], - device_option=device_option)) - sums_blobs.append(name + "_sums") - sumsq_blobs.append(name + "_sumsq") - op.input.append(name + "_sums_combined") - op.input.append(name + "_sumsq_combined") - op.arg.extend([utils.MakeArgument("num_batches", num_devices)]) - batch_norm_ops.append(op) - elif op.type == 'SpatialBNGradient': - spatial_bn_gradient_phase = True - device_option = core.DeviceOption( - model._device_type, - op.device_option.device_id, - ) - injected_ops.append( - core.CreateOperator("ChannelBackpropStats", - [op.input[0], op.input[3], op.input[4], - op.input[2]], - [op.output[1], op.output[2]], - device_option=device_option)) - scale_grad_blobs.append(op.output[1]) - bias_grad_blobs.append(op.output[2]) - op.arg.extend([utils.MakeArgument("num_batches", num_devices)]) - op.input.extend([op.output[1], op.output[2]]) - batch_norm_ops.append(op) - - assert not spatial_bn_phase, \ - "Net modification for gpu inter-device batch normalization failed" - del model.net.Proto().op[:] - model.net.Proto().op.extend(new_ops) diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py deleted file mode 100644 index 1d3be0ff3ecc..000000000000 --- a/caffe2/python/data_parallel_model_test.py +++ /dev/null @@ -1,1427 +0,0 @@ - - - - -from multiprocessing import Process, Queue -import numpy as np -import os -import shutil -import tempfile -import unittest -import time -from unittest.mock import Mock -from hypothesis import assume, given, settings -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -from caffe2.python import brew, core, cnn, data_parallel_model, dyndep, \ - model_helper, optimizer, rnn_cell, workspace -from caffe2.python.test_util import TestCase - - -dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops") - - -class TemporaryDirectory: - def __enter__(self): - self.tmpdir = tempfile.mkdtemp() - return self.tmpdir - - def __exit__(self, type, value, traceback): - shutil.rmtree(self.tmpdir) - -# Note(jiayq): we are yet to find out why Travis gives out an error in gloo -# like: -# RuntimeError: [enforce fail at /home/travis/build/caffe2/caffe2/third_party/gloo/gloo/transport/tcp/device.cc:113] ifa != nullptr. Unable to find interface for: [127.0.1.1] -# See for example https://travis-ci.org/caffe2/caffe2/jobs/262433866 -# As a result, we will check if this is travis, and if yes, disable it. 
diff --git a/caffe2/python/data_parallel_model_test.py b/caffe2/python/data_parallel_model_test.py
deleted file mode 100644
index 1d3be0ff3ecc..000000000000
--- a/caffe2/python/data_parallel_model_test.py
+++ /dev/null
@@ -1,1427 +0,0 @@
-
-
-
-
-from multiprocessing import Process, Queue
-import numpy as np
-import os
-import shutil
-import tempfile
-import unittest
-import time
-from unittest.mock import Mock
-from hypothesis import assume, given, settings
-import hypothesis.strategies as st
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import brew, core, cnn, data_parallel_model, dyndep, \
-    model_helper, optimizer, rnn_cell, workspace
-from caffe2.python.test_util import TestCase
-
-
-dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
-
-
-class TemporaryDirectory:
-    def __enter__(self):
-        self.tmpdir = tempfile.mkdtemp()
-        return self.tmpdir
-
-    def __exit__(self, type, value, traceback):
-        shutil.rmtree(self.tmpdir)
-
-# Note(jiayq): we are yet to find out why Travis gives out an error in gloo
-# like:
-# RuntimeError: [enforce fail at /home/travis/build/caffe2/caffe2/third_party/gloo/gloo/transport/tcp/device.cc:113] ifa != nullptr. Unable to find interface for: [127.0.1.1]
-# See for example https://travis-ci.org/caffe2/caffe2/jobs/262433866
-# As a result, we will check if this is travis, and if yes, disable it.
-@unittest.skipIf(os.environ.get("TRAVIS"), "DPMTest has a known issue with Travis.")
-class DataParallelModelTest(TestCase):
-
-    def run_model(self, devices, gpu):
-        '''
-        Helper function for test_equiv
-        '''
-        def input_builder_fun(model):
-            return None
-
-        def model_build_fun(model, loss_scale):
-            fc = model.FC("data", "fc", 16, 1,
-                          ("ConstantFill", {}), ("ConstantFill", {}))
-            fc_fl = model.FlattenToVec(fc, "fc_fl")
-            sigm = model.Sigmoid(fc_fl, "sigm")
-            sq = model.SquaredL2Distance([sigm, "label"], "sq")
-            loss = model.AveragedLoss(sq, "loss")
-            loss = model.Scale(loss, scale=loss_scale)
-
-            # For testing explicit sync
-            model.param_init_net.UniformFill([], ["sync_num"], shape=[1])
-            return [loss]
-
-        def add_optimizer(model):
-            return optimizer.build_sgd(
-                model,
-                0.1,
-                policy="fixed",
-                max_gradient_norm=5.0,
-                allow_lr_injection=True,
-            )
-
-        workspace.ResetWorkspace()
-        model = cnn.CNNModelHelper(
-            order="NHWC",
-            name="test{}".format(devices),
-        )
-        data_parallel_model.Parallelize(
-            model,
-            input_builder_fun=input_builder_fun,
-            forward_pass_builder_fun=model_build_fun,
-            optimizer_builder_fun=add_optimizer,
-            devices=devices,
-            cpu_device=not gpu,
-            shared_model=not gpu,
-            combine_spatial_bn=not gpu,
-        )
-        data_parallel_model.AddBlobSync(model, ["sync_num"])
-
-        # Light test for LR names
-        lr_names = data_parallel_model.GetLearningRateBlobNames(model)
-        self.assertGreater(len(lr_names), 0)
-
-        np.random.seed(2603)
-
-        # Each run has same input, independent of number of gpus
-        batch_size = 64
-        for i in range(0, 10):
-            full_data = np.random.rand(batch_size, 16)
-            full_labels = np.round(full_data[:, 0])
-            batch_per_device = batch_size // len(devices)
-
-            for (j, g) in enumerate(devices):
-                st = j * batch_per_device
-                en = st + batch_per_device
-                data = full_data[st:en, :].astype(np.float32)
-                labels = full_labels[st:en].astype(np.float32)
-                with core.DeviceScope(core.DeviceOption(model._device_type, g)):
-                    workspace.FeedBlob(
-                        "{}_{}/data".format(model._device_prefix, g), data
-                    )
-                    workspace.FeedBlob(
-                        "{}_{}/label".format(model._device_prefix, g), labels
-                    )
-
-            if i == 0:
-                workspace.RunNetOnce(model.param_init_net)
-                workspace.CreateNet(model.net)
-
-            workspace.FeedBlob(
-                model._device_prefix + "_0/sync_num",
-                np.array([i * 2]).astype(np.float32),
-                device_option=core.DeviceOption(model._device_type, 0))
-            workspace.RunNet(model.net.Proto().name)
-
-            # Test AddBlobSync
-            for j in model._devices:
-                sync = workspace.FetchBlob(
-                    model._device_prefix + "_{}/sync_num".format(j))[0]
-                self.assertTrue(abs(sync - i * 2) < 0.01)
-
-        return workspace.FetchBlob("{}_0/fc_w".format(model._device_prefix))
-
-    def run_test_locally(self, fn, device_option=None, **kwargs):
-        # Queue for assertion errors on subprocesses
-        queue = Queue()
-
-        # Capture any exception thrown by the subprocess
-        def run_fn(*args, **kwargs):
-            try:
-                if device_option is None:
-                    fn(*args, **kwargs)
-                    workspace.ResetWorkspace()
-                else:
-                    with core.DeviceScope(device_option):
-                        fn(*args, **kwargs)
-                        workspace.ResetWorkspace()
-            except Exception as ex:
-                queue.put(ex)
-
-        # Start N processes in the background
-        procs = []
-        for i in range(kwargs['comm_size']):
-            kwargs['comm_rank'] = i
-            proc = Process(
-                target=run_fn,
-                kwargs=kwargs)
-            proc.start()
-            procs.append(proc)
-
-        # Test complete, join background processes
-        while len(procs) > 0:
-            proc = procs.pop(0)
-            while proc.is_alive():
-                proc.join(1)
-
-        # Raise exception if we find any.
- # Note that the following is executed ALSO after - # the last process was joined, so if ANY exception - # was raised, it will be re-raised here. - if not queue.empty(): - raise queue.get() - - def test_equiv(self): - ''' - Test that the model produces exactly same results given - total batchsize, independent of number of GPUs. - ''' - for gpu in [True, False]: - if gpu and (not workspace.has_gpu_support or - workspace.NumCudaDevices() < 2): - continue - result_2gpus = self.run_model([0, 1], gpu=gpu) - result_1gpus = self.run_model([0], gpu=gpu) - - self.assertTrue(np.allclose(result_1gpus, result_2gpus)) - - if not gpu or workspace.NumCudaDevices() >= 4: - result_4gpus = self.run_model(list(range(4)), gpu=gpu) - self.assertTrue(np.allclose(result_1gpus, result_4gpus)) - - if not gpu or workspace.NumCudaDevices() >= 8: - result_8gpus = self.run_model(list(range(8)), gpu=gpu) - self.assertTrue(np.allclose(result_1gpus, result_8gpus)) - - if not gpu or workspace.NumCudaDevices() >= 16: - result_16gpus = self.run_model(list(range(16)), gpu=gpu) - self.assertTrue(np.allclose(result_1gpus, result_16gpus)) - - def test_checkpoint_params(self): - def add_input_ops(model): - pass - - def add_model_ops(model, loss_scale): - model.NHWC2NCHW("data", "data_nchw") - model.Conv("data_nchw", 'conv1', 3, 64, - weight_init=("MSRAFill", {}), kernel=7, - stride=2, pad=3, no_bias=0) - model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3, is_test=False) - model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu') - model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2) - model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100) - model.Sigmoid('fc', 'fc_sigm') - model.Softmax('fc_sigm', 'softmax') - model.LabelCrossEntropy(['softmax', 'label'], 'xent') - loss = model.AveragedLoss('xent', 'loss') - - # Add a duplicate param init to ensure it does not cause issues - model.param_init_net.ConstantFill( - [], ["fc_w"], shape=((64 * 56 * 56), 1000) - ) - return [loss] - - def add_optimizer(model): - optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9) - - model = cnn.CNNModelHelper( - order="NHWC", - name="test", - ) - data_parallel_model.Parallelize_CPU( - model, - input_builder_fun=add_input_ops, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=[1, 2, 3], - ) - - # Only gpu_1 params should be returned (gpu_1 is the first gpu) - checkpoint_params = data_parallel_model.GetCheckpointParams(model) - for p in model.GetParams("cpu_1/"): - self.assertTrue(p in checkpoint_params) - self.assertTrue(p + "_momentum" in checkpoint_params) - for p in model.GetParams("cpu_2/"): - self.assertFalse(p in checkpoint_params) - self.assertTrue( - core.BlobReference("cpu_1/fc_w_momentum") in checkpoint_params) - for c in model.GetComputedParams("cpu_1/"): - self.assertTrue(c in checkpoint_params) - for c in model.GetComputedParams("cpu_2/"): - self.assertFalse(c in checkpoint_params) - self.assertFalse(core.BlobReference("cpu_1/data") in checkpoint_params) - self.assertTrue(core.BlobReference("optimizer_iteration") in checkpoint_params) - - def test_net_conversion_and_append_net(self): - other = model_helper.ModelHelper() - fc1 = brew.fc(other, "data", "other_fc1", dim_in=3*227*227, dim_out=10) - fc2 = brew.fc(other, fc1, "other_fc2", dim_in=10, dim_out=10) - brew.fc(other, fc2, "other_fc3", dim_in=10, dim_out=10) - - def add_input_ops(model): - model.net.UniformFill([], ["data"], shape=[4, 227, 227, 3]) - model.net.UniformFill([], ["label"], shape=[4]) - - def 
add_model_ops(model, loss_scale): - model.NHWC2NCHW("data", "data_nchw") - model.Conv("data_nchw", 'conv1', 3, 64, - weight_init=("MSRAFill", {}), kernel=7, - stride=2, pad=3, no_bias=0) - model.SpatialBN('conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3, is_test=False) - model.Relu('conv1_spatbn_relu', 'conv1_spatbn_relu') - model.MaxPool('conv1_spatbn_relu', 'pool1', kernel=3, stride=2) - model.FC('pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=10) - - # Append the net and param_init_net of the other model - appendnet = data_parallel_model.ConvertNetForDevice(other.net) - model.net.AppendNet(appendnet) - - model.param_init_net.AppendNet( - data_parallel_model.ConvertNetForDevice(other.param_init_net)) - - model.Sigmoid('fc', 'fc_sigm') - model.Softmax('fc_sigm', 'softmax') - loss = model.AveragedLoss('softmax', 'loss') - return [loss] - - def add_optimizer(model): - optimizer.build_sgd(model, 0.1, policy="fixed", momentum=0.9) - - model = cnn.CNNModelHelper( - order="NCHW", - name="test", - ) - data_parallel_model.Parallelize_CPU( - model, - input_builder_fun=add_input_ops, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=range(4) - ) - - # Just create and run net and confirm no exception is thrown - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - workspace.RunNet(model.net) - - @unittest.skip("Test fails on GPU/RE") - def test_synchronization_barrier(self): - def run(comm_rank, comm_size, tmpdir): - def add_input_ops(model): - pass - - def add_model_ops(model, loss_scale): - return [] - - def add_optimizer(model): - pass - - store_handler = "store_handler" - workspace.RunOperatorOnce( - core.CreateOperator( - "FileStoreHandlerCreate", - [], - [store_handler], - path=tmpdir)) - rendezvous = dict( - kv_handler=store_handler, - shard_id=comm_rank, - num_shards=comm_size, - engine='GLOO', - ) - - model = cnn.CNNModelHelper( - order="NHWC", - name="test", - ) - data_parallel_model.Parallelize_CPU( - model, - input_builder_fun=add_input_ops, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=[1, 2, 3], - rendezvous=rendezvous - ) - data_parallel_model.RunInitNet(model) - - for _ in range(2): - data_parallel_model.Synchronize(model) - - with TemporaryDirectory() as tmpdir: - self.run_test_locally( - run, - comm_size=2, - device_option=None, - tmpdir=tmpdir) - - @unittest.skip("Test fails on GPU/RE") - def test_pre_train_synchronization_barrier(self): - def run(comm_rank, comm_size, tmpdir): - def add_input_ops(model): - pass - - def add_model_ops(model, loss_scale): - return [] - - def add_optimizer(model): - pass - - workspace.ResetWorkspace() - store_handler = "store_handler" - workspace.RunOperatorOnce( - core.CreateOperator( - "FileStoreHandlerCreate", - [], - [store_handler], - path=tmpdir)) - rendezvous = dict( - kv_handler=store_handler, - shard_id=comm_rank, - num_shards=comm_size, - engine='GLOO', - ) - - model = cnn.CNNModelHelper( - order="NHWC", - name="test", - ) - # Set network timeout to 2 seconds, and add a 3 seconds - # sleep for 1 host. Make sure there is no timeout on the - # second RunNet. 
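- # (Note: the sleep below is actually _DEFAULT_TIMEOUT_SEC = 2 seconds, on - # rank 0 only; the explicit barrier_net_timeout_sec=5 passed to - # Parallelize_CPU is what keeps the second RunNet from timing out while - # the other rank waits at the barrier.)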
- data_parallel_model._DEFAULT_TIMEOUT_SEC = 2 - data_parallel_model.Parallelize_CPU( - model, - input_builder_fun=add_input_ops, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=[1, 2, 3], - rendezvous=rendezvous, - barrier_net_timeout_sec=5 - ) - data_parallel_model.RunInitNet(model) - data_parallel_model.RunNet(model, 2) - if comm_rank == 0: - time.sleep(data_parallel_model._DEFAULT_TIMEOUT_SEC) - data_parallel_model.RunNet(model, 2) - - with TemporaryDirectory() as tmpdir: - self.run_test_locally( - run, - comm_size=2, - device_option=None, - tmpdir=tmpdir) - - def test_device_scope_check(self): - with self.assertRaises(AssertionError): - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - data_parallel_model.Parallelize_GPU(None, None, None) - - def test_net_transformer_function(self): - devices = [1, 2, 3] - - def add_input_ops(model): - model.param_init_net.UniformFill([], ["data"], shape=[32, 8]) - - def add_optimizer(model): - optimizer.build_sgd(model, 0.1) - - def add_model_ops(model, loss_scale): - fc1 = brew.fc(model, "data", "fc1", dim_in=8, dim_out=8) - return [fc1] - - kwargs = { - 'input_builder_fun': add_input_ops, - 'forward_pass_builder_fun': add_model_ops, - 'devices': devices, - } - - # assert that the transformer is called for both train and test cases - transform = Mock() - kwargs['net_transformer_fun'] = transform - model = model_helper.ModelHelper(name="r", init_params=False) - data_parallel_model.Parallelize_CPU(model, **kwargs) - self.assertTrue(transform.called) - self.assertEqual(transform.call_count, 1) - - transform = Mock() - kwargs['net_transformer_fun'] = transform - kwargs['optimizer_builder_fun'] = add_optimizer - model = model_helper.ModelHelper(name="r", init_params=True) - data_parallel_model.Parallelize_CPU(model, **kwargs) - self.assertTrue(transform.called) - self.assertEqual(transform.call_count, 1) - - @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) - @settings(deadline=2000) - def test_multi_device_bn_op_level_cpu(self, seed, batch_size): - self._bn_check_op_level("cpu", seed, batch_size) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") - @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.") - @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) - @settings(deadline=2000) - def test_multi_device_bn_op_level_gpu(self, seed, batch_size): - self._bn_check_op_level("gpu", seed, batch_size) - - def _bn_check_op_level(self, device_type, seed, batch_size): - ''' - Test multi device batch normalization at the operation level. This is - done by checking the outputs of batch normalization and its gradient - operator. We compare values produced with our manually calculated - batch normalization values and gradients. 
- ''' - devices = [0, 1] - epsilon = 1e-3 - tolerance = 1e-3 - - def _test_forward_pass(x, devices, device_type, scale, bias, epsilon): - x_concat = np.concatenate(x) - mean = np.mean(x_concat, axis=0) - var = np.var(x_concat, axis=0) - for device in devices: - x_i = x[device] - x_hat = (x_i - mean) / (np.sqrt(var + epsilon)) - expected_out = scale * x_hat + bias - spatial_out = workspace.FetchBlob( - "{}_{}/bn_out".format(device_type, device)) - rel_error = np.linalg.norm(spatial_out - expected_out) \ - / np.linalg.norm(expected_out) - self.assertTrue(rel_error < 0.005) - - def _test_backward_pass(x, devices, device_type, scale, tolerance): - dBias_arr = [] - dY_arr = [] - dGamma_arr = [] - num_devices = len(devices) - mean = np.array(workspace.FetchBlob( - "{}_0/bn_out_sm".format(device_type)), dtype=np.float32) - inv_var = np.array(workspace.FetchBlob( - "{}_0/bn_out_siv".format(device_type)), dtype=np.float32) - - # dBias - # Sum dBias values over all devices to find the average gradient - for device in devices: - dY_blob = workspace.FetchBlob( - "{}_{}/bn_out_grad".format(device_type, device)) - dY = np.array(dY_blob, dtype=np.float32) - dY_arr.append(dY) - dBias_arr.append(np.array(np.sum(dY, axis=0), dtype=np.float32)) - dBias = np.sum(dBias_arr, dtype=np.float32) - dBias_avg = dBias / num_devices - for device in devices: - dBiasActual = np.sum(workspace.FetchBlob("{}_{}/bn_out_b_grad" - .format(device_type, device)), dtype=np.float32) - self.assertTrue(np.isclose([dBiasActual], [dBias], atol=tolerance)) - - # dGamma - # Sum dGamma values over all devices to find the average gradient - for device in devices: - dGamma = np.sum((x[device] - mean) * inv_var * dY_arr[device], - axis=0, dtype=np.float32) - dGamma_arr.append(dGamma) - dGamma = np.sum(dGamma_arr, axis=0, dtype=np.float32) - dGamma_avg = dGamma / num_devices - for device in devices: - dGammaActual = workspace.FetchBlob( - "{}_{}/bn_out_s_grad".format(device_type, device)) - self.assertTrue(np.isclose([dGamma], [dGammaActual], atol=tolerance)) - - # dX - scale_inv_var = scale * inv_var / batch_size - for device in devices: - dX = scale_inv_var * (dY_arr[device] * batch_size - dBias_avg - - (x[device] - mean) * dGamma_avg * inv_var) - dX_actual = workspace.FetchBlob( - "{}_{}/tanh_grad".format(device_type, device)) - self.assertTrue(np.isclose([dX], [dX_actual], atol=tolerance).all()) - - def add_input_ops(model): - for device in devices: - data = np.random.rand(batch_size, 1, 1, 1).astype(np.float32) - workspace.FeedBlob("{}_{}/data".format(device_type, device), data) - - def add_model_ops(model, loss_scale): - if device_type == "gpu": - model.CopyCPUToGPU("data", "device_data") - model.Tanh("device_data", "tanh") - else: - model.Tanh("data", "tanh") - model.SpatialBN("tanh", "bn_out", 1, epsilon=epsilon, is_test=False) - model.Sqr("bn_out", "sqr") - loss = model.SumElements("sqr", "loss") - return [loss] - - def add_optimizer(model): - return optimizer.build_sgd(model, 0.1) - - np.random.seed(seed) - workspace.ResetWorkspace() - model = cnn.CNNModelHelper( - order="NCHW", - name="test" - ) - data_parallel_model.Parallelize( - model, - input_builder_fun=add_input_ops, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=devices, - cpu_device=device_type == "cpu", - shared_model=False, - combine_spatial_bn=True, - ) - - workspace.RunNetOnce(model.param_init_net) - scale = workspace.FetchBlob("{}_0/bn_out_s".format(device_type)) - bias = 
workspace.FetchBlob("{}_0/bn_out_b".format(device_type)) - workspace.RunNetOnce(model.net) - - x = [] - for device in devices: - x_blob = workspace.FetchBlob("{}_{}/tanh".format(device_type, device)) - x_i = np.array(x_blob, dtype=np.float32) - x.append(x_i) - - _test_forward_pass(x, devices, device_type, scale, bias, epsilon) - _test_backward_pass(x, devices, device_type, scale, tolerance) - - @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) - @settings(deadline=2000) - def test_multi_device_bn_net_lvl_cpu(self, seed, batch_size): - if batch_size % 2 == 1: - batch_size += 1 - self._test_multi_device_bn_net_lvl("cpu", seed, batch_size) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") - @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.") - @given(seed=st.integers(0, 65535), batch_size=st.integers(1, 20)) - @settings(deadline=2000) - def test_multi_device_bn_net_lvl_gpu(self, seed, batch_size): - if batch_size % 2 == 1: - batch_size += 1 - self._test_multi_device_bn_net_lvl("gpu", seed, batch_size) - - def _test_multi_device_bn_net_lvl(self, device_type, seed, batch_size): - ''' - Test multi device batch normalization at the net level. This is done - by verifying that the final batch normalization outputs and the - gradient outputs from multiple devices are the same as those produced - from a single device - ''' - - # Verify that the gradients calculated over multiple devices are the - # same as the gradients calculated over one device. These values should - # be equivalent because combine_spatial_bn sums values over all devices - def _verify_bn_outputs( - devices, - device_type, - tolerance, - single_device_bn_out, - two_device_bn_out_vals, - single_device_grads, - two_device_grads, - ): - two_device_bn_out = np.concatenate(two_device_bn_out_vals) - self.assertTrue(np.isclose( - [single_device_bn_out], [two_device_bn_out], atol=tolerance).all()) - - # Scalar and Bias gradients should be the same across devices - gradient_names = ["bn_out_s_grad", "bn_out_b_grad"] - for name in gradient_names: - expected_grad = single_device_grads[name] - for device in devices: - actual_grad = two_device_grads[device][name] - self.assertTrue( - np.isclose([actual_grad], [expected_grad], atol=tolerance)) - - # Expected tanh_grad should be the combined tanh_grad vectors - # across the devices - first_grad = two_device_grads[0]["tanh_grad"] - second_grad = two_device_grads[1]["tanh_grad"] - actual_grad = np.concatenate([first_grad, second_grad]) - expected_grad = single_device_grads["tanh_grad"] - rel_error = np.linalg.norm(actual_grad - expected_grad) \ - / np.linalg.norm(expected_grad) - self.assertTrue(rel_error < 1e-3) - - def _create_model(multiple_devices): - def add_input_ops_no_combine(model): - workspace.FeedBlob("{}_0/data".format(device_type), data) - - def add_input_ops_combine(model): - half = int(batch_size / 2) - workspace.FeedBlob("{}_0/data".format(device_type), data[:half]) - workspace.FeedBlob("{}_1/data".format(device_type), data[half:]) - - def add_model_ops(model, loss_scale): - if device_type == "gpu": - model.CopyCPUToGPU("data", "device_data") - model.Tanh("device_data", "tanh") - else: - model.Tanh("data", "tanh") - model.SpatialBN("tanh", "bn_out", 1, epsilon=epsilon, is_test=False) - model.Sqr("bn_out", "sqr") - loss = model.SumElements("sqr", "loss") - return [loss] - - def add_optimizer(model): - return optimizer.build_sgd(model, 0.1) - - if multiple_devices: - input_fun = add_input_ops_combine - devices = [0, 1] - 
combine_spatial_bn = True - else: - input_fun = add_input_ops_no_combine - devices = [0] - combine_spatial_bn = False - model = cnn.CNNModelHelper( - order="NCHW", - name="test" - ) - data_parallel_model.Parallelize( - model, - input_builder_fun=input_fun, - forward_pass_builder_fun=add_model_ops, - optimizer_builder_fun=add_optimizer, - devices=devices, - cpu_device=device_type == "cpu", - shared_model=False, - combine_spatial_bn=combine_spatial_bn, - ) - return model - - devices = [0, 1] - epsilon = 1e-3 - tolerance = 1e-3 - # We are generating random data - np.random.seed(seed) - data = np.random.rand(batch_size, 1, 1, 1).astype(np.float32) - data = np.reshape(data, (batch_size, 1, 1, 1)) - - # Get values calculated without combine_spatial_bn - workspace.ResetWorkspace() - model_no_combine = _create_model(multiple_devices=False) - workspace.RunNetOnce(model_no_combine.param_init_net) - workspace.RunNetOnce(model_no_combine.net) - single_device_bn_out = workspace.FetchBlob("{}_0/bn_out".format(device_type)) - single_device_grads = {} - single_device_grads["bn_out_s_grad"] = workspace.FetchBlob( - "{}_0/bn_out_s_grad".format(device_type)) - single_device_grads["bn_out_b_grad"] = workspace.FetchBlob( - "{}_0/bn_out_b_grad".format(device_type)) - single_device_grads["tanh_grad"] = workspace.FetchBlob( - "{}_0/tanh_grad".format(device_type)) - - # Get values calculated over multiple devices with combine_spatial_bn true - workspace.ResetWorkspace() - model_combine = _create_model(multiple_devices=True) - workspace.RunNetOnce(model_combine.param_init_net) - workspace.RunNetOnce(model_combine.net) - two_device_bn_out_vals = [] - two_device_grads = {} - for device in devices: - bn_out_blob = "{}_{}/bn_out".format(device_type, device) - two_device_bn_out_vals.append(workspace.FetchBlob(bn_out_blob)) - two_device_grads[device] = {} - two_device_grads[device]["bn_out_s_grad"] = workspace.FetchBlob( - "{}_{}/bn_out_s_grad".format(device_type, device)) - two_device_grads[device]["bn_out_b_grad"] = workspace.FetchBlob( - "{}_{}/bn_out_b_grad".format(device_type, device)) - two_device_grads[device]["tanh_grad"] = workspace.FetchBlob( - "{}_{}/tanh_grad".format(device_type, device)) - - # Check to see if the combined values are equivalent - _verify_bn_outputs( - devices, - device_type, - tolerance, - single_device_bn_out, - two_device_bn_out_vals, - single_device_grads, - two_device_grads - ) - -class RecurrentNetworkParallelTest(TestCase): - - def run_model(self, devices, gpu): - - ''' - Helper function for test_equiv - ''' - def input_builder_fun(model): - return None - - def model_build_fun(model, loss_scale): - workspace.FeedBlob( - core.ScopedBlobReference("seq_lengths"), - np.array([self.T] * self.batch_per_device, dtype=np.int32) - ) - model.param_init_net.ConstantFill( - [], - "hidden_init", - value=0.0, - shape=[1, self.batch_per_device, self.hidden_dim] - ) - model.param_init_net.ConstantFill( - [], - "cell_init", - value=0.0, - shape=[1, self.batch_per_device, self.hidden_dim] - ) - - output, _last_hidden, _, _last_state, = rnn_cell.LSTM( - model=model, - input_blob="data", - seq_lengths="seq_lengths", - initial_states=("hidden_init", "cell_init"), - dim_in=self.input_dim, - dim_out=self.hidden_dim, - scope="partest", - ) - - # A silly loss function - loss = model.AveragedLoss( - model.Sub([output, "target"], "dist"), - "loss", - ) - loss = model.Scale(loss, "loss_scaled", scale=loss_scale) - return [loss] - - def param_update_fun(model): - ITER = model.Iter("ITER") - LR = 
model.net.LearningRate( - [ITER], - "LR", - base_lr=(-0.1), - policy="fixed", - ) - ONE = model.param_init_net.ConstantFill( - [], "ONE", shape=[1], value=1.0, - ) - for param in model.GetParams(): - param_grad = model.param_to_grad[param] - model.WeightedSum([param, ONE, param_grad, LR], param) - - assert len(model.GetParams()) == len(model.params) // len(model._devices) - - workspace.ResetWorkspace() - model = cnn.CNNModelHelper( - name="recurrent_test{}".format(devices), - ) - - self.T = 8 - self.batch_size = 64 - self.input_dim = 8 - self.hidden_dim = 31 - self.batch_per_device = self.batch_size // len(devices) - - data_parallel_model.Parallelize( - model, - input_builder_fun=input_builder_fun, - forward_pass_builder_fun=model_build_fun, - param_update_builder_fun=param_update_fun, - devices=devices, - optimize_gradient_memory=True, - cpu_device=not gpu, - ) - - # Change all initialization to be ConstantFills so that - # the everything is deterministic - for op in model.param_init_net.Proto().op: - if op.type.endswith('Fill'): - op.type = 'ConstantFill' - - # Each run has same input, independent of number of gpus - np.random.seed(20150210) - for i in range(0, 10): - full_data = np.random.rand(self.T, self.batch_size, self.input_dim) - full_target = np.random.rand( - self.T, self.batch_size, self.hidden_dim - ) - - for (j, g) in enumerate(devices): - st = j * self.batch_per_device - en = st + self.batch_per_device - data = full_data[:, st:en, :].astype(np.float32) - targets = full_target[:, st:en, :].astype(np.float32) - with core.DeviceScope(core.DeviceOption(model._device_type, g)): - workspace.FeedBlob( - "{}_{}/data".format(model._device_prefix, g), data - ) - workspace.FeedBlob( - "{}_{}/target".format(model._device_prefix, g), targets - ) - - if i == 0: - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - workspace.RunNet(model.net.Proto().name) - - return workspace.FetchBlob("{}_0/partest/i2h_w".format(model._device_prefix)) - - @unittest.skip("Test is flaky: https://github.com/pytorch/pytorch/issues/10322") - def test_equiv_recurrent(self): - ''' - Test that the model produces exactly same results given - total batchsize, independent of number of GPUs/CPUs. - ''' - for gpu in [True, False]: - if gpu and not workspace.has_gpu_support: - continue - result_2gpus = self.run_model([0, 1], gpu) - result_1gpus = self.run_model([0], gpu) - - self.assertTrue(np.allclose(result_1gpus, result_2gpus)) - - if not gpu or workspace.NumCudaDevices() >= 4: - result_4gpus = self.run_model(list(range(4)), gpu) - self.assertTrue(np.allclose(result_1gpus, result_4gpus)) - - if not gpu or workspace.NumCudaDevices() >= 8: - result_8gpus = self.run_model(list(range(8)), gpu) - self.assertTrue(np.allclose(result_1gpus, result_8gpus)) - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -@unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPUs.") -class SparseDataParallelModelTest(TestCase): - - ''' - Create and run the model. 
We try with both storing indices for gather - on CPU and on GPU - ''' - def run_model(self, V, gpu_devices, cpu_indices): - - def input_builder_fun(model): - return None - - def model_build_fun(model, loss_scale): - if cpu_indices: - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - gathered_cpu = model.net.Gather( - [self.vecs, 'indices'], 'gathered_cpu') - - gathered = model.CopyCPUToGPU(gathered_cpu, "gathered") - else: - gpu_vecs = model.param_init_net.CopyCPUToGPU( - self.vecs, "gpuvecs", - ) - model.params.append(gpu_vecs) - gathered = model.net.Gather([gpu_vecs, 'indices'], 'gathered') - flattened = model.Flatten(gathered, "flattened") - fc = model.FC(flattened, "fc", 16 * 16, 1, - ("ConstantFill", {}), ("ConstantFill", {})) - fc_fl = model.FlattenToVec(fc, "fc_fl") - sigm = model.Sigmoid(fc_fl, "sigm") - sq = model.SquaredL2Distance([sigm, "label"], "sq") - loss = model.AveragedLoss(sq, "loss") - loss = model.Scale(loss, scale=loss_scale) - return [loss] - - def param_update_fun(model): - ONE = model.param_init_net.ConstantFill( - [], "ONE", shape=[1], value=1.0, - ) - LR = model.CopyCPUToGPU(self.LR, "LR") - for param in model.GetParams(): - param_grad = model.param_to_grad[param] - if not isinstance(param_grad, core.GradientSlice): - model.WeightedSum([param, ONE, param_grad, LR], param) - else: - param_momentum = model.param_init_net.ConstantFill( - [param], - param + '_momentum', - value=0.0, - ) - model.net.SparseMomentumSGDUpdate( - [ - param_grad.values, - param_momentum, - LR, - param, - param_grad.indices, - ], - [ - param_grad.values, param_momentum, param - ], - momentum=0.1, - nesterov=0, - ) - - workspace.ResetWorkspace() - model = cnn.CNNModelHelper( - order="NHWC", - name="sparse_test{}".format(gpu_devices), - ) - - with core.NameScope("cpu"): - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - self.ITER = model.Iter("ITER") - self.LR = model.net.LearningRate( - [self.ITER], - "LR", - base_lr=(-0.1), - policy="fixed", - ) - self.vecs = model.param_init_net.UniformFill( - [], "vecs", shape=[V, 16]) - if cpu_indices: - model.params.append(self.vecs) - self.ONE_CPU = model.param_init_net.ConstantFill( - [], "ONE_CPU", shape=[1], value=1.0, - ) - - data_parallel_model.Parallelize_GPU( - model, - input_builder_fun=input_builder_fun, - forward_pass_builder_fun=model_build_fun, - param_update_builder_fun=param_update_fun, - devices=gpu_devices, - ) - - # Update the vecs - if cpu_indices: - with core.NameScope("cpu"): - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - for param in model.GetParams(): - param_grad = model.param_to_grad[param] - model.ScatterWeightedSum([param, self.ONE_CPU, - param_grad.indices, - param_grad.values, - self.LR], - self.vecs) - else: - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - model.CopyGPUToCPU("gpu_0/gpuvecs", self.vecs) - - np.random.seed(2603) - - # Each run has same input, independent of number of gpus - batch_size = 64 - for i in range(0, 10): - full_indices = np.random.permutation(V)[:batch_size * 16].reshape( - batch_size, 16 - ) - full_labels = full_indices[:, 0] % 2 - batch_per_device = batch_size // len(gpu_devices) - - for (j, g) in enumerate(gpu_devices): - st = j * batch_per_device - en = st + batch_per_device - indices = full_indices[st:en, :].astype(np.int32) - labels = full_labels[st:en].astype(np.float32) - - device_for_indices = core.DeviceOption(caffe2_pb2.CPU) - if not cpu_indices: - device_for_indices = core.DeviceOption(workspace.GpuDeviceType, g) - - with 
core.DeviceScope(device_for_indices): - workspace.FeedBlob("gpu_{}/indices".format(g), indices) - - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, g)): - workspace.FeedBlob("gpu_{}/label".format(g), labels) - - if i == 0: - workspace.RunNetOnce(model.param_init_net) - # Force vecs to be same on all runs - orig_vecs = np.random.rand(V, 16).astype(np.float32) - workspace.FeedBlob( - self.vecs, - orig_vecs - ) - if not cpu_indices: - for g in gpu_devices: - workspace.FeedBlob( - "gpu_{}/gpuvecs".format(g), - orig_vecs, - device_option=core.DeviceOption(workspace.GpuDeviceType, g), - ) - workspace.CreateNet(model.net) - - workspace.RunNet(model.net.Proto().name) - if len(gpu_devices) == 2: - if not cpu_indices: - idx = workspace.FetchBlob("gpu_0/indices") - idx = list(idx.flatten()) - n = len(idx) - nu = len(set(idx)) - assert n == nu, "We cannot have duplicate indices" - - # Sanity check to see the vecs were updated - self.assertFalse( - np.allclose(workspace.FetchBlob(self.vecs), orig_vecs)) - return [workspace.FetchBlob(self.vecs if cpu_indices else "gpu_0/gpuvecs"), - workspace.FetchBlob("gpu_0/fc_w")] - - def _test_equiv_sparse(self, cpu_indices): - ''' - Test that the model produces exactly same results given - total batchsize, independent of number of GPUs. - ''' - V = 10000 - result_2gpus = self.run_model(V, [0, 1], cpu_indices) - result_1gpus = self.run_model(V, [0], cpu_indices) - - self.assertTrue(np.allclose(result_1gpus[0], result_2gpus[0])) - self.assertTrue(np.allclose(result_1gpus[1], result_2gpus[1])) - - if workspace.NumCudaDevices() >= 4: - result_4gpus = self.run_model(V, list(range(4)), cpu_indices) - self.assertTrue(np.allclose(result_1gpus[0], result_4gpus[0])) - self.assertTrue(np.allclose(result_1gpus[1], result_4gpus[1])) - - if workspace.NumCudaDevices() >= 8: - result_8gpus = self.run_model(V, list(range(8)), cpu_indices) - self.assertTrue(np.allclose(result_1gpus[0], result_8gpus[0])) - self.assertTrue(np.allclose(result_1gpus[1], result_8gpus[1])) - - def test_equiv_sparse(self): - self._test_equiv_sparse(True) - self._test_equiv_sparse(False) - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -@unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPUs.") -class ParallelizeBMUFTest(TestCase): - - def _run_model(self, gpu_devices): - ''' - Helper function for test_equiv - ''' - def input_builder_fun(model): - return None - - def _model_build_fun(self, model, loss_scale): - fc = model.FC( - "data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {}) - ) - fc_fl = model.FlattenToVec(fc, "fc_fl") - sigm = model.Sigmoid(fc_fl, "sigm") - sq = model.SquaredL2Distance([sigm, "label"], "sq") - loss = model.AveragedLoss(sq, "loss") - loss = model.Scale(loss, scale=loss_scale) - - return [loss] - - def _param_update_fun(self, model): - ITER = model.Iter("ITER") - LR = model.net.LearningRate( - [ITER], - "LR", - base_lr=(-0.1), - policy="fixed", - ) - ONE = model.param_init_net.ConstantFill( - [], "ONE", shape=[1], value=1.0, - ) - for param in model.GetParams(): - grad = model.param_to_grad[param] - model.WeightedSum([param, ONE, grad, LR], param) - - def _generate_data(self, devices, device_type, device_prefix): - np.random.seed(26) - # Each run has same input, independent of number of gpus - batch_size = 64 - for _ in range(0, 10): - full_data = np.random.rand(batch_size, 16) - full_labels = np.round(full_data[:, 0]) - batch_per_device = batch_size // len(devices) - - for (j, g) in enumerate(devices): - st = j * 
batch_per_device - en = st + batch_per_device - data = full_data[st:en, :].astype(np.float32) - labels = full_labels[st:en].astype(np.float32) - with core.DeviceScope(core.DeviceOption(device_type, g)): - workspace.FeedBlob("{}_{}/data".format(device_prefix, g), data) - workspace.FeedBlob("{}_{}/label".format(device_prefix, g), labels) - - @given( - cpu_device=st.booleans() - ) - @settings(deadline=2000) - def test_parallelize_bmuf(self, cpu_device): - assume(cpu_device or workspace.has_gpu_support or workspace.has_hip_support) - - workspace.ResetWorkspace() - - model = cnn.CNNModelHelper( - order="NHWC", - name="test" - ) - devices = [0, 1] - - def input_builder_fun(model): - return None - - if not cpu_device: - device_type = workspace.GpuDeviceType - device_prefix = "gpu" - else: - device_type = caffe2_pb2.CPU - device_prefix = "cpu" - self._generate_data(devices, device_type, device_prefix) - - data_parallel_model.Parallelize_BMUF( - model, - input_builder_fun, - self._model_build_fun, - self._param_update_fun, - devices=devices, - cpu_device=cpu_device - ) - - data_parallel_model.RunInitNet(model) - - # Check initial momentum params are zeros - self.assertEqual( - list(model._device_grouped_blobs.keys()), ['fc_w', 'fc_b'] - ) - self.assertEqual(workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix)), 0) - np.testing.assert_equal( - workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix)), - np.zeros(16).astype(np.float32).reshape(1, 16) - ) - - # Run the algorithm for one iteration to have non-zero params. - data_parallel_model.RunNet(model, 1) - - # Save iteration momentum and post local update params - v_b_ = workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix)) - v_w_ = workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix)) - - workspace.RunNetOnce(model.net) - - b_0_ = workspace.FetchBlob('{}_0/fc_b'.format(device_prefix)) - w_0_ = workspace.FetchBlob('{}_0/fc_w'.format(device_prefix)) - b_1_ = workspace.FetchBlob('{}_1/fc_b'.format(device_prefix)) - w_1_ = workspace.FetchBlob('{}_1/fc_w'.format(device_prefix)) - - # Compute block gradients. - b_g_ = workspace.FetchBlob('{}_0/fc_b_g'.format(device_prefix)) - w_g_ = workspace.FetchBlob('{}_0/fc_w_g'.format(device_prefix)) - workspace.RunNetOnce(model._global_model_param_updates_net) - - g_b = (b_0_ + b_1_) / 2 - b_g_ - g_w = (w_0_ + w_1_) / 2 - w_g_ - v_b = workspace.FetchBlob('{}_0/fc_b_v'.format(device_prefix)) - v_w = workspace.FetchBlob('{}_0/fc_w_v'.format(device_prefix)) - - w_g = workspace.FetchBlob('{}_0/fc_w_g'.format(device_prefix)) - b_g = workspace.FetchBlob('{}_0/fc_b_g'.format(device_prefix)) - w_0 = workspace.FetchBlob('{}_0/fc_w'.format(device_prefix)) - b_0 = workspace.FetchBlob('{}_0/fc_b'.format(device_prefix)) - w_1 = workspace.FetchBlob('{}_1/fc_w'.format(device_prefix)) - b_1 = workspace.FetchBlob('{}_1/fc_b'.format(device_prefix)) - - # Check momentum update step - np.testing.assert_equal(v_b, 0.5 * v_b_ + g_b) - np.testing.assert_equal(v_w, 0.5 * v_w_ + g_w) - - np.testing.assert_equal(w_g, w_0) - np.testing.assert_equal(w_g, w_1) - np.testing.assert_equal(b_g, b_0) - np.testing.assert_equal(b_g, b_1) - - # Check params update step - np.testing.assert_equal(w_0, w_g_ + v_w) - np.testing.assert_equal(b_0, b_g_ + v_b) - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -@unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPUs.") -class SparseDataParallelModelTestWithSharedIndices(TestCase): - - ''' - Create and run the model. 
We try with both storing indices for gather - on CPU and on GPU - ''' - def run_model(self, V, gpu_devices): - - def input_builder_fun(model): - return None - - def model_build_fun(model, loss_scale): - gpu_vecs_gathered = [] - gpu_vecs = [] - for num, vec in enumerate(self.vecs): - gpu_vec = model.param_init_net.CopyCPUToGPU( - vec, 'gpuvec_{}'.format(num), - ) - if num != 2: - model.params.append(gpu_vec) - gpu_vecs.append(gpu_vec) - for num, gpu_vec in enumerate(gpu_vecs): - gpu_vec_gathered = model.net.Gather( - [gpu_vec, 'indices'], - ['gpu_vec_gathered_{}'.format(num)] - ) - gpu_vecs_gathered.append(gpu_vec_gathered) - - assert len(gpu_vecs_gathered) == 3 - - fc = model.net.FC( - [ - gpu_vecs_gathered[2], - gpu_vecs_gathered[0], - gpu_vecs_gathered[1], - ], - ['fc'], - ) - _, loss = model.net.SoftmaxWithLoss( - [fc, 'label'], - ['ce_loss', 'avg_loss'], - only_loss=True, - ) - loss = model.Scale(loss, scale=loss_scale) - model.net.Print(loss, [], limit=10) - return [loss] - - def param_update_fun(model): - ONE = model.param_init_net.ConstantFill( - [], "ONE", shape=[1], value=1.0, - ) - LR = model.CopyCPUToGPU(self.LR, "LR") - for param in model.GetParams(): - param_grad = model.param_to_grad[param] - if not isinstance(param_grad, core.GradientSlice): - model.WeightedSum([param, ONE, param_grad, LR], param) - else: - model.net.ScatterWeightedSum( - [ - param, - ONE, - param_grad.indices, - param_grad.values, - ONE, - ], - param, - ) - - workspace.ResetWorkspace() - model = cnn.CNNModelHelper( - order="NHWC", - name="sparse_test{}".format(gpu_devices), - ) - batch_size = 32 - batch_per_device = batch_size // len(gpu_devices) - - with core.NameScope("cpu"): - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - self.ITER = model.Iter("ITER") - self.LR = model.net.LearningRate( - [self.ITER], - "LR", - base_lr=(-0.1), - policy="fixed", - ) - ''' - self.vecs consists of 3 big blobs on which we call Gather: - 1) FC weights, shape=(V, 16) - 2) FC bias, shape=(V) - 3) FC input, shape=(batch_per_device, 16) - ''' - self.vecs = [ - model.param_init_net.UniformFill( - [], "vec_{}".format(num), shape=[V, 16]) - for num in range(2) - ] - self.vecs.append( - model.param_init_net.UniformFill( - [], - "vec_2", shape=[batch_per_device, 16] - ) - ) - self.ONE_CPU = model.param_init_net.ConstantFill( - [], "ONE_CPU", shape=[1], value=1.0, - ) - - data_parallel_model.Parallelize_GPU( - model, - input_builder_fun=input_builder_fun, - forward_pass_builder_fun=model_build_fun, - param_update_builder_fun=param_update_fun, - devices=gpu_devices, - ) - - # Update the vecs - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - for num, vec in enumerate(self.vecs[:-1]): - model.CopyGPUToCPU("gpu_0/gpuvec_{}".format(num), vec) - - # Each run has same input, independent of number of gpus - for i in range(0, 10): - np.random.seed(2603) - full_indices = np.random.permutation(V)[:batch_size].reshape( - batch_size - ) - full_labels = full_indices[:] % batch_per_device - - for (j, g) in enumerate(gpu_devices): - st = j * batch_per_device - en = st + batch_per_device - indices = full_indices[st:en].astype(np.int32) - labels = full_labels[st:en].astype(np.int32) - - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, g)): - workspace.FeedBlob("gpu_{}/indices".format(g), indices) - workspace.FeedBlob("gpu_{}/label".format(g), labels) - - if i == 0: - workspace.RunNetOnce(model.param_init_net) - # Force vecs to be same on all runs - orig_vecs = [ - np.random.rand(V, 
16).astype(np.float32), - np.random.rand(V).astype(np.float32), - np.random.rand(V, 16).astype(np.float32), - ] - for vec, orig_vec in zip(self.vecs, orig_vecs): - workspace.FeedBlob( - vec, - orig_vec - ) - for g in gpu_devices: - for num, orig_vec in enumerate(orig_vecs): - workspace.FeedBlob( - "gpu_{}/gpuvec_{}".format(g, num), - orig_vec, - device_option=core.DeviceOption( - workspace.GpuDeviceType, g), - ) - workspace.CreateNet(model.net) - - workspace.RunNet(model.net.Proto().name) - - idx = workspace.FetchBlob('gpu_0/indices') - grad_slices = [ - workspace.FetchBlob( - 'gpu_{}/gpu_vec_gathered_{}_grad'.format(g, num)) - for g in gpu_devices for num in range(2) - ] - for grad_slice in grad_slices: - # print (len(idx), len(grad_slice)) - assert len(idx) == len(grad_slice), ( - 'Number of indices {} is not same as number of gradient ' - 'slices {}. This might lead to illegal memory access'.format( - len(idx), len(grad_slice) - ) - ) - - def test_sparse_shared_indices_gpu(self): - ''' - Test that the model has the same number of indices and gradient rows - given total batchsize, independent of number of GPUs. - ''' - V = 10000 - self.run_model(V, [0, 1]) - self.run_model(V, [0]) - - if workspace.NumGpuDevices() >= 4: - self.run_model(V, list(range(4))) - - if workspace.NumGpuDevices() >= 8: - self.run_model(V, list(range(8))) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/data_workers.py b/caffe2/python/data_workers.py deleted file mode 100644 index acdd8cfbb24b..000000000000 --- a/caffe2/python/data_workers.py +++ /dev/null @@ -1,461 +0,0 @@ -## @package data_workers -# Module caffe2.python.data_workers - - - - - - -''' -This module provides a python-land multithreaded data input mechanism -for Caffe2 nets. - -Basic usage is as follows: - coordinator = data_workers.init_data_input_workers( - net, - ["data", "label"], - my_fetch_fun, - batch_size=32, - input_source_name="train", - dont_rebatch=False - ) - ... - coordinator.start() - -The first argument is the Caffe2 net (or model helper), and the second argument -is a list of input blobs that are to be fed. - -Argument 'input_source_name' is used to distinguish different sources of data, -such as train or test data. This ensures the data does not get mixed up, -even if two nets share blobs. - -To do the actual data loading, one defines a "fetcher function" -that has the call signature - my_fetch_fun(worker_id, batch_size) - -Optionally, one can define an "init function" that is called once before -threads start, and has the call signature: - my_init_fun(data_coordinator, global_coordinator) - -If dont_rebatch is set to True, the data input is not batched into equal-sized -chunks; instead, the data directly provided by the fetchers is used. - -'batch_columns' can be used to specify which dimension is the batch dimension, -for each of the inputs. Default is 0 for all inputs. - -'timeout' is the timeout in seconds after which the net will fail if no data -is available (default 600s = 10 mins). - -The fetcher function returns a list of numpy arrays corresponding to the -different input blobs. In the example above, it would return two arrays, one -for the data blob and another for the labels. These arrays can have an -arbitrary number of elements (i.e. they do not need to match the batch size). -The batch size is provided to the function as a hint only. - -For example, a fetcher function could download images from a remote service or -load random images from a directory on a file system.
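As a concrete sketch of that fetcher contract (only the (worker_id, batch_size) signature and the list-of-ndarrays return value come from the description above; everything else is illustrative):

import numpy as np

def my_fetch_fun(worker_id, batch_size):
    # Produce one chunk of [data, label] arrays. The chunk size need not
    # equal batch_size -- the size is only a hint; the coordinator
    # rebatches chunks into exact batches unless dont_rebatch=True.
    n = np.random.randint(1, batch_size + 1)
    data = np.random.rand(n, 3).astype(np.float32)
    labels = np.round(data[:, 0]).astype(np.float32)
    return [data, labels]

All returned arrays must agree along their batch dimension (dimension 0 here, or whatever 'batch_columns' specifies).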
- -For a dummy example, see the data_workers_test unit test. - -Note that for data_parallel_models, init_data_input_workers will be called -for each GPU. Note that the 'coordinator' returned by the function is same -each time. -''' - -import queue as Queue -from itertools import chain -import logging -import threading -import numpy as np -import time - -from caffe2.python import workspace, core, scope, utils -from caffe2.proto import caffe2_pb2 -from caffe2.python.parallel_workers import Metrics, State, \ - WorkerCoordinator, GlobalWorkerCoordinator, Worker, run_worker - -log = logging.getLogger("data_workers") -log.setLevel(logging.INFO) -LOG_INT_SECS = 60 - - -def get_worker_ids(num_workers): - return list(range(0, num_workers)) - - -def init_data_input_workers( - net, - input_blob_names, - fetch_fun, - batch_size, - num_worker_threads=2, - input_source_name="train", - max_buffered_batches=800, - init_fun=None, - external_loggers=None, - dont_rebatch=False, - batch_columns=None, - timeout=600 -): - global global_coordinator - device_option = scope.CurrentDeviceScope() - if (device_option is None): - device_option = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CPU) - - metrics = Metrics(external_loggers) - batch_feeder = BatchFeeder( - net, - input_blob_names, - batch_size, - device_option, - scope.CurrentNameScope(), - input_source_name, - global_coordinator.get_queue(input_source_name, max_buffered_batches), - metrics, - dont_rebatch, - batch_columns, - timeout=timeout - ) - - # Launch fetch worker threads - worker_ids = [ - global_coordinator.get_new_worker_id() - for i in range(num_worker_threads) - ] - - # Create coordinator object - coordinator = WorkerCoordinator( - input_source_name, worker_ids, init_fun, batch_feeder) - - workers = [ - threading.Thread( - target=run_worker, - name="data_workers fetcher id {}".format(worker_id), - args=[coordinator, - DataWorker(coordinator, worker_id, fetch_fun, metrics, - batch_size, batch_feeder)], - ) for worker_id in worker_ids - ] - - workers.append(threading.Thread( - target=enqueuer, - name="Enqueuer {} {}".format(input_source_name, scope.CurrentNameScope()), - args=[coordinator, batch_feeder])) - coordinator._workers = workers - global_coordinator.add(coordinator) - - return global_coordinator - - -class BatchFeeder(State): - def __init__(self, net, input_blob_names, batch_size, - device_option, namescope, input_source_name, queue, - metrics, dont_rebatch, batch_columns, timeout=600): - self._counter = 0 - self._input_blob_names = input_blob_names - self._batch_size = batch_size - self._internal_queue = queue - self._queues = [] - self._device_option = device_option - self._namescope = namescope - self._timeout = timeout - self._input_source_name = input_source_name - self._c2_queue_capacity = 4 - self._create_caffe2_queues(net) - self._create_caffe2_ops(net) - self._inputs = 0 - self._prev_seconds = 0 - self._last_warning = time.time() - self._dont_rebatch = dont_rebatch - self._init_scratch() - self._metrics = metrics - - if batch_columns is None: - batch_columns = [0 for _ in input_blob_names] - self._batch_columns = batch_columns - - def start(self): - self._inputs = 0 - self._prev_seconds = time.time() - - def stop(self): - try: - for q in self._queues: - workspace.RunOperatorOnce( - core.CreateOperator("CloseBlobsQueue", [q], []) - ) - finally: - self._log_inputs_per_interval(0, force=True) - - def cleanup(self): - utils.ResetBlobs(self._scratch_blob.values()) - utils.ResetBlobs(self._scratch_status.values()) - - def _get(self, 
data_input_coordinator): - start_time = time.time() - last_warning = time.time() - while data_input_coordinator.is_active(): - try: - return self._internal_queue.get(block=True, timeout=0.5) - except Queue.Empty: - if time.time() - last_warning > 10.0: - log.warning("** Data input is slow: (still) no data in {} secs.".format( - time.time() - start_time)) - last_warning = time.time() - continue - return None - - def _validate_chunk(self, chunk): - if chunk is None: - log.warning("Fetcher function returned None") - return False - - assert len(chunk) == len(self._input_blob_names), \ - "Expecting data blob for each input" - for d in chunk: - assert isinstance(d, np.ndarray), \ - "Fetcher function must return a numpy array" - if not self._dont_rebatch: - j = 1 - for d in chunk[1:]: - assert d.shape[self._batch_columns[j]] == \ - chunk[0].shape[self._batch_columns[0]], \ - "Each returned input must have equal number of samples" - j += 1 - - if len(chunk) == 0: - log.warning("Worker provided zero length input") - return False - - return True - - def put(self, chunk, data_input_coordinator): - if not self._validate_chunk(chunk): - return - - while data_input_coordinator.is_active(): - try: - qsize = self._internal_queue.qsize() - if qsize < 2 and (time.time() - self._last_warning) > LOG_INT_SECS: - log.warning("Warning, data loading lagging behind: " + - "queue size={}, name={}".format(qsize, self._input_source_name)) - self._last_warning = time.time() - self._counter += 1 - self._internal_queue.put(chunk, block=True, timeout=0.5) - self._log_inputs_per_interval(chunk[0].shape[0]) - return - except Queue.Full: - log.debug("Queue full: stalling fetchers...") - continue - - def _enqueue_batch_direct(self, data_input_coordinator): - data = self._get(data_input_coordinator) - if data is None: - return - if data_input_coordinator.is_active(): - for b, q, c in zip(self._input_blob_names, self._queues, data): - self._enqueue(b, q, c) - - def _enqueue_batch(self, data_input_coordinator): - ''' - This pulls data from the python-side queue and collects them - into batch-sized pieces, unless dont_rebatch is set to true. 
- ''' - if self._dont_rebatch: - self._enqueue_batch_direct(data_input_coordinator) - return - - cur_batch = [np.array([]) for d in self._input_blob_names] - first_batch_col = self._batch_columns[0] - - # Collect data until we have a full batch size - while ( - cur_batch[0].shape[0] == 0 or - cur_batch[0].shape[first_batch_col] < self._batch_size - ) and data_input_coordinator.is_active(): - chunk = self._get(data_input_coordinator) - if chunk is None: - continue - - for j, chunk_elem in enumerate(chunk): - if cur_batch[j].shape[0] == 0: - cur_batch[j] = chunk_elem.copy() - else: - cur_batch[j] = np.append( - cur_batch[j], chunk_elem, axis=self._batch_columns[j] - ) - - start_time = time.time() - try: - # Return data over the batch size back to the queue - if cur_batch[0].shape[0] > 0 and cur_batch[0].shape[ - first_batch_col - ] > self._batch_size: - leftover = [] - trimmed_batch = [] - for j, b in enumerate(cur_batch): - [c, l] = np.split( - b, [self._batch_size], axis=self._batch_columns[j] - ) - leftover.append(l) - trimmed_batch.append(c) - cur_batch = trimmed_batch - try: - self._internal_queue.put(leftover, block=False) - except Queue.Full: - pass - - assert cur_batch[0].shape[first_batch_col] == self._batch_size - - if data_input_coordinator.is_active(): - for b, q, c in zip( - self._input_blob_names, self._queues, cur_batch - ): - self._enqueue(b, q, c) - finally: - self._metrics.put_metric('enqueue_time', time.time() - start_time) - - def _init_scratch(self): - self._scratch_blob = {} - self._scratch_status = {} - for blob_name in self._input_blob_names: - scratch_name = self._namescope + blob_name + \ - "_scratch_" + self._input_source_name - self._scratch_blob[blob_name] = core.BlobReference(scratch_name) - self._scratch_status[blob_name] = core.BlobReference( - scratch_name + "_status" - ) - - # Feed empty arrays to the scratch blobs here, so that there won't be - # race conditions when calling FeedBlob (which calls workspace - # CreateBlob()) from enqueue threads - for b in chain( - self._scratch_blob.values(), self._scratch_status.values() - ): - workspace.FeedBlob( - b, - np.array([]).astype(np.float32), - device_option=self._device_option, - ) - - def _enqueue(self, blob_name, queue, data_arr): - ''' - Enqueue the correctly sized batch arrays to Caffe2's queue.
- ''' - workspace.FeedBlob( - self._scratch_blob[blob_name], - data_arr, - device_option=self._device_option - ) - - op = core.CreateOperator( - "SafeEnqueueBlobs", - [queue, self._scratch_blob[blob_name]], - [self._scratch_blob[blob_name], self._scratch_status[blob_name]], - device_option=self._device_option - ) - workspace.RunOperatorOnce(op) - - def _create_caffe2_queues(self, net): - ''' - Creates queues on caffe2 side - ''' - def create_queue(queue_name, num_blobs, capacity): - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateBlobsQueue", - [], [queue_name], - num_blobs=1, - capacity=capacity)) - return core.ScopedBlobReference(queue_name) - - for blob_name in self._input_blob_names: - qname = blob_name + "_c2queue" + "_" + self._input_source_name - q = create_queue( - qname, num_blobs=1, capacity=self._c2_queue_capacity - ) - self._queues.append(q) - - def _create_caffe2_ops(self, net): - ''' - Creates dequeue-ops on caffe2 side - ''' - for q, blob_name in zip(self._queues, self._input_blob_names): - # Add operator to the Caffe2 network to dequeue - net.DequeueBlobs(q, blob_name, timeout_secs=float(self._timeout)) - - def _log_inputs_per_interval(self, inputs, force=False): - self._inputs += inputs - current_seconds = time.time() - delta_seconds = current_seconds - self._prev_seconds - if delta_seconds >= LOG_INT_SECS or force: - inputs_per_sec = int(self._inputs / delta_seconds) - qsize = self._internal_queue.qsize() - log.info("{}/{}: {} inputs/sec".format( - self._input_source_name, - self._namescope, - inputs_per_sec, - )) - log.info("-- queue: {} batches".format(qsize)) - # log and reset perf metrics - self._metrics.put_metric( - 'inputs_per_sec', inputs_per_sec, False) - self._metrics.put_metric('queue_size', qsize, False) - self._metrics.put_metric( - 'time_elapsed', delta_seconds, False) - self._metrics.log_metrics() - self._metrics.reset_metrics() - self._inputs = 0 - self._prev_seconds = current_seconds - - -class GlobalCoordinator(GlobalWorkerCoordinator): - def __init__(self): - GlobalWorkerCoordinator.__init__(self) - self._queues = {} - - def get_queue(self, queue_name, max_buffered_batches): - assert isinstance(max_buffered_batches, int) - if queue_name not in self._queues: - self._queues[queue_name] = Queue.Queue(maxsize=max_buffered_batches) - return self._queues[queue_name] - - def reset_data_input(self, namescope, name, net, batch_size): - log.info("Reset data input {}, batch size {}: ".format(name, batch_size)) - for c in self._coordinators: - if c._worker_name == name and c._state._namescope == namescope: - c._state._batch_size = batch_size - c._state._create_caffe2_ops(net) - - -class DataWorker(Worker): - def __init__( - self, - coordinator, - worker_id, - worker_fun, - metrics, - batch_size, - batch_feeder - ): - Worker.__init__(self, coordinator, worker_id, worker_fun=worker_fun, - metrics=metrics) - self._batch_size = batch_size - self._batch_feeder = batch_feeder - - def run(self): - input_data = self._worker_fun(self._worker_id, self._batch_size) - - self._batch_feeder.put(input_data, self._coordinator) - - def finish(self): - self._metrics.put_metric( - 'fetcher_time', time.time() - self._start_time) - - -global_coordinator = GlobalCoordinator() - - -def enqueuer(coordinator, batch_feeder): - while coordinator.is_active(): - batch_feeder._enqueue_batch(coordinator) diff --git a/caffe2/python/data_workers_test.py b/caffe2/python/data_workers_test.py deleted file mode 100644 index 4669aaf59476..000000000000 --- a/caffe2/python/data_workers_test.py 
+++ /dev/null @@ -1,196 +0,0 @@ - - - - - -import numpy as np -import unittest -import time - -from caffe2.python import workspace, model_helper -from caffe2.python import timeout_guard -import caffe2.python.data_workers as data_workers - - -def dummy_fetcher(fetcher_id, batch_size): - # Create random amount of values - n = np.random.randint(64) + 1 - data = np.zeros((n, 3)) - labels = [] - for j in range(n): - data[j, :] *= (j + fetcher_id) - labels.append(data[j, 0]) - - return [np.array(data), np.array(labels)] - - -def dummy_fetcher_rnn(fetcher_id, batch_size): - # Hardcoding some input blobs - T = 20 - N = batch_size - D = 33 - data = np.random.rand(T, N, D) - label = np.random.randint(N, size=(T, N)) - seq_lengths = np.random.randint(N, size=(N)) - return [data, label, seq_lengths] - - -class DataWorkersTest(unittest.TestCase): - - def testNonParallelModel(self): - workspace.ResetWorkspace() - - model = model_helper.ModelHelper(name="test") - old_seq_id = data_workers.global_coordinator._fetcher_id_seq - coordinator = data_workers.init_data_input_workers( - model, - ["data", "label"], - dummy_fetcher, - 32, - 2, - input_source_name="unittest" - ) - new_seq_id = data_workers.global_coordinator._fetcher_id_seq - self.assertEqual(new_seq_id, old_seq_id + 2) - - coordinator.start() - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - for _i in range(500): - with timeout_guard.CompleteInTimeOrDie(5): - workspace.RunNet(model.net.Proto().name) - - data = workspace.FetchBlob("data") - labels = workspace.FetchBlob("label") - - self.assertEqual(data.shape[0], labels.shape[0]) - self.assertEqual(data.shape[0], 32) - - for j in range(32): - self.assertEqual(labels[j], data[j, 0]) - self.assertEqual(labels[j], data[j, 1]) - self.assertEqual(labels[j], data[j, 2]) - - coordinator.stop_coordinator("unittest") - self.assertEqual(coordinator._coordinators, []) - - def testRNNInput(self): - workspace.ResetWorkspace() - model = model_helper.ModelHelper(name="rnn_test") - old_seq_id = data_workers.global_coordinator._fetcher_id_seq - coordinator = data_workers.init_data_input_workers( - model, - ["data1", "label1", "seq_lengths1"], - dummy_fetcher_rnn, - 32, - 2, - dont_rebatch=False, - batch_columns=[1, 1, 0], - ) - new_seq_id = data_workers.global_coordinator._fetcher_id_seq - self.assertEqual(new_seq_id, old_seq_id + 2) - - coordinator.start() - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - while coordinator._coordinators[0]._state._inputs < 100: - time.sleep(0.01) - - # Run a couple of rounds - workspace.RunNet(model.net.Proto().name) - workspace.RunNet(model.net.Proto().name) - - # Wait for the enqueue thread to get blocked - time.sleep(0.2) - - # We don't dequeue on caffe2 side (as we don't run the net) - # so the enqueue thread should be blocked. - # Let's now shutdown and see it succeeds. 
- self.assertTrue(coordinator.stop()) - - @unittest.skip("Test is flaky: https://github.com/pytorch/pytorch/issues/9064") - def testInputOrder(self): - # - # Create two models (train and validation) with same input blobs - # names and ensure that both will get the data in correct order - # - workspace.ResetWorkspace() - self.counters = {0: 0, 1: 1} - - def dummy_fetcher_rnn_ordered1(fetcher_id, batch_size): - # Hardcoding some input blobs - T = 20 - N = batch_size - D = 33 - data = np.zeros((T, N, D)) - data[0][0][0] = self.counters[fetcher_id] - label = np.random.randint(N, size=(T, N)) - label[0][0] = self.counters[fetcher_id] - seq_lengths = np.random.randint(N, size=(N)) - seq_lengths[0] = self.counters[fetcher_id] - self.counters[fetcher_id] += 1 - return [data, label, seq_lengths] - - workspace.ResetWorkspace() - model = model_helper.ModelHelper(name="rnn_test_order") - - coordinator = data_workers.init_data_input_workers( - model, - input_blob_names=["data2", "label2", "seq_lengths2"], - fetch_fun=dummy_fetcher_rnn_ordered1, - batch_size=32, - max_buffered_batches=1000, - num_worker_threads=1, - dont_rebatch=True, - input_source_name='train' - ) - coordinator.start() - - val_model = model_helper.ModelHelper(name="rnn_test_order_val") - coordinator1 = data_workers.init_data_input_workers( - val_model, - input_blob_names=["data2", "label2", "seq_lengths2"], - fetch_fun=dummy_fetcher_rnn_ordered1, - batch_size=32, - max_buffered_batches=1000, - num_worker_threads=1, - dont_rebatch=True, - input_source_name='val' - ) - coordinator1.start() - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - workspace.CreateNet(val_model.net) - - while coordinator._coordinators[0]._state._inputs < 900: - time.sleep(0.01) - - with timeout_guard.CompleteInTimeOrDie(5): - for m in (model, val_model): - print(m.net.Proto().name) - workspace.RunNet(m.net.Proto().name) - last_data = workspace.FetchBlob('data2')[0][0][0] - last_lab = workspace.FetchBlob('label2')[0][0] - last_seq = workspace.FetchBlob('seq_lengths2')[0] - - # Run few rounds - for _i in range(10): - workspace.RunNet(m.net.Proto().name) - data = workspace.FetchBlob('data2')[0][0][0] - lab = workspace.FetchBlob('label2')[0][0] - seq = workspace.FetchBlob('seq_lengths2')[0] - self.assertEqual(data, last_data + 1) - self.assertEqual(lab, last_lab + 1) - self.assertEqual(seq, last_seq + 1) - last_data = data - last_lab = lab - last_seq = seq - - time.sleep(0.2) - - self.assertTrue(coordinator.stop()) diff --git a/caffe2/python/dataio.py b/caffe2/python/dataio.py deleted file mode 100644 index 1284d9287894..000000000000 --- a/caffe2/python/dataio.py +++ /dev/null @@ -1,635 +0,0 @@ -## @package dataio -# Module caffe2.python.dataio -""" -Defines the base interface for reading and writing operations. - -Readers/Writers are objects that produce operations that read/write sequences -of data. Each operation reads or writes a list of BlobReferences. - -Readers and Writers must be implemented such that read and write operations -are atomic and thread safe. - -Examples of possible Readers and Writers: - QueueReader, QueueWriter, - DatasetReader, DatasetWriter, - -See `dataset.py` for an example of implementation. -""" - - - - - -from caffe2.python import core -from caffe2.python.schema import Field, Struct, from_blob_list -import numpy as np -import time - - -class Reader: - """ - Reader is an abstract class to be implemented in order to provide - operations capable of iterating through a dataset or stream of data. 
- - A Reader must implement at least one operation, `read`, which - adds operations to a net that read the next batch of data. Readers can - optionally support the `reset` operation, which is useful when multiple - passes over the data are required. - """ - def __init__(self, schema=None): - if schema is not None: - assert isinstance(schema, Field) - self._schema = schema - - def schema(self): - assert self._schema is not None, 'Schema not provided for this reader.' - return self._schema - - def _set_schema(self, schema): - self._schema = schema - - def setup_ex(self, init_net, finish_net): - """Setup nets to run at task initialization and cleanup time. - - Args: - init_net: A net invoked at task init time. - finish_net: A net invoked at task cleanup time. - """ - pass - - def read_ex(self, local_init_net, local_finish_net): - read_net = core.Net('reader_body') - return ([read_net], ) + self.read(read_net) - - def read_record_ex(self, local_init_net, local_finish_net): - nets, should_stop, fields = self.read_ex( - local_init_net, local_finish_net) - if self._schema: - fields = from_blob_list(self._schema, fields) - return nets, should_stop, fields - - def read(self, read_net): - """Append operations to read_net that will read a batch from the - underlying data source. - - Operations added to `read_net` must be thread safe and atomic, that is, - it should be possible to clone `read_net` and run multiple instances of - it in parallel. - - Args: - read_net: the net that will be appended with read operations - - Returns: - A tuple (should_stop, fields), with: - should_stop: BlobReference pointing to a boolean scalar - blob that indicates whether the read operation - was successful or whether the end of data has - been reached. - fields: A tuple of BlobReference containing the latest batch - of data that was read. - """ - raise NotImplementedError('Readers must implement `read`.') - - def reset(self, net): - """Append operations to `net` that will reset the reader. - - This can be used to read the data multiple times. - Not all readers support this operation. - """ - raise NotImplementedError('This reader cannot be reset.') - - def read_record(self, read_net): - should_stop, fields = self.read(read_net) - if self._schema: - fields = from_blob_list(self._schema, fields) - return should_stop, fields - - def execution_step(self, reader_net_name=None, external_should_stop=None): - """Create an execution step with a net containing read operators. - - The execution step will contain a `stop_blob` that knows how to stop - the execution loop when the end of data is reached. - - E.g.: - - read_step, fields = reader.execution_step() - consume_net = core.Net('consume') - consume_net.Print(fields[0], []) - p = core.Plan('reader') - p.AddStep(read_step.AddNet(consume_net)) - core.RunPlan(p) - - Args: - reader_net_name: (optional) the name of the reader_net to be - created. The execution step will - be named accordingly. - - Returns: - A tuple (read_step, fields), with: - read_step: A newly created execution step containing a net with - read operations. The step will have `stop_blob` set, - in order to stop the loop on end of data. - fields: A tuple of BlobReference containing the latest batch - of data that was read.
- """ - reader_net = core.Net(reader_net_name or 'reader') - should_stop, fields = self.read_record(reader_net) - if external_should_stop is not None: - should_stop = reader_net.Or([external_should_stop, should_stop]) - read_step = core.execution_step( - '{}_step'.format(reader_net_name), - reader_net, - should_stop_blob=should_stop) - return (read_step, fields) - - -class Writer: - """ - Writer is an abstract class to be implemented in order to provide - operations capable of feeding a data stream or a dataset. - - A Writer must implement 2 operations: - `write`, which adds operations to a net that write the write batch of - data, and `commit`, which adds operations to a net in order to indicate - that no more data will be written. - """ - _schema = None - - def schema(self): - return self._schema - - def write(self, writer_net, fields): - """Add operations to `writer_net` that write the next batch of data. - - Operations added to the net must be thread-safe and unique, that is: - multiple writers must be able to write to the dataset in parallel. - - Args: - fields: a tuple of BlobReference containing the batch of data to - write. - """ - raise NotImplementedError('Writers must implement write.') - - def write_record(self, writer_net, fields): - if isinstance(fields, Field): - self._schema = fields - fields = fields.field_blobs() - self.write(writer_net, fields) - - def setup_ex(self, init_net, finish_net): - """Experimental, don't use yet""" - self.commit(finish_net) - - def write_ex(self, fields, local_init_net, local_finish_net, stop_blob): - """Experimental extension to the interface. Don't use yet""" - write_net = core.Net('write_net') - self.write(write_net, fields) - return [write_net] - - def write_record_ex( - self, fields, local_init_net, local_finish_net, stop_blob=None): - """Experimental extension to the interface. Don't use yet.""" - if isinstance(fields, Field): - self._schema = fields - fields = fields.field_blobs() - if stop_blob is None: - stop_blob = local_init_net.NextName("dequeue_status") - write_nets = self.write_ex( - fields, local_init_net, local_finish_net, stop_blob) - return (write_nets, stop_blob) - - def commit(self, finish_net): - """Add operations to `finish_net` that signal end of data. - - This must be implemented by all Writers, but may be no-op for some - of them. - """ - pass - - -class ReaderBuilder: - """ Allow usage of a reader in distributed fashion. """ - def schema(self): - raise NotImplementedError() - - def setup(self, **kwargs): - """ - Optionally, perform one-time setup before calling new_reader(). - Subclass should make sure this function is only called once. - """ - raise NotImplementedError() - - def new_reader(self, **kwargs): - raise NotImplementedError() - - -class PipedReaderBuilder(ReaderBuilder): - """ReaderBuilder that modifies underlying builder by calling `piper` - function on each new reader produced, and return the result of - the function. This way, it is possible to append data processing - pipelines that will be replicated for each reader that gets created. 
- - E.g.: - - PipedReaderBuilder( - ReaderBuilder(...), - lambda reader: pipe(reader, processor=my_proc)) - """ - - def __init__(self, builder, piper): - self._builder = builder - self._piper = piper - - def schema(self): - return self._builder.schema() - - def setup(self, **kwargs): - return self._builder.setup(**kwargs) - - def new_reader(self, **kwargs): - # Passing everything down since you could wrap a PipedReaderBuilder in - # another PipedReaderBuilder - output = self._piper( - reader=self._builder.new_reader(**kwargs), - **kwargs - ) - return output if isinstance(output, Reader) else output.reader() - - -class Pipe: - def __init__(self, schema=None, obj_key=None): - self._num_writers = 0 - self._num_readers = 0 - self._schema = schema - self._obj_key = obj_key - - def schema(self): - return self._schema - - def setup(self, global_init_net): - pass - - def reader(self): - raise NotImplementedError() - - def writer(self): - raise NotImplementedError() - - def num_readers(self): - return self._num_readers - - def num_writers(self): - return self._num_writers - - def _new_writer(self, writer_schema, writer_init_net): - if writer_schema is not None and self._schema is None: - self._schema = writer_schema - self._num_writers += 1 - if self._obj_key is not None: - writer_init_net.add_attribute(self._obj_key, self) - - def _new_reader(self, reader_init_net): - self._num_readers += 1 - if self._obj_key is not None: - reader_init_net.add_attribute(self._obj_key, self) - - -class CounterReader(Reader): - """ Reader that produces increasing integers. """ - def __init__(self): - Reader.__init__(self, schema=Struct(('iter', np.int64))) - self.counter = None - self.should_stop = None - - def setup_ex(self, global_init_net, global_finish_net): - if self.counter is None: - self.counter = global_init_net.CreateCounter([], init_count=0) - self.should_stop = global_init_net.ConstantFill( - [], shape=[], dtype=core.DataType.BOOL, value=False) - - def read_ex(self, local_init_net, local_finish_net): - count_net = core.Net('limited_reader_counter') - value = count_net.CountUp([self.counter], 1) - return [count_net], self.should_stop, [value] - - -class ReaderWithLimitBase(Reader): - """Abstract Reader constrained by certain conditions. - - Base class for Reader classes which check for certain conditions to stop - further processing (e.g. max number of iterations or time limit). - Also produces a boolean blob (data_finished) that can be used to see if - the reader exhausted all input data (true) or stopped for another reason - (false). - """ - - def __init__(self, reader): - Reader.__init__(self, schema=reader._schema) - self.reader = reader - self.net = core.Net('reader_with_limit') - self._data_finished = self.net.AddExternalInput( - self.net.NextName('data_finished')) - self.should_stop = None - - def setup_ex(self, global_init_net, global_finish_net): - global_init_net.ConstantFill( - [], [self._data_finished], - shape=[], value=False, dtype=core.DataType.BOOL) - self.reader.setup_ex(global_init_net, global_finish_net) - self.setup_limiter(global_init_net, global_finish_net) - - def read_ex(self, local_init_net, local_finish_net): - """Reads from an underlying Reader class, but may stop due to additional - constraints. - - Build and return network(s) to read data from a Reader with - additional constraints, depending on which derived class is used. - Derived classes implement setup_limiter and check_limiter_condition, - which determine the nature of the constraint imposed on the reader, - e.g.
iteration limits or time limit. - - Args: - local_init_net: A net invoked at task instance init time (Once per - parallel thread). - local_finish_net: A net invoked at task instance cleanup time (Once - per parallel thread). - """ - - # Check if limiting constraint is met. - stop_condition_net = core.Net('limited_reader_condition') - should_stop = self.check_limiter_condition(stop_condition_net) - - # Call original reader. - nets, local_data_finished, fields = self.reader.read_ex( - local_init_net, local_finish_net) - self._set_schema(self.reader._schema) - - # Check if original reader is done. - check_done_net = core.Net('limited_reader_post') - # Copy to the same blob as the counter output to trigger reader - # stopping - this is ok because execution will check should_stop_blob - # after every single operation, so it has already been checked on this - # iteration by this point. - check_done_net.Copy(local_data_finished, should_stop) - # Update externally-accessible flag indicating if reader is done - check_done_net.Or([self._data_finished, local_data_finished], - [self._data_finished]) - - return [stop_condition_net] + nets + [check_done_net], should_stop, fields - - def setup_limiter(self, global_init_net, global_finish_net): - """Configure task level init/cleanup nets required to implement limit - condition. Must be implemented by subclass. - - Args: - global_init_net: A net invoked at task init time. - global_finish_net: A net invoked at task cleanup time. - """ - raise NotImplementedError("Subclass must implement `setup_limiter`") - - def check_limiter_condition(self, stop_condition_net): - """Configure a net that is invoked between reading batches to see if - limit condition is met. Must be implemented by subclass. - - Args: - stop_condition_net: A net invoked to evaluate an early termination - condition. - """ - raise NotImplementedError("Subclass must implement `check_limiter_condition`") - - def data_finished(self): - """ - Return a blob that can be checked after the end of the reading task, - which will contain a scalar bool indicating whether the underlying - reader has been exhausted (True) or whether we stopped because we reached - the limit of iterations (False). - """ - return self._data_finished - - -class ReaderWithLimit(ReaderWithLimitBase): - """Reader that stops after `num_iter` batches. - - If `num_iter` <= 0 or is None, reverts to an unconstrained reader that - exports a boolean blob indicating that the reader has exhausted - the data stream. - """ - def __init__(self, reader, num_iter=1): - """Class initializer. - - Args: - reader: The underlying reader object doing the actual read. - num_iter: Number of batches to read. If `None`, - the class reverts to a normal reader except that it also - produces a data_finished blob as a side effect to indicate - whether the input stream is exhausted.
- """ - super().__init__(reader) - self.counter = None - self.num_iter = num_iter - if self.num_iter is not None: - self.counter = self.net.AddExternalInput( - self.net.NextName('counter')) - - def setup_limiter(self, global_init_net, global_finish_net): - if self.counter: - global_init_net.CreateCounter( - [], [self.counter], init_count=int(self.num_iter)) - - def check_limiter_condition(self, stop_condition_net): - if self.counter: - return stop_condition_net.CountDown([self.counter], 1) - else: - return stop_condition_net.ConstantFill( - [], 1, - shape=[], value=False, dtype=core.DataType.BOOL) - - -def CountUntil(num_iter): - return ReaderWithLimit(CounterReader(), num_iter) - - -class ReaderWithTimeLimit(ReaderWithLimitBase): - """Reader that stops after `duration` seconds. - - If `duration` <= 0 or is None, reverts to an unconstrained reader that - exports a boolean blob indicating that the reader has exhausted - the data steam. - """ - def __init__(self, reader, duration=0): - """Class initializer. - - Args: - reader: The underlying reader object doing the actual read. - duration: Number of seconds to read. If un-specified, None, or <= 0, - the class reverts to a normal reader except that it also - produces a data_finished blob as a side effect to indicate - whether the input stream is exhausted. - """ - super().__init__(reader) - - self.timer = None - self.duration = duration - self.duration_ns_blob = None - - def setup_limiter(self, global_init_net, global_finish_net): - if self.duration is not None and self.duration > 0: - duration_ns = int(self.duration * (10**9)) - - self.timer = global_init_net.TimerBegin( - [], counter_name='epoch_timer') - start_time = global_init_net.TimerGet(self.timer) - self.duration_ns_blob = global_init_net.ConstantFill( - [start_time], value=duration_ns) - - global_finish_net.TimerEnd([self.timer], []) - - def check_limiter_condition(self, stop_condition_net): - if self.duration: - time_elapsed = stop_condition_net.TimerGet(self.timer) - return stop_condition_net.GE( - [time_elapsed, self.duration_ns_blob], str(self.should_stop)) - else: - return stop_condition_net.ConstantFill( - [], 1, shape=[], value=False, dtype=core.DataType.BOOL - ) - - -class ReaderWithDelay(Reader): - """Test reader class that inserts a delay between reading batches.""" - - def __init__(self, reader, delay): - Reader.__init__(self, schema=reader._schema) - self.reader = reader - self.delay = delay - - def setup_ex(self, global_init_net, global_finish_net): - self.reader.setup_ex(global_init_net, global_finish_net) - - def read_ex(self, local_init_net, local_finish_net): - read_net = core.Net("reader_body") - - def sleep_op(*args, **argd): - time.sleep(self.delay) - - read_net.Python(sleep_op)([], []) - return ([read_net],) + self.reader.read(read_net) - - -class CompositeReader(Reader): - """ - Base class for a reader that wrap multiple readers, e.g., reading from - multiple sources simultaneously. 
- """ - def __init__(self, names, readers): - """ - Args: - names: list[str] names of readers; used as schema keys - readers: list[Reader] Reader instances, must have schema - """ - assert len(names) == len(readers) - super().__init__(schema=Struct(*[ - (name, reader.schema()) for name, reader in zip(names, readers) - ])) - self._names = names - self._readers = readers - - def setup_ex(self, init_net, finish_net): - for reader in self._readers: - reader.setup_ex(init_net, finish_net) - - def read_ex(self, local_init_net, local_finish_net): - """ - Stops when one of the reader finished - """ - # First, instantiate all the reader nets - fields = [] - stop_blobs = [] - all_sub_read_nets = [] - for name, reader in zip(self._names, self._readers): - sub_read_nets, should_stop, record = reader.read_record_ex( - local_init_net, local_finish_net) - stop_blobs.append(should_stop) - all_sub_read_nets.append(sub_read_nets) - fields.extend(record.field_blobs()) - - read_nets = [] - # Use the stop blob of the last reader as stop blob of composite reader. - local_should_stop = stop_blobs[-1] - for name, sub_read_nets, stop_blob in zip(self._names, all_sub_read_nets, stop_blobs): - read_nets.extend(sub_read_nets) - if stop_blob == local_should_stop: - # Skip adding stop net because Or([A, A], A) doesn't pass operator - # schema check - continue - stop_net = core.Net("{}_stop".format(name)) - stop_net.Or([local_should_stop, stop_blob], local_should_stop) - read_nets.append(stop_net) - - return read_nets, local_should_stop, fields - - def reset(self, net): - for reader in self._readers: - reader.reset(net) - - -class CompositeReaderBuilder(ReaderBuilder): - """ - A reader builder for CompositeReader - """ - def __init__(self, names, reader_builders): - """ - Args: - names: list[str] names of readers; used as schema keys - reader_builders: list[ReaderBuilder] ReaderBuilder instances; - must have schema - """ - super().__init__() - self._names = names - self._reader_builders = reader_builders - self._schema = Struct(*[ - (name, reader_builder.schema()) - for name, reader_builder in zip(names, reader_builders) - ]) - - def schema(self): - return self._schema - - def setup(self, **kwargs): - data_finished_blobs = {} - # limiter is stateful; it can only be used once. Since - # CompositeReader stops when one of the reader stops, - # this is fine. 
- if "limiter" in kwargs: - limiter = kwargs.pop("limiter") - else: - limiter = None - for i, reader_builder in enumerate(self._reader_builders): - if i == len(self._reader_builders) - 1 and limiter is not None: - # The limiter must be applied to the last reader so that the - # batch counter is incremented only if every reader has data - kwargs["limiter"] = limiter - sub_reader_data_finished_blobs = reader_builder.setup(**kwargs) - overlapping_keys = set(data_finished_blobs.keys()) & set(sub_reader_data_finished_blobs.keys()) - overlapping_values = set(data_finished_blobs.values()) & set(sub_reader_data_finished_blobs.values()) - assert overlapping_keys == set(), "Overlapping keys: {}".format(overlapping_keys) - assert overlapping_values == set(), "Overlapping values: {}".format(overlapping_values) - data_finished_blobs.update(sub_reader_data_finished_blobs) - - return data_finished_blobs - - def new_reader(self, **kwargs): - readers = [] - for reader_builder in self._reader_builders: - reader = reader_builder.new_reader(**kwargs) - if isinstance(reader, Reader): - pass - elif hasattr(reader, 'reader'): - reader = reader.reader() - else: - raise ValueError('reader must be an instance of Reader or Pipe') - readers.append(reader) - - multi_reader = CompositeReader(self._names, readers) - assert multi_reader.schema() == self._schema - return multi_reader diff --git a/caffe2/python/dataio_test.py b/caffe2/python/dataio_test.py deleted file mode 100644 index ac1c72284fbf..000000000000 --- a/caffe2/python/dataio_test.py +++ /dev/null @@ -1,445 +0,0 @@ - - - - - -from caffe2.python.dataio import ( - CompositeReader, - CompositeReaderBuilder, - ReaderBuilder, - ReaderWithDelay, - ReaderWithLimit, - ReaderWithTimeLimit, -) -from caffe2.python.dataset import Dataset -from caffe2.python.db_file_reader import DBFileReader -from caffe2.python.pipeline import pipe -from caffe2.python.schema import Struct, NewRecord, FeedRecord -from caffe2.python.session import LocalSession -from caffe2.python.task import TaskGroup, final_output, WorkspaceType -from caffe2.python.test_util import TestCase -from caffe2.python.cached_reader import CachedReader -from caffe2.python import core, workspace, schema -from caffe2.python.net_builder import ops - -import numpy as np -import numpy.testing as npt -import os -import shutil -import unittest -import tempfile - - -def make_source_dataset(ws, size=100, offset=0, name=None): - name = name or "src" - src_init = core.Net("{}_init".format(name)) - with core.NameScope(name): - src_values = Struct(('label', np.array(range(offset, offset + size)))) - src_blobs = NewRecord(src_init, src_values) - src_ds = Dataset(src_blobs, name=name) - FeedRecord(src_blobs, src_values, ws) - ws.run(src_init) - return src_ds - - -def make_destination_dataset(ws, schema, name=None): - name = name or 'dst' - dst_init = core.Net('{}_init'.format(name)) - with core.NameScope(name): - dst_ds = Dataset(schema, name=name) - dst_ds.init_empty(dst_init) - ws.run(dst_init) - return dst_ds - - -class TestReaderBuilder(ReaderBuilder): - def __init__(self, name, size, offset): - self._schema = schema.Struct( - ('label', schema.Scalar()), - ) - self._name = name - self._size = size - self._offset = offset - self._src_ds = None - - def schema(self): - return self._schema - - def setup(self, ws): - self._src_ds = make_source_dataset(ws, offset=self._offset, size=self._size, - name=self._name) - return {} - - def new_reader(self, **kwargs): - return self._src_ds - - -class TestCompositeReader(TestCase): - 
@unittest.skipIf(os.environ.get('JENKINS_URL'), 'Flaky test on Jenkins') - def test_composite_reader(self): - ws = workspace.C.Workspace() - session = LocalSession(ws) - num_srcs = 3 - names = ["src_{}".format(i) for i in range(num_srcs)] - size = 100 - offsets = [i * size for i in range(num_srcs)] - src_dses = [make_source_dataset(ws, offset=offset, size=size, name=name) - for (name, offset) in zip(names, offsets)] - - data = [ws.fetch_blob(str(src.field_blobs[0])) for src in src_dses] - # Sanity check we didn't overwrite anything - for d, offset in zip(data, offsets): - npt.assert_array_equal(d, range(offset, offset + size)) - - # Make an identically-sized empty destination dataset - dst_ds_schema = schema.Struct( - *[ - (name, src_ds.content().clone_schema()) - for name, src_ds in zip(names, src_dses) - ] - ) - dst_ds = make_destination_dataset(ws, dst_ds_schema) - - with TaskGroup() as tg: - reader = CompositeReader(names, - [src_ds.reader() for src_ds in src_dses]) - pipe(reader, dst_ds.writer(), num_runtime_threads=3) - session.run(tg) - - for i in range(num_srcs): - written_data = sorted( - ws.fetch_blob(str(dst_ds.content()[names[i]].label()))) - npt.assert_array_equal(data[i], written_data, "i: {}".format(i)) - - @unittest.skipIf(os.environ.get('JENKINS_URL'), 'Flaky test on Jenkins') - def test_composite_reader_builder(self): - ws = workspace.C.Workspace() - session = LocalSession(ws) - num_srcs = 3 - names = ["src_{}".format(i) for i in range(num_srcs)] - size = 100 - offsets = [i * size for i in range(num_srcs)] - src_ds_builders = [ - TestReaderBuilder(offset=offset, size=size, name=name) - for (name, offset) in zip(names, offsets) - ] - - # Make an identically-sized empty destination dataset - dst_ds_schema = schema.Struct( - *[ - (name, src_ds_builder.schema()) - for name, src_ds_builder in zip(names, src_ds_builders) - ] - ) - dst_ds = make_destination_dataset(ws, dst_ds_schema) - - with TaskGroup() as tg: - reader_builder = CompositeReaderBuilder( - names, src_ds_builders) - reader_builder.setup(ws=ws) - pipe(reader_builder.new_reader(), dst_ds.writer(), - num_runtime_threads=3) - session.run(tg) - - for name, offset in zip(names, offsets): - written_data = sorted( - ws.fetch_blob(str(dst_ds.content()[name].label()))) - npt.assert_array_equal(range(offset, offset + size), written_data, - "name: {}".format(name)) - - -class TestReaderWithLimit(TestCase): - def test_runtime_threads(self): - ws = workspace.C.Workspace() - session = LocalSession(ws) - src_ds = make_source_dataset(ws) - totals = [None] * 3 - - def proc(rec): - # executed once - with ops.task_init(): - counter1 = ops.CreateCounter([], ['global_counter']) - counter2 = ops.CreateCounter([], ['global_counter2']) - counter3 = ops.CreateCounter([], ['global_counter3']) - # executed once per thread - with ops.task_instance_init(): - task_counter = ops.CreateCounter([], ['task_counter']) - # executed on each iteration - ops.CountUp(counter1) - ops.CountUp(task_counter) - # executed once per thread - with ops.task_instance_exit(): - with ops.loop(ops.RetrieveCount(task_counter)): - ops.CountUp(counter2) - ops.CountUp(counter3) - # executed once - with ops.task_exit(): - totals[0] = final_output(ops.RetrieveCount(counter1)) - totals[1] = final_output(ops.RetrieveCount(counter2)) - totals[2] = final_output(ops.RetrieveCount(counter3)) - return rec - - # Read full data set from original reader - with TaskGroup() as tg: - pipe(src_ds.reader(), num_runtime_threads=8, processor=proc) - session.run(tg) - 
self.assertEqual(totals[0].fetch(), 100) - self.assertEqual(totals[1].fetch(), 100) - self.assertEqual(totals[2].fetch(), 8) - - # Read with a count-limited reader - with TaskGroup() as tg: - q1 = pipe(src_ds.reader(), num_runtime_threads=2) - q2 = pipe( - ReaderWithLimit(q1.reader(), num_iter=25), - num_runtime_threads=3) - pipe(q2, processor=proc, num_runtime_threads=6) - session.run(tg) - self.assertEqual(totals[0].fetch(), 25) - self.assertEqual(totals[1].fetch(), 25) - self.assertEqual(totals[2].fetch(), 6) - - def _test_limit_reader_init_shared(self, size): - ws = workspace.C.Workspace() - session = LocalSession(ws) - - # Make source dataset - src_ds = make_source_dataset(ws, size=size) - - # Make an identically-sized empty destination Dataset - dst_ds = make_destination_dataset(ws, src_ds.content().clone_schema()) - - return ws, session, src_ds, dst_ds - - def _test_limit_reader_shared(self, reader_class, size, expected_read_len, - expected_read_len_threshold, - expected_finish, num_threads, read_delay, - **limiter_args): - ws, session, src_ds, dst_ds = \ - self._test_limit_reader_init_shared(size) - - # Read without limiter - # WorkspaceType.GLOBAL is required because we are fetching - # reader.data_finished() after the TaskGroup finishes. - with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg: - if read_delay > 0: - reader = reader_class(ReaderWithDelay(src_ds.reader(), - read_delay), - **limiter_args) - else: - reader = reader_class(src_ds.reader(), **limiter_args) - pipe(reader, dst_ds.writer(), num_runtime_threads=num_threads) - session.run(tg) - read_len = len(sorted(ws.blobs[str(dst_ds.content().label())].fetch())) - - # Do a fuzzy match (expected_read_len +/- expected_read_len_threshold) - # to eliminate flakiness for time-limited tests - self.assertGreaterEqual( - read_len, - expected_read_len - expected_read_len_threshold) - self.assertLessEqual( - read_len, - expected_read_len + expected_read_len_threshold) - self.assertEqual( - sorted(ws.blobs[str(dst_ds.content().label())].fetch()), - list(range(read_len)) - ) - self.assertEqual(ws.blobs[str(reader.data_finished())].fetch(), - expected_finish) - - def test_count_limit_reader_without_limit(self): - # No iter count specified, should read all records. - self._test_limit_reader_shared(ReaderWithLimit, - size=100, - expected_read_len=100, - expected_read_len_threshold=0, - expected_finish=True, - num_threads=8, - read_delay=0, - num_iter=None) - - def test_count_limit_reader_with_zero_limit(self): - # Zero iter count specified, should read 0 records. - self._test_limit_reader_shared(ReaderWithLimit, - size=100, - expected_read_len=0, - expected_read_len_threshold=0, - expected_finish=False, - num_threads=8, - read_delay=0, - num_iter=0) - - def test_count_limit_reader_with_low_limit(self): - # Read with limit smaller than size of dataset - self._test_limit_reader_shared(ReaderWithLimit, - size=100, - expected_read_len=10, - expected_read_len_threshold=0, - expected_finish=False, - num_threads=8, - read_delay=0, - num_iter=10) - - def test_count_limit_reader_with_high_limit(self): - # Read with limit larger than size of dataset - self._test_limit_reader_shared(ReaderWithLimit, - size=100, - expected_read_len=100, - expected_read_len_threshold=0, - expected_finish=True, - num_threads=8, - read_delay=0, - num_iter=110) - - def test_time_limit_reader_without_limit(self): - # No duration specified, should read all records. 
- self._test_limit_reader_shared(ReaderWithTimeLimit, - size=100, - expected_read_len=100, - expected_read_len_threshold=0, - expected_finish=True, - num_threads=8, - read_delay=0.1, - duration=0) - - def test_time_limit_reader_with_short_limit(self): - # Read with insufficient time limit - size = 50 - num_threads = 4 - sleep_duration = 0.25 - duration = 1 - expected_read_len = int(round(num_threads * duration / sleep_duration)) - # Because the time limit check happens before the delay + read op, - # subtract a little bit of time to ensure we don't get in an extra read - duration = duration - 0.25 * sleep_duration - - # NOTE: `expected_read_len_threshold` was added because this test case - # has significant execution variation under stress. Under stress, we may - # read strictly less than the expected # of samples; anywhere from - # [0,N] where N = expected_read_len. - # Hence we set expected_read_len to N/2, plus or minus N/2. - self._test_limit_reader_shared(ReaderWithTimeLimit, - size=size, - expected_read_len=expected_read_len / 2, - expected_read_len_threshold=expected_read_len / 2, - expected_finish=False, - num_threads=num_threads, - read_delay=sleep_duration, - duration=duration) - - def test_time_limit_reader_with_long_limit(self): - # Read with ample time limit - # NOTE: we don't use `expected_read_len_threshold` because the duration, - # read_delay, and # threads should be more than sufficient - self._test_limit_reader_shared(ReaderWithTimeLimit, - size=50, - expected_read_len=50, - expected_read_len_threshold=0, - expected_finish=True, - num_threads=4, - read_delay=0.2, - duration=10) - - -class TestDBFileReader(TestCase): - def setUp(self): - self.temp_paths = [] - - def tearDown(self): - # In case any test method fails, clean up temp paths. - for path in self.temp_paths: - self._delete_path(path) - - @staticmethod - def _delete_path(path): - if os.path.isfile(path): - os.remove(path) # Remove file. - elif os.path.isdir(path): - shutil.rmtree(path) # Remove dir recursively. - - def _make_temp_path(self): - # Make a temp path as db_path. - with tempfile.NamedTemporaryFile() as f: - temp_path = f.name - self.temp_paths.append(temp_path) - return temp_path - - @staticmethod - def _build_source_reader(ws, size): - src_ds = make_source_dataset(ws, size) - return src_ds.reader() - - @staticmethod - def _read_all_data(ws, reader, session): - dst_ds = make_destination_dataset(ws, reader.schema().clone_schema()) - - with TaskGroup() as tg: - pipe(reader, dst_ds.writer(), num_runtime_threads=8) - session.run(tg) - - return ws.blobs[str(dst_ds.content().label())].fetch() - - @unittest.skipIf("LevelDB" not in core.C.registered_dbs(), "Need LevelDB") - def test_cached_reader(self): - ws = workspace.C.Workspace() - session = LocalSession(ws) - db_path = self._make_temp_path() - - # Read data for the first time. - cached_reader1 = CachedReader( - self._build_source_reader(ws, 100), db_path, loop_over=False, - ) - build_cache_step = cached_reader1.build_cache_step() - session.run(build_cache_step) - - data = self._read_all_data(ws, cached_reader1, session) - self.assertEqual(sorted(data), list(range(100))) - - # Read data from cache. 
- cached_reader2 = CachedReader( - self._build_source_reader(ws, 200), db_path, - ) - build_cache_step = cached_reader2.build_cache_step() - session.run(build_cache_step) - - data = self._read_all_data(ws, cached_reader2, session) - self.assertEqual(sorted(data), list(range(100))) - - self._delete_path(db_path) - - # We removed cache so we expect to receive data from original reader. - cached_reader3 = CachedReader( - self._build_source_reader(ws, 300), db_path, - ) - build_cache_step = cached_reader3.build_cache_step() - session.run(build_cache_step) - - data = self._read_all_data(ws, cached_reader3, session) - self.assertEqual(sorted(data), list(range(300))) - - self._delete_path(db_path) - - @unittest.skipIf("LevelDB" not in core.C.registered_dbs(), "Need LevelDB") - def test_db_file_reader(self): - ws = workspace.C.Workspace() - session = LocalSession(ws) - db_path = self._make_temp_path() - - # Build a cache DB file. - cached_reader = CachedReader( - self._build_source_reader(ws, 100), - db_path=db_path, - db_type='LevelDB', - ) - build_cache_step = cached_reader.build_cache_step() - session.run(build_cache_step) - - # Read data from cache DB file. - db_file_reader = DBFileReader( - db_path=db_path, - db_type='LevelDB', - ) - data = self._read_all_data(ws, db_file_reader, session) - self.assertEqual(sorted(data), list(range(100))) - - self._delete_path(db_path) diff --git a/caffe2/python/dataset.py b/caffe2/python/dataset.py deleted file mode 100644 index abb1f27d87ca..000000000000 --- a/caffe2/python/dataset.py +++ /dev/null @@ -1,344 +0,0 @@ -## @package dataset -# Module caffe2.python.dataset -""" -Implementation of an in-memory dataset with structured schema. - -Use this to store and iterate through datasets with complex schema that -fit in memory. - -Iterating through entries of this dataset is very fast since the dataset -is stored as a set of native Caffe2 tensors, thus no type conversion or -deserialization is necessary. -""" - - - - - -from caffe2.python import core, workspace -from caffe2.python.dataio import Reader, Writer -from caffe2.python.schema import ( - Struct, from_blob_list, from_column_list, InitEmptyRecord) -import numpy as np - - -class _DatasetReader(Reader): - def __init__(self, dataset, name, batch_size=1, enforce_batch_size=False): - """Don't call this directly. Instead, use dataset.reader()""" - Reader.__init__(self, dataset.content()) - self.dataset = dataset - self.name = name or (dataset.name + '_cursor') - self.batch_size = batch_size - self.enforce_batch_size = enforce_batch_size - self.cursor = None - - def setup_ex(self, init_net, exit_net): - if self.cursor is None: - self.cursor = init_net.CreateTreeCursor( - [], - init_net.NextScopedBlob(self.name), - fields=self.dataset.fields) - - def read(self, read_net): - assert self.cursor, 'setup not called.' - content = self.dataset.content() - with core.NameScope(read_net.NextName(self.name)): - fields = read_net.ReadNextBatch( - [self.cursor] + content.field_blobs(), - content.field_names(), - batch_size=self.batch_size, - enforce_batch_size=self.enforce_batch_size) - fields = core.output_to_list(fields) - return (read_net.IsEmpty([fields[0]]), fields) - - def reset(self, net): - net.ResetCursor([self.cursor], []) - - -class _DatasetRandomReader(Reader): - def __init__(self, dataset, name, indices, batch_size=1, loop_over=False, - enforce_batch_size=False): - """Don't call this directly. 
Instead, use dataset.random_reader()""" - Reader.__init__(self, dataset.content()) - self.dataset = dataset - self.cursor = None - self.name = name or (dataset.name + '_cursor') - self.indices = indices - self.batch_size = batch_size - self.loop_over = loop_over - self.enforce_batch_size = enforce_batch_size - - def setup_ex(self, init_net, exit_net): - if self.cursor is None: - self.cursor = init_net.CreateTreeCursor( - [], - init_net.NextScopedBlob(self.name), - fields=self.dataset.fields) - - def reset(self, net): - net.ResetCursor([self.cursor], []) - - def computeoffset(self, net): - self.reset(net) - offsets = net.ComputeOffset( - [self.cursor] + self.dataset.content().field_blobs(), - 'offsets') - self.offsets = offsets - - def sort_and_shuffle(self, net, sort_by_field=None, - shuffle_size=1, batch_size=1): - # no sorting by default - content = self.dataset.content() - sort_by_field_idx = -1 - if sort_by_field: - assert sort_by_field in content.field_names(), ( - 'Must be valid field.') - sort_by_field_idx = content.field_names().index(sort_by_field) - self.reset(net) - - indices = net.SortAndShuffle( - [self.cursor] + content.field_blobs(), - 'indices', - sort_by_field_idx=sort_by_field_idx, - shuffle_size=shuffle_size, - batch_size=batch_size) - self.indices = indices - - def read(self, read_net): - assert self.cursor, 'setup_ex not called' - assert self.indices, 'sort_and_shuffle not called' - assert self.offsets, 'computeoffset not called' - content = self.dataset.content() - with core.NameScope(read_net.NextName(self.name)): - fields = read_net.ReadRandomBatch( - [self.cursor, self.indices, self.offsets] + ( - content.field_blobs()), - content.field_names(), - batch_size=self.batch_size, - enforce_batch_size=self.enforce_batch_size, - loop_over=self.loop_over) - fields = core.output_to_list(fields) - return (read_net.IsEmpty([fields[0]]), fields) - - -class _DatasetWriter(Writer): - def __init__(self, content): - """Don't call this directly. Use dataset.writer() instead.""" - self._content = content - self.mutex = None - - def setup_ex(self, init_net, exit_net): - if self.mutex is None: - self.mutex = init_net.CreateMutex([]) - - def write(self, writer_net, fields): - """ - Add operations to `net` that append the blobs in `fields` to the end - of the dataset. An additional operator will also be added that checks - the consistency of the data in `fields` against the dataset schema. - - Args: - writer_net: The net that will contain the Append operators. - fields: A list of BlobReference to be appended to this dataset. - """ - assert self.mutex is not None, 'setup not called.' - field_blobs = self._content.field_blobs() - assert len(fields) == len(field_blobs), ( - 'Expected %s fields, got %s.' % (len(field_blobs), len(fields))) - writer_net.CheckDatasetConsistency( - fields, [], fields=self._content.field_names()) - writer_net.AtomicAppend( - [self.mutex] + field_blobs + list(fields), - field_blobs) - - def commit(self, finish_net): - """Commit is a no-op for an in-memory dataset.""" - pass - - -def Const(net, value, dtype=None, name=None): - """ - Create a 'constant' by first creating an external input in the given - net, and then feeding the corresponding blob with its provided value - in the current workspace. The name is automatically generated in order - to avoid clashes with existing blob names. - """ - assert isinstance(net, core.Net), 'net must be a core.Net instance.'
- value = np.array(value, dtype=dtype) - blob = net.AddExternalInput(net.NextName(prefix=name)) - workspace.FeedBlob(str(blob), value) - return blob - - -def execution_step_with_progress(name, init_net, substeps, rows_read): - # progress reporter - report_net = core.Net('report_net') - report_net.Print([rows_read], []) - return core.execution_step( - name, - substeps, - report_net=report_net, - concurrent_substeps=True, - report_interval=5) - - -class Dataset: - """Represents an in-memory dataset with fixed schema. - - Use this to store and iterate through datasets with complex schema that - fit in memory. - - Iterating through entries of this dataset is very fast since the dataset - is stored as a set of native Caffe2 tensors, thus no type conversion or - deserialization is necessary. - """ - - def __init__(self, fields, name=None): - """Create an un-initialized dataset with schema provided by `fields`. - - Before this dataset can be used, it must be initialized, either by - `init_empty` or `init_from_dataframe`. - - Args: - fields: either a schema.Struct or a list of field names in a format - compatible with the one described in schema.py. - name: optional name to prepend to blobs that will store the data. - """ - assert isinstance(fields, list) or isinstance(fields, Struct), ( - 'fields must be either a Struct or a list of raw field names.') - if isinstance(fields, list): - fields = from_column_list(fields) - self.schema = fields - self.fields = fields.field_names() - self.field_types = fields.field_types() - self.name = name or 'dataset' - self.field_blobs = fields.field_blobs() if fields.has_blobs() else None - - def trim(self, net, multiple_of): - """ - Trims the contents of this dataset so that the number of records is - a multiple of the given argument. - """ - net.TrimDataset( - self.field_blobs, - self.field_blobs, - fields=self.fields, - multiple_of=multiple_of) - - def init_empty(self, init_net): - """Initialize the blobs for this dataset with empty values. - - Empty arrays will be immediately fed into the current workspace, - and `init_net` will take those blobs as external inputs. - """ - self.field_blobs = InitEmptyRecord( - init_net, self.schema.clone_schema()).field_blobs() - - def init_from_dataframe(self, net, dataframe): - """Initialize the blobs for this dataset from a Pandas dataframe. - - Each column of the dataframe will be immediately fed into the current - workspace, and the `net` will take these blobs as external inputs. - """ - assert len(self.fields) == len(dataframe.columns) - self.field_blobs = [ - Const(net, dataframe.as_matrix([col]).flatten(), name=field) - for col, field in enumerate(self.fields)] - - def get_blobs(self): - """ - Return the list of BlobReference pointing to the blobs that contain - the data for this dataset. - """ - assert self - return self.field_blobs - - def content(self): - """ - Return a Record of BlobReferences pointing to the full content of - this dataset. - """ - return from_blob_list(self.schema, self.field_blobs) - - def field_names(self): - """Return the list of field names for this dataset.""" - return self.fields - - def field_types(self): - """ - Return the list of field dtypes for this dataset. - - If a list of strings, not a schema.Struct, was passed to the - constructor, this will return a list of dtype(np.void). - """ - return self.field_types - - def reader(self, init_net=None, cursor_name=None, batch_size=1, - enforce_batch_size=False): - """Create a Reader object that is used to iterate through the dataset.
- - This will append operations to `init_net` that create a TreeCursor, - used to iterate through the data. - - NOTE: Currently, it is not safe to append to a dataset while reading. - - Args: - init_net: net that will be run once to create the cursor. - cursor_name: optional name for the blob containing a pointer - to the cursor. - batch_size: how many samples to read per iteration. - - Returns: - A _DatasetReader that can be used to create operators that will - iterate through the dataset. - """ - assert self.field_blobs, 'Dataset not initialized.' - reader = _DatasetReader(self, cursor_name, batch_size, - enforce_batch_size) - if init_net is not None: - reader.setup_ex(init_net, None) - return reader - - def random_reader(self, init_net=None, indices=None, cursor_name=None, - batch_size=1, loop_over=False, enforce_batch_size=False): - """Create a Reader object that is used to iterate through the dataset. - - NOTE: The reader order depends on the order in indices. - - Args: - init_net: net that will be run once to create the cursor. - indices: blob of reading order - cursor_name: optional name for the blob containing a pointer - to the cursor. - batch_size: how many samples to read per iteration. - loop_over: repeat the dataset indefinitely (in the same order) - - Returns: - A _DatasetRandomReader that can be used to create operators that will - iterate through the dataset according to indices. - """ - assert self.field_blobs, 'Dataset not initialized.' - reader = _DatasetRandomReader( - self, cursor_name, indices, batch_size, loop_over, - enforce_batch_size) - if init_net is not None: - reader.setup_ex(init_net, None) - return reader - - def writer(self, init_net=None): - """Create a Writer that can be used to append entries into the dataset. - - NOTE: Currently, it is not safe to append to a dataset - while reading from it. - NOTE: Currently the implementation of writer is not thread safe. - TODO: fixme - - Args: - init_net: net that will be run once in order to create the writer. - (currently not used) - """ - assert self.field_blobs, 'Dataset not initialized.' - writer = _DatasetWriter(self.content()) - if init_net is not None: - writer.setup_ex(init_net, None) - return writer diff --git a/caffe2/python/db_file_reader.py b/caffe2/python/db_file_reader.py deleted file mode 100644 index 7b1f2cccae0e..000000000000 --- a/caffe2/python/db_file_reader.py +++ /dev/null @@ -1,182 +0,0 @@ -## @package db_file_reader -# Module caffe2.python.db_file_reader - - - - - -from caffe2.python import core, scope, workspace, _import_c_extension as C -from caffe2.python.dataio import Reader -from caffe2.python.dataset import Dataset -from caffe2.python.schema import from_column_list - -import os - - -class DBFileReader(Reader): - - default_name_suffix = 'db_file_reader' - - """Reader reads from a DB file. - - Example usage: - db_file_reader = DBFileReader(db_path='/tmp/cache.db', db_type='LevelDB') - - Args: - db_path: str. - db_type: str. DB type of file. A db_type is registered by - `REGISTER_CAFFE2_DB(<db_type>, <db_class>)`. - name: str or None. Name of DBFileReader. - Optional name to prepend to blobs that will store the data. - Defaults to '<db_name>_<default_name_suffix>'. - batch_size: int. - How many examples are read each time the read_net is run. - loop_over: bool. - If True, will go through examples in random order endlessly. - field_names: List[str]. Must be specified if schema.field_names() - is not in alphabetical order. - Otherwise, the schema will be automatically restored with - schema.field_names() sorted in alphabetical order.
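Putting the removed pieces together, a hedged sketch of draining a DBFileReader into an in-memory Dataset, mirroring the helpers in the deleted dataio_test.py; the db_path is illustrative and a LevelDB-enabled pre-removal build is assumed:

```python
# Hedged sketch: drain a DBFileReader into a destination Dataset,
# following the deleted test helpers. The db_path is illustrative.
from caffe2.python import core, workspace
from caffe2.python.dataset import Dataset
from caffe2.python.db_file_reader import DBFileReader
from caffe2.python.pipeline import pipe
from caffe2.python.session import LocalSession
from caffe2.python.task import TaskGroup

ws = workspace.C.Workspace()
session = LocalSession(ws)
reader = DBFileReader(db_path='/tmp/cache.db', db_type='LevelDB')

dst_init = core.Net('dst_init')
dst = Dataset(reader.schema().clone_schema(), name='dst')
dst.init_empty(dst_init)
ws.run(dst_init)

with TaskGroup() as tg:
    pipe(reader, dst.writer(), num_runtime_threads=8)
session.run(tg)  # dst now holds every record from the DB file
```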
- """ - def __init__( - self, - db_path, - db_type, - name=None, - batch_size=100, - loop_over=False, - field_names=None, - ): - assert db_path is not None, "db_path can't be None." - assert db_type in C.registered_dbs(), \ - "db_type [{db_type}] is not available. \n" \ - "Choose one of these: {registered_dbs}.".format( - db_type=db_type, - registered_dbs=C.registered_dbs(), - ) - - self.db_path = os.path.expanduser(db_path) - self.db_type = db_type - self.name = name or '{db_name}_{default_name_suffix}'.format( - db_name=self._extract_db_name_from_db_path(), - default_name_suffix=self.default_name_suffix, - ) - self.batch_size = batch_size - self.loop_over = loop_over - - # Before self._init_reader_schema(...), - # self.db_path and self.db_type are required to be set. - super().__init__(self._init_reader_schema(field_names)) - self.ds = Dataset(self._schema, self.name + '_dataset') - self.ds_reader = None - - def _init_name(self, name): - return name or self._extract_db_name_from_db_path( - ) + '_db_file_reader' - - def _init_reader_schema(self, field_names=None): - """Restore a reader schema from the DB file. - - If `field_names` given, restore scheme according to it. - - Overwise, loade blobs from the DB file into the workspace, - and restore schema from these blob names. - It is also assumed that: - 1). Each field of the schema have corresponding blobs - stored in the DB file. - 2). Each blob loaded from the DB file corresponds to - a field of the schema. - 3). field_names in the original schema are in alphabetic order, - since blob names loaded to the workspace from the DB file - will be in alphabetic order. - - Load a set of blobs from a DB file. From names of these blobs, - restore the DB file schema using `from_column_list(...)`. - - Returns: - schema: schema.Struct. Used in Reader.__init__(...). - """ - if field_names: - return from_column_list(field_names) - - if self.db_type == "log_file_db": - assert os.path.exists(self.db_path), \ - 'db_path [{db_path}] does not exist'.format(db_path=self.db_path) - with core.NameScope(self.name): - # blob_prefix is for avoiding name conflict in workspace - blob_prefix = scope.CurrentNameScope() - workspace.RunOperatorOnce( - core.CreateOperator( - 'Load', - [], - [], - absolute_path=True, - db=self.db_path, - db_type=self.db_type, - load_all=True, - add_prefix=blob_prefix, - ) - ) - col_names = [ - blob_name[len(blob_prefix):] for blob_name in sorted(workspace.Blobs()) - if blob_name.startswith(blob_prefix) - ] - schema = from_column_list(col_names) - return schema - - def setup_ex(self, init_net, finish_net): - """From the Dataset, create a _DatasetReader and setup a init_net. - - Make sure the _init_field_blobs_as_empty(...) is only called once. - - Because the underlying NewRecord(...) creats blobs by calling - NextScopedBlob(...), so that references to previously-initiated - empty blobs will be lost, causing accessibility issue. 
- """ - if self.ds_reader: - self.ds_reader.setup_ex(init_net, finish_net) - else: - self._init_field_blobs_as_empty(init_net) - self._feed_field_blobs_from_db_file(init_net) - self.ds_reader = self.ds.random_reader( - init_net, - batch_size=self.batch_size, - loop_over=self.loop_over, - ) - self.ds_reader.sort_and_shuffle(init_net) - self.ds_reader.computeoffset(init_net) - - def read(self, read_net): - assert self.ds_reader, 'setup_ex must be called first' - return self.ds_reader.read(read_net) - - def _init_field_blobs_as_empty(self, init_net): - """Initialize dataset field blobs by creating an empty record""" - with core.NameScope(self.name): - self.ds.init_empty(init_net) - - def _feed_field_blobs_from_db_file(self, net): - """Load from the DB file at db_path and feed dataset field blobs""" - if self.db_type == "log_file_db": - assert os.path.exists(self.db_path), \ - 'db_path [{db_path}] does not exist'.format(db_path=self.db_path) - net.Load( - [], - self.ds.get_blobs(), - db=self.db_path, - db_type=self.db_type, - absolute_path=True, - source_blob_names=self.ds.field_names(), - ) - - def _extract_db_name_from_db_path(self): - """Extract DB name from DB path - - E.g. given self.db_path=`/tmp/sample.db`, or - self.db_path = `dper_test_data/cached_reader/sample.db` - it returns `sample`. - - Returns: - db_name: str. - """ - return os.path.basename(self.db_path).rsplit('.', 1)[0] diff --git a/caffe2/python/db_test.py b/caffe2/python/db_test.py deleted file mode 100644 index f0f5d2770dc0..000000000000 --- a/caffe2/python/db_test.py +++ /dev/null @@ -1,46 +0,0 @@ - - - - - -from caffe2.python import workspace - -import os -import tempfile -import unittest - - -class TestDB(unittest.TestCase): - def setUp(self): - handle, self.file_name = tempfile.mkstemp() - os.close(handle) - self.data = [ - ( - "key{}".format(i).encode("ascii"), - "value{}".format(i).encode("ascii") - ) - for i in range(1, 10) - ] - - def testSimple(self): - db = workspace.C.create_db( - "minidb", self.file_name, workspace.C.Mode.write) - - for key, value in self.data: - transaction = db.new_transaction() - transaction.put(key, value) - del transaction - - del db # should close DB - - db = workspace.C.create_db( - "minidb", self.file_name, workspace.C.Mode.read) - cursor = db.new_cursor() - data = [] - while cursor.valid(): - data.append((cursor.key(), cursor.value())) - cursor.next() # noqa: B305 - del cursor - - db.close() # test explicit db closer - self.assertEqual(data, self.data) diff --git a/caffe2/python/device_checker.py b/caffe2/python/device_checker.py deleted file mode 100644 index 3385f1e2c046..000000000000 --- a/caffe2/python/device_checker.py +++ /dev/null @@ -1,119 +0,0 @@ -## @package device_checker -# Module caffe2.python.device_checker -import numpy as np -import copy -from caffe2.python import workspace -from caffe2.python.core import InferOpBlobDevicesAsDict - - -class DeviceChecker: - """A device checker in Python to check consistency across multiple devices. - - This is not the most efficient way to check devices, as the Python interface - will involve a lot of copies back and forth operations. Use at your own risk. - """ - - def __init__(self, threshold, device_options): - self._threshold = threshold - self._device_options = device_options - - def CheckSimple(self, op, inputs, outputs_to_check, - input_device_options=None): - """Checks the operator with different device implementations. - - Inputs: - op: the operator to be checked. - inputs: the input data in numpy arrays. 
- outputs_to_check: the outputs to check between devices. - input_device_options: a mapping from input name to a device to use - (instead of self._device_options) - Outputs: - boolean: True if it passes, False if it does not pass. - """ - op = copy.deepcopy(op) - # Entering the checker workspace - old_ws_name = workspace.CurrentWorkspace() - results = [] - workspace.SwitchWorkspace("_device_check_", True) - for i, device_option in enumerate(self._device_options): - op.device_option.CopyFrom(device_option) - _input_device_options = input_device_options or \ - InferOpBlobDevicesAsDict(op)[0] - print(_input_device_options) - for i, arr in enumerate(inputs): - workspace.FeedBlob( - op.input[i], np.array(arr), - _input_device_options.get(op.input[i], device_option) - ) - workspace.RunOperatorOnce(op) - results.append( - [workspace.FetchBlob(op.output[idx]) - for idx in outputs_to_check]) - # Everything is done, reset the workspace. - workspace.ResetWorkspace() - # After running on all devices, check correctness - success = True - for i in range(1, len(self._device_options)): - for j in range(len(outputs_to_check)): - x = results[i][j] - y = results[0][j] - if not np.allclose(x, y, - atol=self._threshold, rtol=self._threshold): - print('Failure in checking device option {}' - ' and output {}. The outputs are:' - .format(i, op.output[outputs_to_check[j]])) - print(x.flatten()) - print(y.flatten()) - print(np.max(np.abs(x - y))) - success = False - # else: - # print ('Passed device pair (0, %d), %s %s' % - # (i, outputs_to_check[j], y.shape)) - workspace.SwitchWorkspace(old_ws_name) - return success - - def CheckNet(self, net, inputs=None, blobs_to_check=None, ignore=None): - """Checks a network by inspecting all of its intermediate results, and - see if things match. - """ - if inputs is None: - inputs = {} - if ignore is None: - ignore = set() - old_ws_name = workspace.CurrentWorkspace() - results = [] - if blobs_to_check is None: - blobs_to_check = sum([list(op.output) for op in net.op], []) - blobs_to_check = [b for b in blobs_to_check if b not in ignore] - workspace.SwitchWorkspace("_device_check_", True) - for device_option in self._device_options: - for name, arr in inputs.items(): - # print 'feeding', name - workspace.FeedBlob(name, arr, device_option) - for op in net.op: - op.device_option.CopyFrom(device_option) - workspace.RunNetOnce(net) - results.append( - [workspace.FetchBlob(name) for name in blobs_to_check] - ) - # After running on all devices, check correctness - success = True - for i in range(1, len(results)): - for j in range(len(blobs_to_check)): - x = results[i][j] - y = results[0][j] - if not np.allclose(x, y, - atol=self._threshold, rtol=self._threshold): - print('Failure in checking device option {}' - ' and output {}. The outputs are:' - .format(i, blobs_to_check[j])) - print(x.flatten()) - print(y.flatten()) - print(np.max(np.abs(x - y))) - success = False - # else: - # print ('Passed device pair (%d, %d), %s %s: %s' % - # (i, j, blobs_to_check[j], y.shape, - # str(y.flatten()))) - workspace.SwitchWorkspace(old_ws_name) - return success diff --git a/caffe2/python/dlpack.h b/caffe2/python/dlpack.h deleted file mode 100644 index 3d33935d31a9..000000000000 --- a/caffe2/python/dlpack.h +++ /dev/null @@ -1,230 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file dlpack.h - * \brief The common header of DLPack. 
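Returning to device_checker.py above, a hedged sketch of how DeviceChecker was typically invoked; the operator, tolerance, and inputs are illustrative, and `caffe2_pb2.CUDA` assumes a CUDA-enabled pre-removal build:

```python
# Hedged sketch of DeviceChecker usage; Relu and the tolerance are
# illustrative choices, not the only supported ones.
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.device_checker import DeviceChecker

cpu = caffe2_pb2.DeviceOption()  # defaults to CPU
gpu = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA)
checker = DeviceChecker(threshold=1e-4, device_options=[cpu, gpu])

op = core.CreateOperator('Relu', ['x'], ['y'])
x = np.random.randn(4, 4).astype(np.float32)
assert checker.CheckSimple(op, [x], [0])  # compare output 0 across devices
```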
- */ -#ifndef DLPACK_DLPACK_H_ -#define DLPACK_DLPACK_H_ - -/** - * \brief Compatibility with C++ - */ -#ifdef __cplusplus -#define DLPACK_EXTERN_C extern "C" -#else -#define DLPACK_EXTERN_C -#endif - -/*! \brief The current version of dlpack */ -#define DLPACK_VERSION 70 - -/*! \brief The current ABI version of dlpack */ -#define DLPACK_ABI_VERSION 1 - -/*! \brief DLPACK_DLL prefix for windows */ -#ifdef _WIN32 -#ifdef DLPACK_EXPORTS -#define DLPACK_DLL __declspec(dllexport) -#else -#define DLPACK_DLL __declspec(dllimport) -#endif -#else -#define DLPACK_DLL -#endif - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif -/*! - * \brief The device type in DLDevice. - */ -#ifdef __cplusplus -typedef enum : int32_t { -#else -typedef enum { -#endif - /*! \brief CPU device */ - kDLCPU = 1, - /*! \brief CUDA GPU device */ - kDLCUDA = 2, - /*! - * \brief Pinned CUDA CPU memory by cudaMallocHost - */ - kDLCUDAHost = 3, - /*! \brief OpenCL devices. */ - kDLOpenCL = 4, - /*! \brief Vulkan buffer for next generation graphics. */ - kDLVulkan = 7, - /*! \brief Metal for Apple GPU. */ - kDLMetal = 8, - /*! \brief Verilog simulator buffer */ - kDLVPI = 9, - /*! \brief ROCm GPUs for AMD GPUs */ - kDLROCM = 10, - /*! - * \brief Pinned ROCm CPU memory allocated by hipMallocHost - */ - kDLROCMHost = 11, - /*! - * \brief Reserved extension device type, - * used for quickly test extension device - * The semantics can differ depending on the implementation. - */ - kDLExtDev = 12, - /*! - * \brief CUDA managed/unified memory allocated by cudaMallocManaged - */ - kDLCUDAManaged = 13, - /*! - * \brief Unified shared memory allocated on a oneAPI non-partititioned - * device. Call to oneAPI runtime is required to determine the device - * type, the USM allocation type and the sycl context it is bound to. - * - */ - kDLOneAPI = 14, - /*! \brief GPU support for next generation WebGPU standard. */ - kDLWebGPU = 15, - /*! \brief Qualcomm Hexagon DSP */ - kDLHexagon = 16, -} DLDeviceType; - -/*! - * \brief A Device for Tensor and operator. - */ -typedef struct { - /*! \brief The device type used in the device. */ - DLDeviceType device_type; - /*! - * \brief The device index. - * For vanilla CPU memory, pinned memory, or managed memory, this is set to 0. - */ - int32_t device_id; -} DLDevice; - -/*! - * \brief The type code options DLDataType. - */ -typedef enum { - /*! \brief signed integer */ - kDLInt = 0U, - /*! \brief unsigned integer */ - kDLUInt = 1U, - /*! \brief IEEE floating point */ - kDLFloat = 2U, - /*! - * \brief Opaque handle type, reserved for testing purposes. - * Frameworks need to agree on the handle data type for the exchange to be - * well-defined. - */ - kDLOpaqueHandle = 3U, - /*! \brief bfloat16 */ - kDLBfloat = 4U, - /*! - * \brief complex number - * (C/C++/Python layout: compact struct per complex number) - */ - kDLComplex = 5U, -} DLDataTypeCode; - -/*! - * \brief The data type the tensor can hold. The data type is assumed to follow - * the native endian-ness. An explicit error message should be raised when - * attempting to export an array with non-native endianness - * - * Examples - * - float: type_code = 2, bits = 32, lanes=1 - * - float4(vectorized 4 float): type_code = 2, bits = 32, lanes=4 - * - int8: type_code = 0, bits = 8, lanes=1 - * - std::complex: type_code = 5, bits = 64, lanes = 1 - */ -typedef struct { - /*! - * \brief Type code of base types. 
- * We keep it uint8_t instead of DLDataTypeCode for minimal memory - * footprint, but the value should be one of DLDataTypeCode enum values. - * */ - uint8_t code; - /*! - * \brief Number of bits, common choices are 8, 16, 32. - */ - uint8_t bits; - /*! \brief Number of lanes in the type, used for vector types. */ - uint16_t lanes; -} DLDataType; - -/*! - * \brief Plain C Tensor object, does not manage memory. - */ -typedef struct { - /*! - * \brief The data pointer points to the allocated data. This will be CUDA - * device pointer or cl_mem handle in OpenCL. It may be opaque on some device - * types. This pointer is always aligned to 256 bytes as in CUDA. The - * `byte_offset` field should be used to point to the beginning of the data. - * - * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, - * TVM, perhaps others) do not adhere to this 256 byte aligment requirement - * on CPU/CUDA/ROCm, and always use `byte_offset=0`. This must be fixed - * (after which this note will be updated); at the moment it is recommended - * to not rely on the data pointer being correctly aligned. - * - * For given DLTensor, the size of memory required to store the contents of - * data is calculated as follows: - * - * \code{.c} - * static inline size_t GetDataSize(const DLTensor* t) { - * size_t size = 1; - * for (tvm_index_t i = 0; i < t->ndim; ++i) { - * size *= t->shape[i]; - * } - * size *= (t->dtype.bits * t->dtype.lanes + 7) / 8; - * return size; - * } - * \endcode - */ - void* data; - /*! \brief The device of the tensor */ - DLDevice device; - /*! \brief Number of dimensions */ - int32_t ndim; - /*! \brief The data type of the pointer*/ - DLDataType dtype; - /*! \brief The shape of the tensor */ - int64_t* shape; - /*! - * \brief strides of the tensor (in number of elements, not bytes) - * can be NULL, indicating tensor is compact and row-majored. - */ - int64_t* strides; - /*! \brief The offset in bytes to the beginning pointer to data */ - uint64_t byte_offset; -} DLTensor; - -/*! - * \brief C Tensor object, manage memory of DLTensor. This data structure is - * intended to facilitate the borrowing of DLTensor by another framework. It is - * not meant to transfer the tensor. When the borrowing framework doesn't need - * the tensor, it should call the deleter to notify the host that the resource - * is no longer needed. - */ -typedef struct DLManagedTensor { - /*! \brief DLTensor which is being memory managed */ - DLTensor dl_tensor; - /*! \brief the context of the original host framework of DLManagedTensor in - * which DLManagedTensor is used in the framework. It can also be NULL. - */ - void* manager_ctx; - /*! \brief Destructor signature void (*)(void*) - this should be called - * to destruct manager_ctx which holds the DLManagedTensor. It can be NULL - * if there is no way for the caller to provide a reasonable destructor. - * The destructors deletes the argument self as well. 
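Reviewer note: the `GetDataSize` rule quoted above translates directly into Python — element count times `(bits * lanes + 7) // 8` bytes per element. A small sketch (hypothetical helper, mirroring the C snippet):

```python
from math import prod

def dl_tensor_nbytes(shape, bits, lanes=1):
    # prod(shape) elements, each (bits * lanes + 7) // 8 bytes, as in GetDataSize
    return prod(shape) * ((bits * lanes + 7) // 8)

assert dl_tensor_nbytes([2, 3], bits=32) == 24        # 6 x float32
assert dl_tensor_nbytes([5], bits=16, lanes=4) == 40  # 5 x 4-lane half vectors
```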
- */ - void (*deleter)(struct DLManagedTensor* self); -} DLManagedTensor; -#ifdef __cplusplus -} // DLPACK_EXTERN_C -#endif -#endif // DLPACK_DLPACK_H_ diff --git a/caffe2/python/docs/formatter.py b/caffe2/python/docs/formatter.py deleted file mode 100644 index 982a05255e2d..000000000000 --- a/caffe2/python/docs/formatter.py +++ /dev/null @@ -1,104 +0,0 @@ -## @package formatter -# Module caffe2.python.docs.formatter - - - - -from caffe2.python.docs.parser import Parser - - -class Formatter: - def __init__(self): - self.content = "" - - def clone(self): - return self.__class__() - - def dump(self): - return self.content - - def parseAndAdd(self, text): - text = Parser(text, self).parse() - self.addRaw(text) - - def addRaw(self, text): - raise Exception('Not yet implemented.') - - def addLine(self, text): - raise Exception('Not yet implemented.') - - def addLinebreak(self): - raise Exception('Not yet implemented.') - - def addHeader(self, text): - raise Exception('Not yet implemented.') - - def addEmphasis(self, text): - raise Exception('Not yet implemented.') - - def addList(self, textList): - raise Exception('Not yet implemented.') - - def addLink(self, text, url): - raise Exception('Not yet implemented.') - - def addCode(self, text): - raise Exception('Not yet implemented.') - - def addCodeLink(self, text): - raise Exception('Not yet implemented.') - - def addTable(self, table): - raise Exception('Not yet implemented.') - - def addBreak(self): - raise Exception('Not yet implemented.') - - -class Markdown(Formatter): - def addRaw(self, text): - self.content += "{text}".format(text=text) - - def addLine(self, text, new_line=False): - self.content += "{line}{text}\n".format(line=('\n' if new_line else ''), - text=text) - - def addLinebreak(self): - self.content += "\n" - - def addHeader(self, text, h=1): - self.addLine("{header} {text}".format(header=h * '#', text=text), True) - - def addEmphasis(self, text, s=1): - self.addRaw("{stars}{text}{stars}".format(stars=s * '*', text=text)) - - def addList(self, textList): - for text in textList: - self.addLine("- {text}".format(text=text), True) - self.addLinebreak() - - def addLink(self, text, url): - self.addRaw("[{text}]({url})".format(text=text, url=url)) - - def addCodeLink(self, path, options=None): - self.addRaw("({path})".format(path=path)) - - def addCode(self, text, inline=False): - if (inline): - self.content += "`{text}`".format(text=text) - else: - self.addRaw("\n\n```\n{text}```\n\n".format(text=text)) - - def addTable(self, table, noTitle=False): - self.addLinebreak() - assert(len(table) > 1) - if noTitle: - table.insert(0, [' ' for i in range(len(table[0]))]) - self.addLine(' | '.join(table[0])) - self.addLine(' | '.join(['----' for i in range(len(table[0]))])) - for row in table[1:]: - self.addLine(' | '.join(row)) - self.addLinebreak() - - def addBreak(self): - self.addLine('\n---\n', True) diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py deleted file mode 100644 index 0a2cca904c05..000000000000 --- a/caffe2/python/docs/generator.py +++ /dev/null @@ -1,230 +0,0 @@ -## @package generator -# Module caffe2.python.docs.generator - - - - -import argparse -import os -from caffe2.python import core, workspace -from caffe2.python.docs.formatter import Markdown - -OpSchema = workspace.C.OpSchema - - -class DocUploader: - def __init__(self): - pass - - def upload(self, text): - pass - - -class DocGenerator: - def __init__(self, formatter, uploader): - self.formatter = formatter - self.uploader = uploader - 
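Reviewer note: for reference, the `Markdown` formatter deleted above was driven like this (an illustrative run against the pre-removal module; the signatures are the ones shown in the deleted file):

```python
from caffe2.python.docs.formatter import Markdown  # module removed by this diff

f = Markdown()
f.addHeader("Relu", h=2)
f.addLine("Applies rectified linear unit elementwise.", new_line=True)
f.addCode("y = relu(x)")
f.addTable([["name", "doc"], ["X", "input tensor"]])  # addTable needs > 1 row
print(f.dump())
```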
self.content_body = "" - - def create_body(self): - pass - - def update(self): - self.uploader.upload(self.content_body) - - -class OpDocGenerator(DocGenerator): - def getOperatorDoc(self, name, schema, priority): - return OperatorDoc(name, schema, priority) - - def getOperatorEngine(self, name): - return OperatorEngine(name) - - def getOperators(self): - # map: op_name -> operator - self.operators = {} - # map: op_name -> [engine, engine] - self.engines = {} - - def filePriority(x): - if x == "caffe2/caffe2/operators": - return 0 - if 'contrib' in x.split('/'): - return 2 - if 'experiments' in x.split('/'): - return 3 - return 1 - - for name in core._GetRegisteredOperators(): - schema = OpSchema.get(name) - if schema: - priority = filePriority(os.path.dirname(schema.file)) - operator = self.getOperatorDoc(name, schema, priority) - self.operators[name] = operator - - # Engine - elif name.find("_ENGINE_") != -1: - engine = self.getOperatorEngine(name) - if engine.base_op_name in self.engines: - self.engines[engine.base_op_name].append(engine) - else: - self.engines[engine.base_op_name] = [engine] - - # No schema - else: - priority = 4 - self.operators[name] = self.getOperatorDoc(name, schema, priority) - - for name, engines in self.engines.items(): - if name in self.operators: - self.operators[name].addEngines(engines) - - # Generate a sorted list of operators - return sorted( - self.operators.values(), - key=lambda op: (op.priority, op.name) - ) - - def createBody(self): - operators = self.getOperators() - - for operator in operators: - operator.generateSchema(self.formatter) - - self.content_body += self.formatter.dump() - - -class OperatorEngine: - def __init__(self, name): - self.op_name = name - self.base_op_name, self.engine = name.split("_ENGINE_", 1) - - def getDeviceImpl(self): - deviceImplList = [] - for device, impl in [('CPU', OpSchema.get_cpu_impl(self.op_name)), - ('CUDA', OpSchema.get_cuda_impl(self.op_name))]: - if not impl: - continue - deviceImplList.append((device, impl)) - return deviceImplList - - def generateDoc(self, formatter): - for device, impl in self.getDeviceImpl(): - formatter.addLine( - '{engine} on {device}: {impl}'.format(engine=self.engine, - device=device, - impl=impl)) - - -class OperatorDoc: - def __init__(self, name, schema, priority): - self.name = name - self.schema = schema - self.priority = priority - print("Gathering docs for {}...".format(self.name)) - self.engines = [] - - def addEngines(self, engines): - self.engines = engines - - def generateDoc(self, formatter): - if self.schema.doc: - formatter.parseAndAdd(self.schema.doc) - formatter.addLinebreak() - else: - formatter.addLine("No documentation yet.") - - def generateTable(self, formatter, tuples, title_row, title): - if tuples: - if title: - formatter.addHeader(title, 3) - table = [] - if title_row: - table = [title_row] - for name, doc in tuples: - table.append([name, doc or '']) - formatter.addTable(table, (table == [])) - - def generateInterface(self, formatter): - def makeDesc(title, args): - f = formatter.clone() - f.addEmphasis(title, 1) - out = [(f.dump(), '')] - for arg in args: - f = formatter.clone() - if isinstance(arg, tuple): - name = arg[0] - if len(arg) > 1: - description = arg[1] or '' - else: - description = '' - else: - name = arg.name - description = arg.description or '' - f.addCode(name, inline=True) - out.append((f.dump(), description or '')) - return out - - tuples = [] - - if self.schema.args: - tuples += makeDesc('Arguments', self.schema.args) - - if 
self.schema.input_desc: - tuples += makeDesc('Inputs', self.schema.input_desc) - - if self.schema.output_desc: - tuples += makeDesc('Outputs', self.schema.output_desc) - - self.generateTable(formatter, tuples, None, 'Interface') - print("Generated interface for {}".format(self.name)) - - def generateCodeLink(self, formatter): - formatter.addHeader("Code", 3) - formatter.addLinebreak() - formatter.addCodeLink(self.schema.file) - - def getInfo(self, formatter, name, impl): - pass - - def generateDevices(self, formatter): - formatter.addHeader("Devices", 3) - devices = [ - self.getInfo(formatter, - 'CPU', OpSchema.get_cpu_impl(self.name)), - self.getInfo(formatter, - 'GPU', OpSchema.get_cuda_impl(self.name)), - ] - formatter.addList([i for i in devices if i]) - - def generateEngines(self, formatter): - if not len(self.engines): - return - formatter.addHeader("Engines", 3) - for engine in self.engines: - engine.generateDoc(formatter) - - def generateSchema(self, formatter): - formatter.addHeader(self.name, 2) - if self.schema: - self.generateDoc(formatter) - self.generateInterface(formatter) - self.generateCodeLink(formatter) - self.generateDevices(formatter) - self.generateEngines(formatter) - formatter.addBreak() - else: - formatter.addLine("No schema documented yet.") - self.generateDevices(formatter) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Operators catalog generator.") - parser.add_argument('catalog_path', type=str, - help='operators-catalogue.md to write out to') - args = parser.parse_args() - - with open(args.catalog_path, 'w') as fp: - ops = OpDocGenerator(Markdown(), DocUploader()) - ops.createBody() - fp.write(ops.content_body) diff --git a/caffe2/python/docs/github.py b/caffe2/python/docs/github.py deleted file mode 100644 index ff9328cbcd56..000000000000 --- a/caffe2/python/docs/github.py +++ /dev/null @@ -1,125 +0,0 @@ -## @package github -# Module caffe2.python.docs.github - - - - -import argparse -import os -from caffe2.python.docs.formatter import Markdown -from caffe2.python.docs.generator import OpDocGenerator, DocUploader -from caffe2.python.docs.generator import OperatorDoc, OperatorEngine - - -class GHOpDocUploader(DocUploader): - def __init__(self): - pass - - def upload(self, content_body): - print(content_body) - - -class GHMarkdown(Markdown): - def addHeader(self, text, h=1): - self.addLine("\n{header} {text}\n".format(header=h * '#', text=text), True) - - def addDocHeader(self): - self.addLine("---") - self.addLine("docid: operators-catalog") - self.addLine("title: Operators Catalog") - self.addLine("layout: operators") - self.addLine("permalink: /docs/operators-catalogue.html") - self.addLine("---") - self.addLine("* TOC") - self.addLine("{:toc}") - - def addTable(self, table, noTitle=False): - self.addLinebreak() - assert(len(table) > 1) - self.addLine(' | '.join(['----------' for i in range(len(table[0]))])) - self.addLine(' | '.join(table[0])) - for row in table[1:]: - self.addLine(' | '.join(row)) - - def addTableHTML(self, table, noTitle=False): - self.addRaw("") - for row in table: - self.addRaw("") - for cell in row: - self.addRaw("") - self.addRaw("") - self.addRaw("
") - self.addLine("{cell}".format(cell=cell)) - self.addRaw("
") - -def getCodeLink(formatter, schema): - formatter = formatter.clone() - path = os.path.relpath(schema.file, "caffe2") - schemaLink = ('https://github.com/pytorch/pytorch/blob/main/{path}' - .format(path=path)) - formatter.addLink('{path}'.format(path=path), schemaLink) - return formatter.dump() - - -class GHOperatorEngine(OperatorEngine): - def generateDoc(self, formatter): - for device, _ in self.getDeviceImpl(): - formatter.addCode('{engine}'.format(engine=self.engine), True) - if device: - formatter.addRaw(' on ') - formatter.addEmphasis("{device}".format(device=device), 1) - - -class GHOperatorDoc(OperatorDoc): - def generateCodeLink(self, formatter): - formatter.addHeader("Code", 3) - formatter.addLinebreak() - formatter.addRaw(getCodeLink(formatter, self.schema)) - - def getInfo(self, formatter, name, impl): - formatter = formatter.clone() - if impl: - formatter.addEmphasis('{name}'.format(name=name), 1) - formatter.addRaw(' ') - formatter.addCode('{impl}'.format(impl=impl), True) - return formatter.dump() - - def generateSchema(self, formatter): - formatter.addHeader(self.name, 2) - if self.schema: - self.generateDoc(formatter) - self.generateInterface(formatter) - self.generateCodeLink(formatter) - formatter.addBreak() - else: - formatter.addLine("No schema documented yet.") - - -class GHOpDocGenerator(OpDocGenerator): - def getOperatorDoc(self, name, schema, priority): - return GHOperatorDoc(name, schema, priority) - - def getOperatorEngine(self, name): - return GHOperatorEngine(name) - - def createBody(self): - self.formatter.addDocHeader() - operators = self.getOperators() - - for operator in operators: - operator.generateSchema(self.formatter) - - self.content_body += self.formatter.dump() - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Operators catalog generator.") - parser.add_argument('catalog_path', type=str, - help='operators-catalogue.md to write out to') - args = parser.parse_args() - - with open(args.catalog_path, 'w') as fp: - ops = GHOpDocGenerator(GHMarkdown(), GHOpDocUploader) - ops.createBody() - fp.write(ops.content_body) - print("Updated {}!".format(args.catalog_path)) diff --git a/caffe2/python/docs/parser.py b/caffe2/python/docs/parser.py deleted file mode 100644 index 1d8e194a3e86..000000000000 --- a/caffe2/python/docs/parser.py +++ /dev/null @@ -1,96 +0,0 @@ -## @package parser -# Module caffe2.python.docs.parser - - - - -import re - - -class Parser: - # List of tuples (regex_str, lambda(regex_match, formatter)) - # If a lambda returns True it will be called repeatedly with replacement - # otherwise it will only be called on text that hasn't been parsed yet. 
- regexes = [ - # Code blocks of various formats - ('````(.+?)````', - lambda m, f: f.addCode(m.group(1)) - ), - ('```(.+?)```', - lambda m, f: f.addCode(m.group(1)) - ), - (r'((( {2})+)(\S.*)(\n\s*\n|\n))+', - lambda m, f: f.addCode(m.group(0)) - ), - (r'([^\.])\n', - lambda m, f: f.addRaw('{c} '.format(c=m.group(1))) or True - ), - ('`(.+?)`', - lambda m, f: f.addCode(m.group(1), True) - ), - # Make links clickable - ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]' - r'|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', - lambda m, f: f.addLink(m.group(0), m.group(0)) - ), - (r'\*\*(.+?)\*\*', - lambda m, f: f.addEmphasis(m.group(1), 2) - ), - (r'\*(.+?)\*', - lambda m, f: f.addEmphasis(m.group(1), 1) - ), - ] - - def __init__(self, text, formatter): - self.text = text - self.lines = [] - self.formatter = formatter - - def parseText(self): - UNPARSED = 0 - PARSED = 1 - parsed_block = [(UNPARSED, self.text)] - for regex, func in self.regexes: - index = 0 - while index < len(parsed_block): - label, text = parsed_block[index] - - # Already been parsed - if (label == PARSED): - index += 1 - continue - - match = re.search(regex, text) - if match: - parsed_block.pop(index) - start = match.start(0) - end = match.end(0) - - f = self.formatter.clone() - merge = func(match, f) - - if merge: - merged = text[:start] + f.dump() + text[end:] - parsed_block.insert(index, (UNPARSED, merged)) - else: - if text[:start]: - parsed_block.insert(index, - (UNPARSED, text[:start])) - - index += 1 - parsed_block.insert(index, (PARSED, f.dump())) - - index += 1 - if text[end:]: - parsed_block.insert(index, - (UNPARSED, text[end:])) - - else: - index += 1 - - self.lines += [i for _, i in parsed_block] - self.text = ' '.join(self.lines) - - def parse(self): - self.parseText() - return self.text diff --git a/caffe2/python/dyndep.py b/caffe2/python/dyndep.py deleted file mode 100644 index 0382cc3a8212..000000000000 --- a/caffe2/python/dyndep.py +++ /dev/null @@ -1,52 +0,0 @@ -## @package dyndep -# Module caffe2.python.dyndep - - - - - -import ctypes -import os -from threading import Lock -from caffe2.python import core, extension_loader - - -def InitOpsLibrary(name, trigger_lazy=True): - """Loads a dynamic library that contains custom operators into Caffe2. - - Since Caffe2 uses static variable registration, you can optionally load a - separate .so file that contains custom operators and registers that into - the caffe2 core binary. In C++, this is usually done by either declaring - dependency during compilation time, or via dynload. This allows us to do - registration similarly on the Python side. - - Args: - name: a name that ends in .so, such as "my_custom_op.so". Otherwise, - the command will simply be ignored. - Returns: - None - """ - if not os.path.exists(name): - # Note(jiayq): if the name does not exist, instead of immediately - # failing we will simply print a warning, deferring failure to the - # time when an actual call is made. 
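Reviewer note: `InitOpsLibrary` (removed here) was the supported way to pull a custom-operator `.so` into a running Caffe2 process; a typical pre-removal call looked like this (hypothetical library path):

```python
from caffe2.python import dyndep  # module removed by this diff

# Loads the library and refreshes the operator registry; per the docstring
# above, a nonexistent path is only warned about, not a hard failure.
dyndep.InitOpsLibrary("/usr/local/lib/libmy_custom_ops.so")  # hypothetical path
print(dyndep.GetImportedOpsLibraries())
```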
- print('Ignoring {} as it is not a valid file.'.format(name)) - return - _init_impl(name, trigger_lazy=trigger_lazy) - - -_IMPORTED_DYNDEPS = set() -dll_lock = Lock() - - -def GetImportedOpsLibraries(): - return _IMPORTED_DYNDEPS - - -def _init_impl(path, trigger_lazy=True): - with dll_lock: - _IMPORTED_DYNDEPS.add(path) - with extension_loader.DlopenGuard(): - ctypes.CDLL(path) - # reinitialize available ops - core.RefreshRegisteredOperators(trigger_lazy) diff --git a/caffe2/python/embedding_generation_benchmark.py b/caffe2/python/embedding_generation_benchmark.py deleted file mode 100644 index 33dbf757dda4..000000000000 --- a/caffe2/python/embedding_generation_benchmark.py +++ /dev/null @@ -1,196 +0,0 @@ -## @package embedding_generation_benchmark -# Module caffe2.python.embedding_generation_benchmark - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace, core, utils, model_helper - -import argparse -import numpy as np -import time - -import logging - -logging.basicConfig() -log = logging.getLogger("embedding_generation_benchmark") -log.setLevel(logging.DEBUG) - - -def generate_data(T, batch_size, max_seq_length): - ''' - Fill a queue with input data - ''' - log.info("Generating T={} batches".format(T)) - - generate_input_init_net = core.Net('generate_input_init') - queue = generate_input_init_net.CreateBlobsQueue( - [], "inputqueue", num_blobs=1, capacity=T, - ) - workspace.RunNetOnce(generate_input_init_net) - - generate_input_net = core.Net('generate_input') - generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"]) - np.random.seed(2603) - - for t in range(T): - if (t % (max(10, T // 10)) == 0): - log.info("Generating data {}/{}".format(t, T)) - X = np.tile(np.arange(max_seq_length), [batch_size, 1]).transpose() - workspace.FeedBlob("scratch", X) - workspace.RunNetOnce(generate_input_net.Proto()) - - log.info("Finished data generation") - return queue - - -def generate_embedding_table(vocab_size, embedding_size): - log.info("Generating embedding table with dimensions {}" - .format([vocab_size, embedding_size])) - - generate_table_net = core.Net('generate_table') - table = generate_table_net.GaussianFill( - [], - ['embedding_table'], - shape=[vocab_size, embedding_size], - ) - - workspace.RunNetOnce(generate_table_net) - return table - - -def create_model(args, queue, embedding_table, embedding_size): - model = model_helper.ModelHelper(name='embedding_generation_bench') - input_blob = model.net.DequeueBlobs(queue, 'input_data') - - if args.implementation == 'sinusoid': - model.net.SinusoidPositionEncoding( - [input_blob], - ['output'], - embedding_size=embedding_size - ) - else: - model.net.Gather( - [embedding_table, input_blob], - ['output'], - ) - - return model - - -def Caffe2EmbeddingGeneration(args): - T = args.data_size // args.batch_size - - queue = generate_data(T, args.batch_size, args.seq_length) - - embedding_table = None - if args.implementation == 'table': - embedding_table = generate_embedding_table( - args.seq_length, - args.embedding_size, - ) - - model = create_model(args, queue, embedding_table, args.embedding_size) - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - start_time = time.time() - num_iters = T - total_iters = 0 - - # Run the Benchmark - log.info("------ Warming up ------") - workspace.RunNet(model.net.Proto().name) - - log.info("------ Starting benchmark ------") - start_time = time.time() - last_time = time.time() - for iteration in range(1, num_iters, args.iters_to_report): - 
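Reviewer note: the benchmark above compares two ways of producing position embeddings — the `SinusoidPositionEncoding` op versus a `Gather` from a learned table. In plain NumPy the two strategies look roughly like this (a sketch; assuming the op follows the standard Transformer sinusoid formula, which the deleted code does not spell out):

```python
import numpy as np

def sinusoid_encoding(positions, d):
    # Standard sin/cos interleaving; assumes d is even.
    inv_freq = 1.0 / (10000 ** (np.arange(0, d, 2) / d))
    ang = np.outer(positions, inv_freq)
    out = np.zeros((len(positions), d))
    out[:, 0::2] = np.sin(ang)
    out[:, 1::2] = np.cos(ang)
    return out

def table_encoding(positions, table):
    return table[positions]  # the Gather path: a plain row lookup

pos = np.arange(128)                       # seq_length default above
emb_sin = sinusoid_encoding(pos, 512)      # embedding_size default above
emb_tab = table_encoding(pos, np.random.randn(128, 512))
```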
iters_once = min(args.iters_to_report, num_iters - iteration) - total_iters += iters_once - workspace.RunNet(model.net.Proto().name, iters_once) - - new_time = time.time() - log.info( - "Iter: {} / {}. Embeddings Generated Per Second: {}k.".format( - iteration, - num_iters, - (iters_once * args.batch_size * args.seq_length) / - (new_time - last_time) // 100 / 10, - ) - ) - last_time = new_time - - total_per_sec = (num_iters - 1) * args.batch_size * args.seq_length - total_per_sec = total_per_sec / (time.time() - start_time) // 100 / 10 - - log.info("Done. Total embeddings generated per second " + - "excluding 1st iteration: {}k".format(total_per_sec)) - - return time.time() - start_time - - -@utils.debug -def Benchmark(args): - return Caffe2EmbeddingGeneration(args) - - -def GetArgumentParser(): - parser = argparse.ArgumentParser( - description="Embedding generation benchmark." - ) - - parser.add_argument( - "--embedding_size", - type=int, - default=512, - help="Embedding size", - ) - parser.add_argument( - "--batch_size", - type=int, - default=16, - help="The batch size." - ) - parser.add_argument( - "--data_size", - type=int, - default=10000, - help="Number of sequences to generate" - ) - parser.add_argument( - "--seq_length", - type=int, - default=128, - help="Max sequence length" - ) - parser.add_argument( - "--iters_to_report", - type=int, - default=20, - help="Number of iterations to report progress" - ) - parser.add_argument( - "--implementation", - type=str, - default="sinusoid", - help="'table' or 'sinusoid'", - ) - return parser - - -if __name__ == '__main__': - args, extra_args = GetArgumentParser().parse_known_args() - - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_print_blob_sizes_at_exit=0'] + extra_args) - - device = core.DeviceOption(caffe2_pb2.CPU) - - with core.DeviceScope(device): - Benchmark(args) diff --git a/caffe2/python/examples/char_rnn.py b/caffe2/python/examples/char_rnn.py deleted file mode 100644 index 910e7818a6e8..000000000000 --- a/caffe2/python/examples/char_rnn.py +++ /dev/null @@ -1,276 +0,0 @@ -## @package char_rnn -# Module caffe2.python.examples.char_rnn - - - - - -from caffe2.python import core, workspace, model_helper, utils, brew -from caffe2.python.rnn_cell import LSTM -from caffe2.proto import caffe2_pb2 -from caffe2.python.optimizer import build_sgd - - -import argparse -import logging -import numpy as np -from datetime import datetime - -''' -This script takes a text file as input and uses a recurrent neural network -to learn to predict next character in a sequence. -''' - -logging.basicConfig() -log = logging.getLogger("char_rnn") -log.setLevel(logging.DEBUG) - - -# Default set() here is intentional as it would accumulate values like a global -# variable -def CreateNetOnce(net, created_names=set()): # noqa - name = net.Name() - if name not in created_names: - created_names.add(name) - workspace.CreateNet(net) - - -class CharRNN: - def __init__(self, args): - self.seq_length = args.seq_length - self.batch_size = args.batch_size - self.iters_to_report = args.iters_to_report - self.hidden_size = args.hidden_size - - with open(args.train_data) as f: - self.text = f.read() - - self.vocab = list(set(self.text)) - self.char_to_idx = {ch: idx for idx, ch in enumerate(self.vocab)} - self.idx_to_char = {idx: ch for idx, ch in enumerate(self.vocab)} - self.D = len(self.char_to_idx) - - print("Input has {} characters. 
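Reviewer note: the throughput numbers logged above are floored into thousands with one decimal by `x // 100 / 10`; for instance:

```python
rate = 123456.7           # embeddings generated per second
print(rate // 100 / 10)   # 123.4, logged as "123.4k"
```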
Total input size: {}".format( - len(self.vocab), len(self.text))) - - def CreateModel(self): - log.debug("Start training") - model = model_helper.ModelHelper(name="char_rnn") - - input_blob, seq_lengths, hidden_init, cell_init, target = \ - model.net.AddExternalInputs( - 'input_blob', - 'seq_lengths', - 'hidden_init', - 'cell_init', - 'target', - ) - - hidden_output_all, self.hidden_output, _, self.cell_state = LSTM( - model, input_blob, seq_lengths, (hidden_init, cell_init), - self.D, self.hidden_size, scope="LSTM") - output = brew.fc( - model, - hidden_output_all, - None, - dim_in=self.hidden_size, - dim_out=self.D, - axis=2 - ) - - # axis is 2 as first two are T (time) and N (batch size). - # We treat them as one big batch of size T * N - softmax = model.net.Softmax(output, 'softmax', axis=2) - - softmax_reshaped, _ = model.net.Reshape( - softmax, ['softmax_reshaped', '_'], shape=[-1, self.D]) - - # Create a copy of the current net. We will use it on the forward - # pass where we don't need loss and backward operators - self.forward_net = core.Net(model.net.Proto()) - - xent = model.net.LabelCrossEntropy([softmax_reshaped, target], 'xent') - # Loss is average both across batch and through time - # Thats why the learning rate below is multiplied by self.seq_length - loss = model.net.AveragedLoss(xent, 'loss') - model.AddGradientOperators([loss]) - - # use build_sdg function to build an optimizer - build_sgd( - model, - base_learning_rate=0.1 * self.seq_length, - policy="step", - stepsize=1, - gamma=0.9999 - ) - - self.model = model - self.predictions = softmax - self.loss = loss - - self.prepare_state = core.Net("prepare_state") - self.prepare_state.Copy(self.hidden_output, hidden_init) - self.prepare_state.Copy(self.cell_state, cell_init) - - def _idx_at_pos(self, pos): - return self.char_to_idx[self.text[pos]] - - def TrainModel(self): - log.debug("Training model") - - workspace.RunNetOnce(self.model.param_init_net) - - # As though we predict the same probability for each character - smooth_loss = -np.log(1.0 / self.D) * self.seq_length - last_n_iter = 0 - last_n_loss = 0.0 - num_iter = 0 - N = len(self.text) - - # We split text into batch_size pieces. Each piece will be used only - # by a corresponding batch during the training process - text_block_positions = np.zeros(self.batch_size, dtype=np.int32) - text_block_size = N // self.batch_size - text_block_starts = list(range(0, N, text_block_size)) - text_block_sizes = [text_block_size] * self.batch_size - text_block_sizes[self.batch_size - 1] += N % self.batch_size - assert sum(text_block_sizes) == N - - # Writing to output states which will be copied to input - # states within the loop below - workspace.FeedBlob(self.hidden_output, np.zeros( - [1, self.batch_size, self.hidden_size], dtype=np.float32 - )) - workspace.FeedBlob(self.cell_state, np.zeros( - [1, self.batch_size, self.hidden_size], dtype=np.float32 - )) - workspace.CreateNet(self.prepare_state) - - # We iterate over text in a loop many times. 
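Reviewer note: the `smooth_loss` seed above is the loss of a uniform predictor — `-log(1/D)` per character, summed over the sequence. Concretely:

```python
import numpy as np

D, seq_length = 65, 25                      # e.g. a 65-character vocabulary
smooth_loss = -np.log(1.0 / D) * seq_length
print(smooth_loss)                          # ~104.4
```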
Each time we peak - # seq_length segment and feed it to LSTM as a sequence - last_time = datetime.now() - progress = 0 - while True: - workspace.FeedBlob( - "seq_lengths", - np.array([self.seq_length] * self.batch_size, - dtype=np.int32) - ) - workspace.RunNet(self.prepare_state.Name()) - - input = np.zeros( - [self.seq_length, self.batch_size, self.D] - ).astype(np.float32) - target = np.zeros( - [self.seq_length * self.batch_size] - ).astype(np.int32) - - for e in range(self.batch_size): - for i in range(self.seq_length): - pos = text_block_starts[e] + text_block_positions[e] - input[i][e][self._idx_at_pos(pos)] = 1 - target[i * self.batch_size + e] =\ - self._idx_at_pos((pos + 1) % N) - text_block_positions[e] = ( - text_block_positions[e] + 1) % text_block_sizes[e] - progress += 1 - - workspace.FeedBlob('input_blob', input) - workspace.FeedBlob('target', target) - - CreateNetOnce(self.model.net) - workspace.RunNet(self.model.net.Name()) - - num_iter += 1 - last_n_iter += 1 - - if num_iter % self.iters_to_report == 0: - new_time = datetime.now() - print("Characters Per Second: {}". format( - int(progress / (new_time - last_time).total_seconds()) - )) - print("Iterations Per Second: {}". format( - int(self.iters_to_report / - (new_time - last_time).total_seconds()) - )) - - last_time = new_time - progress = 0 - - print("{} Iteration {} {}". - format('-' * 10, num_iter, '-' * 10)) - - loss = workspace.FetchBlob(self.loss) * self.seq_length - smooth_loss = 0.999 * smooth_loss + 0.001 * loss - last_n_loss += loss - - if num_iter % self.iters_to_report == 0: - self.GenerateText(500, np.random.choice(self.vocab)) - - log.debug("Loss since last report: {}" - .format(last_n_loss / last_n_iter)) - log.debug("Smooth loss: {}".format(smooth_loss)) - - last_n_loss = 0.0 - last_n_iter = 0 - - def GenerateText(self, num_characters, ch): - # Given a starting symbol we feed a fake sequence of size 1 to - # our RNN num_character times. After each time we use output - # probabilities to pick a next character to feed to the network. 
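Reviewer note: each step of the loop above builds a one-hot `T x N x D` input and a target that is the same character stream shifted by one; in miniature (batch of one, toy text):

```python
import numpy as np

text = "hello"
vocab = sorted(set(text))
idx = {ch: i for i, ch in enumerate(vocab)}

seq_length, D = len(text) - 1, len(vocab)
inp = np.zeros((seq_length, 1, D), dtype=np.float32)  # T x N x D
target = np.zeros(seq_length, dtype=np.int32)
for i in range(seq_length):
    inp[i, 0, idx[text[i]]] = 1    # current character, one-hot
    target[i] = idx[text[i + 1]]   # next character is the label
```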
- # Same character becomes part of the output - CreateNetOnce(self.forward_net) - - text = '' + ch - for _i in range(num_characters): - workspace.FeedBlob( - "seq_lengths", np.array([1] * self.batch_size, dtype=np.int32)) - workspace.RunNet(self.prepare_state.Name()) - - input = np.zeros([1, self.batch_size, self.D]).astype(np.float32) - input[0][0][self.char_to_idx[ch]] = 1 - - workspace.FeedBlob("input_blob", input) - workspace.RunNet(self.forward_net.Name()) - - p = workspace.FetchBlob(self.predictions) - next = np.random.choice(self.D, p=p[0][0]) - - ch = self.idx_to_char[next] - text += ch - - print(text) - - -@utils.debug -def main(): - parser = argparse.ArgumentParser( - description="Caffe2: Char RNN Training" - ) - parser.add_argument("--train_data", type=str, default=None, - help="Path to training data in a text file format", - required=True) - parser.add_argument("--seq_length", type=int, default=25, - help="One training example sequence length") - parser.add_argument("--batch_size", type=int, default=1, - help="Training batch size") - parser.add_argument("--iters_to_report", type=int, default=500, - help="How often to report loss and generate text") - parser.add_argument("--hidden_size", type=int, default=100, - help="Dimension of the hidden representation") - parser.add_argument("--gpu", action="store_true", - help="If set, training is going to use GPU 0") - - args = parser.parse_args() - - device = core.DeviceOption( - workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 0) - with core.DeviceScope(device): - model = CharRNN(args) - model.CreateModel() - model.TrainModel() - - -if __name__ == '__main__': - workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) - main() diff --git a/caffe2/python/examples/imagenet_trainer.py b/caffe2/python/examples/imagenet_trainer.py deleted file mode 100644 index 53c0c96c4667..000000000000 --- a/caffe2/python/examples/imagenet_trainer.py +++ /dev/null @@ -1,726 +0,0 @@ -# Module caffe2.python.examples.resnet50_trainer -import argparse -import logging -import numpy as np -import time -import os - -from caffe2.python import core, workspace, experiment_util, data_parallel_model -from caffe2.python import dyndep, optimizer -from caffe2.python import timeout_guard, model_helper, brew -from caffe2.proto import caffe2_pb2 - -import caffe2.python.models.resnet as resnet -import caffe2.python.models.shufflenet as shufflenet -from caffe2.python.modeling.initializers import Initializer, PseudoFP16Initializer -import caffe2.python.predictor.predictor_exporter as pred_exp -import caffe2.python.predictor.predictor_py_utils as pred_utils -from caffe2.python.predictor_constants import predictor_constants - -''' -Parallelized multi-GPU distributed trainer for Resne(X)t & Shufflenet. -Can be used to train on imagenet data, for example. -The default parameters can train a standard Resnet-50 (1x64d), and parameters -can be provided to train ResNe(X)t models (e.g., ResNeXt-101 32x4d). - -To run the trainer in single-machine multi-gpu mode by setting num_shards = 1. - -To run the trainer in multi-machine multi-gpu mode with M machines, -run the same program on all machines, specifying num_shards = M, and -shard_id = a unique integer in the set [0, M-1]. - -For rendezvous (the trainer processes have to know about each other), -you can either use a directory path that is visible to all processes -(e.g. NFS directory), or use a Redis instance. Use the former by -passing the `file_store_path` argument. 
Use the latter by passing the -`redis_host` and `redis_port` arguments. -''' - -logging.basicConfig() -log = logging.getLogger("Imagenet_trainer") -log.setLevel(logging.DEBUG) - -dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:file_store_handler_ops') -dyndep.InitOpsLibrary('@/caffe2/caffe2/distributed:redis_store_handler_ops') - - -def AddImageInput( - model, - reader, - batch_size, - img_size, - dtype, - is_test, - mean_per_channel=None, - std_per_channel=None, -): - ''' - The image input operator loads image and label data from the reader and - applies transformations to the images (random cropping, mirroring, ...). - ''' - data, label = brew.image_input( - model, - reader, ["data", "label"], - batch_size=batch_size, - output_type=dtype, - use_gpu_transform=True if core.IsGPUDeviceType(model._device_type) else False, - use_caffe_datum=True, - mean_per_channel=mean_per_channel, - std_per_channel=std_per_channel, - # mean_per_channel takes precedence over mean - mean=128., - std=128., - scale=256, - crop=img_size, - mirror=1, - is_test=is_test, - ) - - data = model.StopGradient(data, data) - - -def AddNullInput(model, reader, batch_size, img_size, dtype): - ''' - The null input function uses a gaussian fill operator to emulate real image - input. A label blob is hardcoded to a single value. This is useful if you - want to test compute throughput or don't have a dataset available. - ''' - suffix = "_fp16" if dtype == "float16" else "" - model.param_init_net.GaussianFill( - [], - ["data" + suffix], - shape=[batch_size, 3, img_size, img_size], - ) - if dtype == "float16": - model.param_init_net.FloatToHalf("data" + suffix, "data") - - model.param_init_net.ConstantFill( - [], - ["label"], - shape=[batch_size], - value=1, - dtype=core.DataType.INT32, - ) - - -def SaveModel(args, train_model, epoch, use_ideep): - prefix = "[]_{}".format(train_model._device_prefix, train_model._devices[0]) - predictor_export_meta = pred_exp.PredictorExportMeta( - predict_net=train_model.net.Proto(), - parameters=data_parallel_model.GetCheckpointParams(train_model), - inputs=[prefix + "/data"], - outputs=[prefix + "/softmax"], - shapes={ - prefix + "/softmax": (1, args.num_labels), - prefix + "/data": (args.num_channels, args.image_size, args.image_size) - } - ) - - # save the train_model for the current epoch - model_path = "%s/%s_%d.mdl" % ( - args.file_store_path, - args.save_model_name, - epoch, - ) - - # set db_type to be "minidb" instead of "log_file_db", which breaks - # the serialization in save_to_db. 
Need to switch back to log_file_db - # after migration - pred_exp.save_to_db( - db_type="minidb", - db_destination=model_path, - predictor_export_meta=predictor_export_meta, - use_ideep=use_ideep - ) - - -def LoadModel(path, model, use_ideep): - ''' - Load pretrained model from file - ''' - log.info("Loading path: {}".format(path)) - meta_net_def = pred_exp.load_from_db(path, 'minidb') - init_net = core.Net(pred_utils.GetNet( - meta_net_def, predictor_constants.GLOBAL_INIT_NET_TYPE)) - predict_init_net = core.Net(pred_utils.GetNet( - meta_net_def, predictor_constants.PREDICT_INIT_NET_TYPE)) - - if use_ideep: - predict_init_net.RunAllOnIDEEP() - else: - predict_init_net.RunAllOnGPU() - if use_ideep: - init_net.RunAllOnIDEEP() - else: - init_net.RunAllOnGPU() - - assert workspace.RunNetOnce(predict_init_net) - assert workspace.RunNetOnce(init_net) - - # Hack: fix iteration counter which is in CUDA context after load model - itercnt = workspace.FetchBlob("optimizer_iteration") - workspace.FeedBlob( - "optimizer_iteration", - itercnt, - device_option=core.DeviceOption(caffe2_pb2.CPU, 0) - ) - - -def RunEpoch( - args, - epoch, - train_model, - test_model, - total_batch_size, - num_shards, - expname, - explog, -): - ''' - Run one epoch of the trainer. - TODO: add checkpointing here. - ''' - # TODO: add loading from checkpoint - log.info("Starting epoch {}/{}".format(epoch, args.num_epochs)) - epoch_iters = int(args.epoch_size / total_batch_size / num_shards) - test_epoch_iters = int(args.test_epoch_size / total_batch_size / num_shards) - for i in range(epoch_iters): - # This timeout is required (temporarily) since CUDA-NCCL - # operators might deadlock when synchronizing between GPUs. - timeout = args.first_iter_timeout if i == 0 else args.timeout - with timeout_guard.CompleteInTimeOrDie(timeout): - t1 = time.time() - workspace.RunNet(train_model.net.Proto().name) - t2 = time.time() - dt = t2 - t1 - - fmt = "Finished iteration {}/{} of epoch {} ({:.2f} images/sec)" - log.info(fmt.format(i + 1, epoch_iters, epoch, total_batch_size / dt)) - prefix = "{}_{}".format( - train_model._device_prefix, - train_model._devices[0]) - accuracy = workspace.FetchBlob(prefix + '/accuracy') - loss = workspace.FetchBlob(prefix + '/loss') - train_fmt = "Training loss: {}, accuracy: {}" - log.info(train_fmt.format(loss, accuracy)) - - num_images = epoch * epoch_iters * total_batch_size - prefix = "{}_{}".format(train_model._device_prefix, train_model._devices[0]) - accuracy = workspace.FetchBlob(prefix + '/accuracy') - loss = workspace.FetchBlob(prefix + '/loss') - learning_rate = workspace.FetchBlob( - data_parallel_model.GetLearningRateBlobNames(train_model)[0] - ) - test_accuracy = 0 - test_accuracy_top5 = 0 - if test_model is not None: - # Run 100 iters of testing - ntests = 0 - for _ in range(test_epoch_iters): - workspace.RunNet(test_model.net.Proto().name) - for g in test_model._devices: - test_accuracy += workspace.FetchBlob( - "{}_{}".format(test_model._device_prefix, g) + '/accuracy' - ).item() - test_accuracy_top5 += workspace.FetchBlob( - "{}_{}".format(test_model._device_prefix, g) + '/accuracy_top5' - ).item() - ntests += 1 - test_accuracy /= ntests - test_accuracy_top5 /= ntests - else: - test_accuracy = (-1) - test_accuracy_top5 = (-1) - - explog.log( - input_count=num_images, - batch_count=(i + epoch * epoch_iters), - additional_values={ - 'accuracy': accuracy, - 'loss': loss, - 'learning_rate': learning_rate, - 'epoch': epoch, - 'top1_test_accuracy': test_accuracy, - 'top5_test_accuracy': 
test_accuracy_top5, - } - ) - assert loss < 40, "Exploded gradients :(" - - # TODO: add checkpointing - return epoch + 1 - - -def Train(args): - if args.model == "resnext": - model_name = "resnext" + str(args.num_layers) - elif args.model == "shufflenet": - model_name = "shufflenet" - - # Either use specified device list or generate one - if args.gpus is not None: - gpus = [int(x) for x in args.gpus.split(',')] - num_gpus = len(gpus) - else: - gpus = list(range(args.num_gpus)) - num_gpus = args.num_gpus - - log.info("Running on GPUs: {}".format(gpus)) - - # Verify valid batch size - total_batch_size = args.batch_size - batch_per_device = total_batch_size // num_gpus - assert \ - total_batch_size % num_gpus == 0, \ - "Number of GPUs must divide batch size" - - # Verify valid image mean/std per channel - if args.image_mean_per_channel: - assert \ - len(args.image_mean_per_channel) == args.num_channels, \ - "The number of channels of image mean doesn't match input" - - if args.image_std_per_channel: - assert \ - len(args.image_std_per_channel) == args.num_channels, \ - "The number of channels of image std doesn't match input" - - # Round down epoch size to closest multiple of batch size across machines - global_batch_size = total_batch_size * args.num_shards - epoch_iters = int(args.epoch_size / global_batch_size) - - assert \ - epoch_iters > 0, \ - "Epoch size must be larger than batch size times shard count" - - args.epoch_size = epoch_iters * global_batch_size - log.info("Using epoch size: {}".format(args.epoch_size)) - - # Create ModelHelper object - if args.use_ideep: - train_arg_scope = { - 'use_cudnn': False, - 'cudnn_exhaustive_search': False, - 'training_mode': 1 - } - else: - train_arg_scope = { - 'order': 'NCHW', - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - 'ws_nbytes_limit': (args.cudnn_workspace_limit_mb * 1024 * 1024), - } - train_model = model_helper.ModelHelper( - name=model_name, arg_scope=train_arg_scope - ) - - num_shards = args.num_shards - shard_id = args.shard_id - - # Expect interfaces to be comma separated. - # Use of multiple network interfaces is not yet complete, - # so simply use the first one in the list. 
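Reviewer note: the batch arithmetic above in one worked example (illustrative sizes) — the total batch must divide evenly across GPUs, and the epoch size is rounded down to a multiple of the global batch across machines:

```python
total_batch_size, num_gpus, num_shards = 256, 8, 4  # illustrative
assert total_batch_size % num_gpus == 0
batch_per_device = total_batch_size // num_gpus     # 32

global_batch_size = total_batch_size * num_shards   # 1024
epoch_iters = 1_500_000 // global_batch_size        # 1464 (the epoch_size default)
epoch_size = epoch_iters * global_batch_size        # 1,499,136, rounded down
```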
- interfaces = args.distributed_interfaces.split(",") - - # Rendezvous using MPI when run with mpirun - if os.getenv("OMPI_COMM_WORLD_SIZE") is not None: - num_shards = int(os.getenv("OMPI_COMM_WORLD_SIZE", 1)) - shard_id = int(os.getenv("OMPI_COMM_WORLD_RANK", 0)) - if num_shards > 1: - rendezvous = dict( - kv_handler=None, - num_shards=num_shards, - shard_id=shard_id, - engine="GLOO", - transport=args.distributed_transport, - interface=interfaces[0], - mpi_rendezvous=True, - exit_nets=None) - - elif num_shards > 1: - # Create rendezvous for distributed computation - store_handler = "store_handler" - if args.redis_host is not None: - # Use Redis for rendezvous if Redis host is specified - workspace.RunOperatorOnce( - core.CreateOperator( - "RedisStoreHandlerCreate", [], [store_handler], - host=args.redis_host, - port=args.redis_port, - prefix=args.run_id, - ) - ) - else: - # Use filesystem for rendezvous otherwise - workspace.RunOperatorOnce( - core.CreateOperator( - "FileStoreHandlerCreate", [], [store_handler], - path=args.file_store_path, - prefix=args.run_id, - ) - ) - - rendezvous = dict( - kv_handler=store_handler, - shard_id=shard_id, - num_shards=num_shards, - engine="GLOO", - transport=args.distributed_transport, - interface=interfaces[0], - exit_nets=None) - - else: - rendezvous = None - - # Model building functions - def create_resnext_model_ops(model, loss_scale): - initializer = (PseudoFP16Initializer if args.dtype == 'float16' - else Initializer) - - with brew.arg_scope([brew.conv, brew.fc], - WeightInitializer=initializer, - BiasInitializer=initializer, - enable_tensor_core=args.enable_tensor_core, - float16_compute=args.float16_compute): - pred = resnet.create_resnext( - model, - "data", - num_input_channels=args.num_channels, - num_labels=args.num_labels, - num_layers=args.num_layers, - num_groups=args.resnext_num_groups, - num_width_per_group=args.resnext_width_per_group, - no_bias=True, - no_loss=True, - ) - - if args.dtype == 'float16': - pred = model.net.HalfToFloat(pred, pred + '_fp32') - - softmax, loss = model.SoftmaxWithLoss([pred, 'label'], - ['softmax', 'loss']) - loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) - brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) - return [loss] - - def create_shufflenet_model_ops(model, loss_scale): - initializer = (PseudoFP16Initializer if args.dtype == 'float16' - else Initializer) - - with brew.arg_scope([brew.conv, brew.fc], - WeightInitializer=initializer, - BiasInitializer=initializer, - enable_tensor_core=args.enable_tensor_core, - float16_compute=args.float16_compute): - pred = shufflenet.create_shufflenet( - model, - "data", - num_input_channels=args.num_channels, - num_labels=args.num_labels, - no_loss=True, - ) - - if args.dtype == 'float16': - pred = model.net.HalfToFloat(pred, pred + '_fp32') - - softmax, loss = model.SoftmaxWithLoss([pred, 'label'], - ['softmax', 'loss']) - loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy", top_k=1) - brew.accuracy(model, [softmax, "label"], "accuracy_top5", top_k=5) - return [loss] - - def add_optimizer(model): - stepsz = int(30 * args.epoch_size / total_batch_size / num_shards) - - if args.float16_compute: - # TODO: merge with multi-precision optimizer - opt = optimizer.build_fp16_sgd( - model, - args.base_learning_rate, - momentum=0.9, - nesterov=1, - weight_decay=args.weight_decay, # weight decay included - policy="step", - stepsize=stepsz, - gamma=0.1 - ) 
- else: - optimizer.add_weight_decay(model, args.weight_decay) - opt = optimizer.build_multi_precision_sgd( - model, - args.base_learning_rate, - momentum=0.9, - nesterov=1, - policy="step", - stepsize=stepsz, - gamma=0.1 - ) - return opt - - # Define add_image_input function. - # Depends on the "train_data" argument. - # Note that the reader will be shared with between all GPUS. - if args.train_data == "null": - def add_image_input(model): - AddNullInput( - model, - None, - batch_size=batch_per_device, - img_size=args.image_size, - dtype=args.dtype, - ) - else: - reader = train_model.CreateDB( - "reader", - db=args.train_data, - db_type=args.db_type, - num_shards=num_shards, - shard_id=shard_id, - ) - - def add_image_input(model): - AddImageInput( - model, - reader, - batch_size=batch_per_device, - img_size=args.image_size, - dtype=args.dtype, - is_test=False, - mean_per_channel=args.image_mean_per_channel, - std_per_channel=args.image_std_per_channel, - ) - - def add_post_sync_ops(model): - """Add ops applied after initial parameter sync.""" - for param_info in model.GetOptimizationParamInfo(model.GetParams()): - if param_info.blob_copy is not None: - model.param_init_net.HalfToFloat( - param_info.blob, - param_info.blob_copy[core.DataType.FLOAT] - ) - - data_parallel_model.Parallelize( - train_model, - input_builder_fun=add_image_input, - forward_pass_builder_fun=create_resnext_model_ops - if args.model == "resnext" else create_shufflenet_model_ops, - optimizer_builder_fun=add_optimizer, - post_sync_builder_fun=add_post_sync_ops, - devices=gpus, - rendezvous=rendezvous, - optimize_gradient_memory=False, - use_nccl=args.use_nccl, - cpu_device=args.use_cpu, - ideep=args.use_ideep, - shared_model=args.use_cpu, - combine_spatial_bn=args.use_cpu, - ) - - data_parallel_model.OptimizeGradientMemory(train_model, {}, set(), False) - - workspace.RunNetOnce(train_model.param_init_net) - workspace.CreateNet(train_model.net) - - # Add test model, if specified - test_model = None - if (args.test_data is not None): - log.info("----- Create test net ----") - if args.use_ideep: - test_arg_scope = { - 'use_cudnn': False, - 'cudnn_exhaustive_search': False, - } - else: - test_arg_scope = { - 'order': "NCHW", - 'use_cudnn': True, - 'cudnn_exhaustive_search': True, - } - test_model = model_helper.ModelHelper( - name=model_name + "_test", - arg_scope=test_arg_scope, - init_params=False, - ) - - test_reader = test_model.CreateDB( - "test_reader", - db=args.test_data, - db_type=args.db_type, - ) - - def test_input_fn(model): - AddImageInput( - model, - test_reader, - batch_size=batch_per_device, - img_size=args.image_size, - dtype=args.dtype, - is_test=True, - mean_per_channel=args.image_mean_per_channel, - std_per_channel=args.image_std_per_channel, - ) - - data_parallel_model.Parallelize( - test_model, - input_builder_fun=test_input_fn, - forward_pass_builder_fun=create_resnext_model_ops - if args.model == "resnext" else create_shufflenet_model_ops, - post_sync_builder_fun=add_post_sync_ops, - param_update_builder_fun=None, - devices=gpus, - use_nccl=args.use_nccl, - cpu_device=args.use_cpu, - ) - workspace.RunNetOnce(test_model.param_init_net) - workspace.CreateNet(test_model.net) - - epoch = 0 - # load the pre-trained model and reset epoch - if args.load_model_path is not None: - LoadModel(args.load_model_path, train_model, args.use_ideep) - - # Sync the model params - data_parallel_model.FinalizeAfterCheckpoint(train_model) - - # reset epoch. 
load_model_path should end with *_X.mdl, - # where X is the epoch number - last_str = args.load_model_path.split('_')[-1] - if last_str.endswith('.mdl'): - epoch = int(last_str[:-4]) - log.info("Reset epoch to {}".format(epoch)) - else: - log.warning("The format of load_model_path doesn't match!") - - expname = "%s_gpu%d_b%d_L%d_lr%.2f_v2" % ( - model_name, - args.num_gpus, - total_batch_size, - args.num_labels, - args.base_learning_rate, - ) - - explog = experiment_util.ModelTrainerLog(expname, args) - - # Run the training one epoch a time - while epoch < args.num_epochs: - epoch = RunEpoch( - args, - epoch, - train_model, - test_model, - total_batch_size, - num_shards, - expname, - explog - ) - - # Save the model for each epoch - SaveModel(args, train_model, epoch, args.use_ideep) - - model_path = "%s/%s_" % ( - args.file_store_path, - args.save_model_name - ) - # remove the saved model from the previous epoch if it exists - if os.path.isfile(model_path + str(epoch - 1) + ".mdl"): - os.remove(model_path + str(epoch - 1) + ".mdl") - - -def main(): - # TODO: use argv - parser = argparse.ArgumentParser( - description="Caffe2: ImageNet Trainer" - ) - parser.add_argument("--train_data", type=str, default=None, required=True, - help="Path to training data (or 'null' to simulate)") - parser.add_argument("--num_layers", type=int, default=50, - help="The number of layers in ResNe(X)t model") - parser.add_argument("--resnext_num_groups", type=int, default=1, - help="The cardinality of resnext") - parser.add_argument("--resnext_width_per_group", type=int, default=64, - help="The cardinality of resnext") - parser.add_argument("--test_data", type=str, default=None, - help="Path to test data") - parser.add_argument("--image_mean_per_channel", type=float, nargs='+', - help="The per channel mean for the images") - parser.add_argument("--image_std_per_channel", type=float, nargs='+', - help="The per channel standard deviation for the images") - parser.add_argument("--test_epoch_size", type=int, default=50000, - help="Number of test images") - parser.add_argument("--db_type", type=str, default="lmdb", - help="Database type (such as lmdb or leveldb)") - parser.add_argument("--gpus", type=str, - help="Comma separated list of GPU devices to use") - parser.add_argument("--num_gpus", type=int, default=1, - help="Number of GPU devices (instead of --gpus)") - parser.add_argument("--num_channels", type=int, default=3, - help="Number of color channels") - parser.add_argument("--image_size", type=int, default=224, - help="Input image size (to crop to)") - parser.add_argument("--num_labels", type=int, default=1000, - help="Number of labels") - parser.add_argument("--batch_size", type=int, default=32, - help="Batch size, total over all GPUs") - parser.add_argument("--epoch_size", type=int, default=1500000, - help="Number of images/epoch, total over all machines") - parser.add_argument("--num_epochs", type=int, default=1000, - help="Num epochs.") - parser.add_argument("--base_learning_rate", type=float, default=0.1, - help="Initial learning rate.") - parser.add_argument("--weight_decay", type=float, default=1e-4, - help="Weight decay (L2 regularization)") - parser.add_argument("--cudnn_workspace_limit_mb", type=int, default=64, - help="CuDNN workspace limit in MBs") - parser.add_argument("--num_shards", type=int, default=1, - help="Number of machines in distributed run") - parser.add_argument("--shard_id", type=int, default=0, - help="Shard id.") - parser.add_argument("--run_id", type=str, - help="Unique run 
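Reviewer note: the resume convention above encodes the epoch in the checkpoint name as a trailing `_X.mdl`, recovered by string surgery:

```python
load_model_path = "/tmp/resnext_model_7.mdl"  # hypothetical checkpoint
last_str = load_model_path.split('_')[-1]     # "7.mdl"
epoch = int(last_str[:-4]) if last_str.endswith('.mdl') else 0
print(epoch)                                  # 7
```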
identifier (e.g. uuid)") - parser.add_argument("--redis_host", type=str, - help="Host of Redis server (for rendezvous)") - parser.add_argument("--redis_port", type=int, default=6379, - help="Port of Redis server (for rendezvous)") - parser.add_argument("--file_store_path", type=str, default="/tmp", - help="Path to directory to use for rendezvous") - parser.add_argument("--save_model_name", type=str, default="resnext_model", - help="Save the trained model to a given name") - parser.add_argument("--load_model_path", type=str, default=None, - help="Load previously saved model to continue training") - parser.add_argument("--use_cpu", action="store_true", - help="Use CPU instead of GPU") - parser.add_argument("--use_nccl", action="store_true", - help="Use nccl for inter-GPU collectives") - parser.add_argument("--use_ideep", type=bool, default=False, - help="Use ideep") - parser.add_argument('--dtype', default='float', - choices=['float', 'float16'], - help='Data type used for training') - parser.add_argument('--float16_compute', action='store_true', - help="Use float 16 compute, if available") - parser.add_argument('--enable_tensor_core', action='store_true', - help='Enable Tensor Core math for Conv and FC ops') - parser.add_argument("--distributed_transport", type=str, default="tcp", - help="Transport to use for distributed run [tcp|ibverbs]") - parser.add_argument("--distributed_interfaces", type=str, default="", - help="Network interfaces to use for distributed run") - - parser.add_argument("--first_iter_timeout", type=int, default=1200, - help="Timeout (secs) of the first iteration " - "(default: %(default)s)") - parser.add_argument("--timeout", type=int, default=60, - help="Timeout (secs) of each (except the first) iteration " - "(default: %(default)s)") - parser.add_argument("--model", - default="resnext", const="resnext", nargs="?", - choices=["shufflenet", "resnext"], - help="List of models which can be run") - args = parser.parse_args() - - Train(args) - - -if __name__ == '__main__': - workspace.GlobalInit(['caffe2', '--caffe2_log_level=2']) - main() diff --git a/caffe2/python/examples/lmdb_create_example.py b/caffe2/python/examples/lmdb_create_example.py deleted file mode 100644 index af56069a7be0..000000000000 --- a/caffe2/python/examples/lmdb_create_example.py +++ /dev/null @@ -1,107 +0,0 @@ -## @package lmdb_create_example -# Module caffe2.python.examples.lmdb_create_example - - - - - -import argparse -import numpy as np - -import lmdb -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace, model_helper - -''' -Simple example to create an lmdb database of random image data and labels. -This can be used a skeleton to write your own data import. - -It also runs a dummy-model with Caffe2 that reads the data and -validates the checksum is same. -''' - - -def create_db(output_file): - print(">>> Write database...") - LMDB_MAP_SIZE = 1 << 40 # MODIFY - env = lmdb.open(output_file, map_size=LMDB_MAP_SIZE) - - checksum = 0 - with env.begin(write=True) as txn: - for j in range(0, 128): - # MODIFY: add your own data reader / creator - label = j % 10 - width = 64 - height = 32 - - img_data = np.random.rand(3, width, height) - # ... 
- - # Create TensorProtos - tensor_protos = caffe2_pb2.TensorProtos() - img_tensor = tensor_protos.protos.add() - img_tensor.dims.extend(img_data.shape) - img_tensor.data_type = 1 - - flatten_img = img_data.reshape(np.prod(img_data.shape)) - img_tensor.float_data.extend(flatten_img) - - label_tensor = tensor_protos.protos.add() - label_tensor.data_type = 2 - label_tensor.int32_data.append(label) - txn.put( - '{}'.format(j).encode('ascii'), - tensor_protos.SerializeToString() - ) - - checksum += np.sum(img_data) * label - if (j % 16 == 0): - print("Inserted {} rows".format(j)) - - print("Checksum/write: {}".format(int(checksum))) - return checksum - - -def read_db_with_caffe2(db_file, expected_checksum): - print(">>> Read database...") - model = model_helper.ModelHelper(name="lmdbtest") - batch_size = 32 - data, label = model.TensorProtosDBInput( - [], ["data", "label"], batch_size=batch_size, - db=db_file, db_type="lmdb") - - checksum = 0 - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - for _ in range(0, 4): - workspace.RunNet(model.net.Proto().name) - - img_datas = workspace.FetchBlob("data") - labels = workspace.FetchBlob("label") - for j in range(batch_size): - checksum += np.sum(img_datas[j, :]) * labels[j] - - print("Checksum/read: {}".format(int(checksum))) - assert np.abs(expected_checksum - checksum < 0.1), \ - "Read/write checksums dont match" - - -def main(): - parser = argparse.ArgumentParser( - description="Example LMDB creation" - ) - parser.add_argument("--output_file", type=str, default=None, - help="Path to write the database to", - required=True) - - args = parser.parse_args() - checksum = create_db(args.output_file) - - # For testing reading: - read_db_with_caffe2(args.output_file, checksum) - - -if __name__ == '__main__': - main() diff --git a/caffe2/python/examples/resnet50_trainer.py b/caffe2/python/examples/resnet50_trainer.py deleted file mode 120000 index 19a82c641954..000000000000 --- a/caffe2/python/examples/resnet50_trainer.py +++ /dev/null @@ -1 +0,0 @@ -imagenet_trainer.py \ No newline at end of file diff --git a/caffe2/python/experiment_util.py b/caffe2/python/experiment_util.py deleted file mode 100644 index 2821ec1ff42b..000000000000 --- a/caffe2/python/experiment_util.py +++ /dev/null @@ -1,113 +0,0 @@ -## @package experiment_util -# Module caffe2.python.experiment_util - - - - - -import datetime -import time -import logging -import socket -import abc - -from collections import OrderedDict - -''' -Utilities for logging experiment run stats, such as accuracy -and loss over time for different runs. Runtime arguments are stored -in the log. - -Optionally, ModelTrainerLog calls out to a logger to log to -an external log destination. -''' - - -class ExternalLogger: - __metaclass__ = abc.ABCMeta - - @abc.abstractmethod - def set_runtime_args(self, runtime_args): - """ - Set runtime arguments for the logger. - runtime_args: dict of runtime arguments. 
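Reviewer note: one quirk worth flagging in the deleted reader — its final assert puts the comparison inside `np.abs`, i.e. `np.abs(expected_checksum - checksum < 0.1)`, so it only catches checksums that undershoot the expected value by 0.1 or more. The intended two-sided check is:

```python
import numpy as np

expected_checksum, checksum = 10473.0, 10473.04  # illustrative values
assert np.abs(expected_checksum - checksum) < 0.1, \
    "Read/write checksums dont match"
```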
- """ - raise NotImplementedError( - 'Must define set_runtime_args function to use this base class' - ) - - @abc.abstractmethod - def log(self, log_dict): - """ - log a dict of key/values to an external destination - log_dict: input dict - """ - raise NotImplementedError( - 'Must define log function to use this base class' - ) - - -class ModelTrainerLog(): - - def __init__(self, expname, runtime_args, external_loggers=None): - now = datetime.datetime.fromtimestamp(time.time()) - self.experiment_id = \ - "{}_{}".format(expname, now.strftime('%Y%m%d_%H%M%S')) - self.filename = "{}.log".format(self.experiment_id) - self.logstr("# %s" % str(runtime_args)) - self.headers = None - self.start_time = time.time() - self.last_time = self.start_time - self.last_input_count = 0 - self.external_loggers = None - - if external_loggers is not None: - self.external_loggers = external_loggers - if not isinstance(runtime_args, dict): - runtime_args = dict(vars(runtime_args)) - runtime_args['experiment_id'] = self.experiment_id - runtime_args['hostname'] = socket.gethostname() - for logger in self.external_loggers: - logger.set_runtime_args(runtime_args) - else: - self.external_loggers = [] - - def logstr(self, str): - with open(self.filename, "a") as f: - f.write(str + "\n") - f.close() - logging.getLogger("experiment_logger").info(str) - - def log(self, input_count, batch_count, additional_values): - logdict = OrderedDict() - delta_t = time.time() - self.last_time - delta_count = input_count - self.last_input_count - self.last_time = time.time() - self.last_input_count = input_count - - logdict['time_spent'] = delta_t - logdict['cumulative_time_spent'] = time.time() - self.start_time - logdict['input_count'] = delta_count - logdict['cumulative_input_count'] = input_count - logdict['cumulative_batch_count'] = batch_count - if delta_t > 0: - logdict['inputs_per_sec'] = delta_count / delta_t - else: - logdict['inputs_per_sec'] = 0.0 - - for k in sorted(additional_values.keys()): - logdict[k] = additional_values[k] - - # Write the headers if they are not written yet - if self.headers is None: - self.headers = list(logdict.keys()) - self.logstr(",".join(self.headers)) - - self.logstr(",".join(str(v) for v in logdict.values())) - - for logger in self.external_loggers: - try: - logger.log(logdict) - except Exception as e: - logging.warning( - "Failed to call ExternalLogger: {}".format(e), e) diff --git a/caffe2/python/extension_loader.py b/caffe2/python/extension_loader.py deleted file mode 100644 index 06c6707dcce9..000000000000 --- a/caffe2/python/extension_loader.py +++ /dev/null @@ -1,29 +0,0 @@ -## @package extension_loader -# Module caffe2.python.extension_loader - - - - -import contextlib -import ctypes -import sys - - -_set_global_flags = ( - hasattr(sys, 'getdlopenflags') and hasattr(sys, 'setdlopenflags')) - - -@contextlib.contextmanager -def DlopenGuard(extra_flags=ctypes.RTLD_GLOBAL): - if _set_global_flags: - old_flags = sys.getdlopenflags() - sys.setdlopenflags(old_flags | extra_flags) - - # in case we dlopen something that doesn't exist, yield will fail and throw; - # we need to remember reset the old flags to clean up, otherwise RTLD_GLOBAL - # flag will stick around and create symbol conflict problems - try: - yield - finally: - if _set_global_flags: - sys.setdlopenflags(old_flags) diff --git a/caffe2/python/fakefp16_transform_lib.py b/caffe2/python/fakefp16_transform_lib.py deleted file mode 100644 index c3f142061479..000000000000 --- a/caffe2/python/fakefp16_transform_lib.py +++ /dev/null @@ 
-1,16 +0,0 @@ -#!/usr/bin/env python3 - - - - -import caffe2.python._import_c_extension as C -from caffe2.proto.caffe2_pb2 import NetDef - -def fakeFp16FuseOps(net: NetDef) -> NetDef: - net_str = net.SerializeToString() - - out_str = C.fakeFp16FuseOps(net_str) - out_net = NetDef() - out_net.ParseFromString(out_str) - - return out_net diff --git a/caffe2/python/fakelowp/__init__.py b/caffe2/python/fakelowp/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/fakelowp/init_shared_libs.py b/caffe2/python/fakelowp/init_shared_libs.py deleted file mode 100644 index 889b2bcab520..000000000000 --- a/caffe2/python/fakelowp/init_shared_libs.py +++ /dev/null @@ -1,9 +0,0 @@ - - -import ctypes -import os - -if 'OSS_ONNXIFI_LIB' in os.environ: - lib = os.environ['OSS_ONNXIFI_LIB'] - print("Loading ONNXIFI lib: {}".format(lib)) - ctypes.CDLL(lib, ctypes.RTLD_GLOBAL) diff --git a/caffe2/python/fakelowp/test_utils.py b/caffe2/python/fakelowp/test_utils.py deleted file mode 100644 index 4a31a92e5bce..000000000000 --- a/caffe2/python/fakelowp/test_utils.py +++ /dev/null @@ -1,65 +0,0 @@ - - - - - -import sys -import numpy as np - -def print_test_debug_info(testname, items_dict): - filename = "debug_operator_onnxifi_" + testname + ".txt" - np.set_printoptions(threshold=sys.maxsize) - with open(filename, 'w') as f: - for key, value in items_dict.items(): - print(key, value) - f.write("{}\n".format(key)) - f.write("{}\n".format(value)) - -def print_net(net): - for i in net.external_input: - print("Input: {}".format(i)) - for i in net.external_output: - print("Output: {}".format(i)) - for op in net.op: - print("Op {}".format(op.type)) - for x in op.input: - print(" input: {}".format(x)) - for y in op.output: - print(" output: {}".format(y)) - -def _sigmoid(x): - return 1. / (1. + np.exp(np.float64(-x))) - -def _tanh(x): - return np.tanh(np.float64(x)) - -def _swish(x): - return np.float64(x) * _sigmoid(x) - -def _gelu_by_sigmoid(x): - return np.float64(x) / (1. + np.exp(np.float64(-x) * 1.702)) - - -def _acc_func(opname, x): - if opname == "Swish": - return _swish(x) - elif opname == "Sigmoid": - return _sigmoid(x) - elif opname == "Tanh": - return _tanh(x) - elif opname == "Gelu": - return _gelu_by_sigmoid(x) - else: - return x - -def _get_ulp16(x): - abs_x = np.abs(x) - mask = (abs_x > 2.**(-14)) - abs_x = mask * abs_x + (1 - mask) * 2.**(-14) - k = np.floor(np.log2(abs_x)) - return 2.**(k - 10) - -def compute_ulp_error(opname, xvec, y_nnpi): - y_acc = _acc_func(opname, np.float64(xvec)) - scale = 1. / _get_ulp16(y_acc) - return (y_nnpi - y_acc) * scale diff --git a/caffe2/python/filler_test.py b/caffe2/python/filler_test.py deleted file mode 100644 index 9aff384e99af..000000000000 --- a/caffe2/python/filler_test.py +++ /dev/null @@ -1,20 +0,0 @@ - - - - -from caffe2.python import core, test_util, workspace - - -class TestFiller(test_util.TestCase): - def test_filler(self): - net = core.Net("test_filler") - net.Concat(["X0", "X1", "X2"], ["concat_out", "split_info"]) - self.assertFalse(workspace.HasBlob("X0")) - input_dim = (30, 20) - workspace.FillRandomNetworkInputs(net, [[input_dim, input_dim, input_dim]], [["float", "float", "float"]]) - self.assertTrue(workspace.HasBlob("X0")) - self.assertEqual(workspace.FetchBlob("X0").shape, input_dim) - - with self.assertRaises(RuntimeError): - # Filler should throw if number of input dims/types is mismatched.
- workspace.FillRandomNetworkInputs(net, [[input_dim]], [["float"]]) diff --git a/caffe2/python/functional.py b/caffe2/python/functional.py deleted file mode 100644 index 26a4dbab2b3b..000000000000 --- a/caffe2/python/functional.py +++ /dev/null @@ -1,113 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 -from caffe2.python.onnx.workspace import Workspace -from collections import namedtuple - -OpSchema = workspace.C.OpSchema - - -def namedtupledict(typename, field_names, *args, **kwargs): - field_names_map = {n: i for i, n in enumerate(field_names)} - # Some output names are invalid python identifier, e.g. "0" - kwargs.setdefault('rename', True) - data = namedtuple(typename, field_names, *args, **kwargs) - - def getitem(self, key): - if isinstance(key, str): - key = field_names_map[key] - return super(type(self), self).__getitem__(key) - - data.__getitem__ = getitem - return data - - -class _Functional: - def __getattribute__(self, op_type): - def op_func(*inputs, **args): - ws = Workspace() - schema = OpSchema.get(op_type) - input_prefix = 'input_' - output_prefix = 'output_' - - def get_name_list(prefix, num, max_num): - return [prefix + str(x) for x in range(min(num, max_num))] - - input_names, output_names = [], [] - input_names = get_name_list( - input_prefix, len(inputs), schema.max_input - ) - # verify the length of input name is in range - # of schema - num_input = len(input_names) - if num_input > schema.max_input or num_input < \ - schema.min_input or not schema.num_inputs_allowed(num_input): - raise ValueError( - "Functional C2: Number of inputs not in \ - range: {} - {} or not allowed." - .format(schema.min_input, schema.max_input) - ) - - if 'num_output' in args: - num_output = args['num_output'] - if num_output > schema.max_output or \ - num_output < schema.min_output or \ - not schema.num_outputs_allowed(num_output) or \ - not schema.num_inputs_outputs_allowed(num_input, - num_output): - raise ValueError( - "Functional C2: Number of output \ - not in range: {} - {} or not allowed" - .format(schema.min_output, schema.max_output) - ) - output_names = get_name_list( - output_prefix, num_output, schema.max_output - ) - args.pop('num_output') - calculated = schema.CalculateOutput(num_input) - if not output_names and calculated != -1: - output_names = get_name_list( - output_prefix, calculated, schema.max_output - ) - - if not output_names: - max_output = schema.max_output - # For an op with max_output == inf - # and no Output defined in schema - # user should pass output_size explicitly - if schema.inf == max_output: - raise ValueError( - "For operators with max_output == inf,\ - user should pass num_output explicitly." 
- ) - output_names = get_name_list( - output_prefix, max_output, max_output - ) - - # There could be input-output inplace enforcement; replace the - # output names with input ones if such enforcements exist - for i in range(len(input_names)): - for j in range(len(output_names)): - if schema.inplace_enforced(i, j): - output_names[j] = input_names[i] - - op = core.CreateOperator( - op_type, input_names, output_names, **args - ) - device_option = args.get('device_option', core.DeviceOption(caffe2_pb2.CPU)) - with core.DeviceScope(device_option): - for i, input_blob in enumerate(inputs): - ws.FeedBlob(input_names[i], input_blob) - # RunOperator - ws.RunOperatorOnce(op) - output_values = [ws.FetchBlob(x) for x in output_names] - return namedtupledict('output', output_names)(*output_values) - - return op_func - - -Functional = _Functional() diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py deleted file mode 100644 index d90943761aa4..000000000000 --- a/caffe2/python/functional_test.py +++ /dev/null @@ -1,122 +0,0 @@ - - - - - -import unittest - -from caffe2.python import core -from hypothesis import given -import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import workspace -from caffe2.python.functional import Functional -import numpy as np - - -@st.composite -def _tensor_splits(draw, add_axis=False): - """Generates (axis, split_info, tensor_splits) tuples.""" - tensor = draw(hu.tensor(min_value=4)) # Each dim has at least 4 elements. - axis = draw(st.integers(0, len(tensor.shape) - 1)) - if add_axis: - # Simple case: get individual slices along one axis, where each of them - # is (N-1)-dimensional. The axis will be added back upon concatenation. - return ( - axis, np.ones(tensor.shape[axis], dtype=np.int32), [ - np.array(tensor.take(i, axis=axis)) - for i in range(tensor.shape[axis]) - ] - ) - else: - # General case: pick some (possibly consecutive, even non-unique) - # indices at which we will split the tensor, along the given axis. - splits = sorted( - draw( - st. 
lists(elements=st.integers(0, tensor.shape[axis]), max_size=4) - ) + [0, tensor.shape[axis]] - ) - return ( - axis, np.array(np.diff(splits), dtype=np.int32), [ - tensor.take(range(splits[i], splits[i + 1]), axis=axis) - for i in range(len(splits) - 1) - ], - ) - - -class TestFunctional(hu.HypothesisTestCase): - @given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) - def test_relu(self, X, engine, gc, dc): - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - output = Functional.Relu(X, device_option=gc) - Y_l = output[0] - Y_d = output["output_0"] - - with workspace.WorkspaceGuard("tmp_workspace"): - op = core.CreateOperator("Relu", ["X"], ["Y"], engine=engine) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(op) - Y_ref = workspace.FetchBlob("Y") - - np.testing.assert_array_equal( - Y_l, Y_ref, err_msg='Functional Relu result mismatch' - ) - - np.testing.assert_array_equal( - Y_d, Y_ref, err_msg='Functional Relu result mismatch' - ) - - @given(tensor_splits=_tensor_splits(), **hu.gcs) - def test_concat(self, tensor_splits, gc, dc): - # Input Size: 1 -> inf - axis, _, splits = tensor_splits - concat_result, split_info = Functional.Concat(*splits, axis=axis, device_option=gc) - - concat_result_ref = np.concatenate(splits, axis=axis) - split_info_ref = np.array([a.shape[axis] for a in splits]) - - np.testing.assert_array_equal( - concat_result, - concat_result_ref, - err_msg='Functional Concat result mismatch' - ) - - np.testing.assert_array_equal( - split_info, - split_info_ref, - err_msg='Functional Concat split info mismatch' - ) - - @given(tensor_splits=_tensor_splits(), split_as_arg=st.booleans(), **hu.gcs) - def test_split(self, tensor_splits, split_as_arg, gc, dc): - # Output Size: 1 - inf - axis, split_info, splits = tensor_splits - - if split_as_arg: - input_tensors = [np.concatenate(splits, axis=axis)] - kwargs = dict(axis=axis, split=split_info, num_output=len(splits)) - else: - input_tensors = [np.concatenate(splits, axis=axis), split_info] - kwargs = dict(axis=axis, num_output=len(splits)) - result = Functional.Split(*input_tensors, device_option=gc, **kwargs) - - def split_ref(input, split=split_info): - s = np.cumsum([0] + list(split)) - return [ - np.array(input.take(np.arange(s[i], s[i + 1]), axis=axis)) - for i in range(len(split)) - ] - - result_ref = split_ref(*input_tensors) - for i, ref in enumerate(result_ref): - np.testing.assert_array_equal( - result[i], ref, err_msg='Functional Split result mismatch' - ) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py b/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py deleted file mode 100644 index 97b0ce44e86b..000000000000 --- a/caffe2/python/fused_8bit_rowwise_conversion_ops_test.py +++ /dev/null @@ -1,106 +0,0 @@ - - - - - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - -import numpy as np -import struct -from hypothesis import given - -# Eigen/Python round 0.5 away from 0, Numpy rounds to even -round_to_nearest = np.vectorize(round) - - -def bytes_to_floats(byte_matrix): - floats = np.empty([np.shape(byte_matrix)[0], 1], dtype=np.float32) - for i, byte_values in enumerate(byte_matrix): - floats[i], = struct.unpack('f', bytearray(byte_values)) - return floats - - -def floats_to_bytes(floats): - byte_matrix = np.empty([np.shape(floats)[0], 4], dtype=np.uint8) - for i, value in enumerate(floats): - assert isinstance(value, np.float32), (value, floats) - as_bytes =
struct.pack('f', value) - # In Python3 bytes will be a list of int, in Python2 a list of string - if isinstance(as_bytes[0], int): - byte_matrix[i] = list(as_bytes) - else: - byte_matrix[i] = [ord(i) for i in as_bytes] - return byte_matrix - - -def fused_rowwise_8bit_quantize_reference(data): - minimum = np.min(data, axis=-1, keepdims=True) - maximum = np.max(data, axis=-1, keepdims=True) - span = maximum - minimum - bias = minimum - scale = span / 255.0 - inverse_scale = 255.0 / (span + 1e-8) - quantized_data = round_to_nearest((data - bias) * inverse_scale) - scale_bytes = floats_to_bytes(scale.reshape(-1)) - scale_bytes = scale_bytes.reshape(data.shape[:-1] + (scale_bytes.shape[-1],)) - bias_bytes = floats_to_bytes(bias.reshape(-1)) - bias_bytes = bias_bytes.reshape(data.shape[:-1] + (bias_bytes.shape[-1],)) - print(quantized_data.shape, scale.shape, scale_bytes.shape, bias.shape, bias_bytes.shape) - return np.concatenate([quantized_data, scale_bytes, bias_bytes], axis=-1) - - -def fused_rowwise_8bit_quantize_dequantize_reference(data): - fused_quantized = fused_rowwise_8bit_quantize_reference(data) - scale = bytes_to_floats(fused_quantized[..., -8:-4].astype(np.uint8).reshape(-1, 4)) - scale = scale.reshape(fused_quantized.shape[:-1] + (scale.shape[-1],)) - bias = bytes_to_floats(fused_quantized[..., -4:].astype(np.uint8).reshape(-1, 4)) - bias = bias.reshape(fused_quantized.shape[:-1] + (bias.shape[-1],)) - quantized_data = fused_quantized[..., :-8] - return quantized_data * scale + bias - - -class TestFused8BitRowwiseQuantizationConversion(hu.HypothesisTestCase): - @given(input_data=hu.tensor(min_dim=1, max_dim=3, max_value=33)) - def test_quantize_op(self, input_data): - input_data[input_data == -0.0] = 0.0 - - quantize = core.CreateOperator( - 'FloatToFused8BitRowwiseQuantized', - ['input_data'], - ['quantized_data'], - ) - workspace.FeedBlob('input_data', input_data) - workspace.RunOperatorOnce(quantize) - - quantized_data = workspace.FetchBlob('quantized_data') - - reference = fused_rowwise_8bit_quantize_reference( - input_data.astype(np.float32) - ) - np.testing.assert_array_almost_equal(quantized_data, reference) - - @given(input_data=hu.tensor(min_dim=1, max_dim=3, max_value=33)) - def test_quantize_and_dequantize_op(self, input_data): - quantize = core.CreateOperator( - 'FloatToFused8BitRowwiseQuantized', - ['input_data'], - ['quantized_data'], - ) - workspace.FeedBlob('input_data', input_data) - workspace.RunOperatorOnce(quantize) - - quantized_data = workspace.FetchBlob('quantized_data') - - dequantize = core.CreateOperator( - 'Fused8BitRowwiseQuantizedToFloat', - ['quantized_data'], - ['dequantized_data'], - ) - workspace.FeedBlob('quantized_data', quantized_data) - workspace.RunOperatorOnce(dequantize) - - dequantized_data = workspace.FetchBlob('dequantized_data') - - reference = fused_rowwise_8bit_quantize_dequantize_reference(input_data) - np.testing.assert_array_almost_equal(dequantized_data, reference) diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py deleted file mode 100644 index 318c4ad33bc9..000000000000 --- a/caffe2/python/gradient_check_test.py +++ /dev/null @@ -1,557 +0,0 @@ -# TODO(jiayq): as more and more tests are moving to hypothesis test, we -# can gradually remove this test script. DO NOT ADD MORE TESTS TO THIS -# FILE. 
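Editor's note: the fused 8-bit rowwise reference deleted just above packs each row as the uint8 codes followed by a 4-byte float32 scale and a 4-byte float32 bias, so a value round-trips as x ~ q * scale + bias with scale = (max - min) / 255 and bias = min. A minimal numpy sketch of that layout (helper names are ours, not the operator's; numpy's half-to-even rounding stands in for the reference's vectorized round):

```python
import numpy as np

def quantize_row(row):
    # Per-row affine quantization: q = round((x - min) * 255 / (max - min)).
    lo, hi = np.float32(row.min()), np.float32(row.max())
    scale = (hi - lo) / np.float32(255.0)
    q = np.round((row - lo) * (255.0 / (hi - lo + 1e-8))).astype(np.uint8)
    # Fuse the codes with the 4 scale bytes and 4 bias bytes, as the op does.
    return np.concatenate([
        q,
        np.frombuffer(np.float32(scale).tobytes(), dtype=np.uint8),
        np.frombuffer(np.float32(lo).tobytes(), dtype=np.uint8),
    ])

def dequantize_row(fused):
    scale = np.frombuffer(fused[-8:-4].tobytes(), dtype=np.float32)[0]
    bias = np.frombuffer(fused[-4:].tobytes(), dtype=np.float32)[0]
    return fused[:-8].astype(np.float32) * scale + bias

row = np.random.rand(16).astype(np.float32)
# Worst-case round-trip error is about scale / 2.
assert np.allclose(dequantize_row(quantize_row(row)), row, atol=1e-2)
```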
- - - - - -import numpy as np -from caffe2.python import ( - brew, - core, - device_checker, - gradient_checker, - model_helper, - test_util, - workspace, -) -from caffe2.python.gradient_checker import NetGradientChecker -from caffe2.python.net_builder import ops, NetBuilder -from caffe2.proto import caffe2_pb2 - -import unittest -from typing import Optional - - -if workspace.has_gpu_support and workspace.NumGpuDevices() > 0: - _gpu_dev_option = caffe2_pb2.DeviceOption() - _gpu_dev_option.device_type = workspace.GpuDeviceType - cpu_device_option = caffe2_pb2.DeviceOption() - gpu_device_checker = device_checker.DeviceChecker( - 0.01, [_gpu_dev_option] - ) - device_checker = device_checker.DeviceChecker( - 0.01, [_gpu_dev_option, cpu_device_option] - ) - gpu_gradient_checkers = [ - gradient_checker.GradientChecker( - 0.005, 0.05, _gpu_dev_option, "gpu_checker_ws" - ), - ] - gradient_checkers = [ - gradient_checker.GradientChecker( - 0.005, 0.05, _gpu_dev_option, "gpu_checker_ws" - ), - gradient_checker.GradientChecker( - 0.01, 0.05, cpu_device_option, "cpu_checker_ws" - ), - ] - gpu_device_option: Optional[caffe2_pb2.DeviceOption] = _gpu_dev_option -else: - cpu_device_option = caffe2_pb2.DeviceOption() - gpu_device_option = None - gpu_device_checker = device_checker.DeviceChecker( - 0.01, [] - ) - device_checker = device_checker.DeviceChecker(0.01, [cpu_device_option]) - - gradient_checkers = [ - gradient_checker.GradientChecker( - 0.01, 0.05, cpu_device_option, "cpu_checker_ws" - ) - ] - gpu_gradient_checkers = [] - - -class TestLRN(test_util.TestCase): - - def setUp(self): - self.test_configs = [(6, 10), (3, 13), ] - - def testLRN(self): - for input_size, depth in self.test_configs: - op = core.CreateOperator("LRN", - ["X"], - ["Y", "Y_scale"], - size=11, - alpha=0.001, - beta=0.5, - bias=2.0, - order="NHWC" - ) - X = np.random.rand(2, input_size, input_size, - depth).astype(np.float32) - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestFlatten(test_util.TestCase): - - def testFlatten(self): - op = core.CreateOperator("Flatten", ["X"], ["Y"]) - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestConcat(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # input_size, depth1, depth2, depth3, depth4 - (3, 2, 3, 4, 5), - (4, 5, 4, 3, 2), - ] - - def testConcatNHWC(self): - for input_size, d1, d2, d3, d4 in self.test_configs: - op = core.CreateOperator("Concat", - ["X1", "X2", "X3", "X4"], - ["Y", "Y_dims"], - order="NHWC" - ) - Xs = [ - np.random.rand(2, input_size, input_size, - d1).astype(np.float32), - np.random.rand(2, input_size, input_size, - d2).astype(np.float32), - np.random.rand(2, input_size, input_size, - d3).astype(np.float32), - np.random.rand(2, input_size, input_size, d4).astype(np.float32) - ] - for i in range(4): - res = device_checker.CheckSimple(op, Xs, [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, Xs, i, - [0]) - self.assertTrue(res) - - def testConcatNCHW(self): - for input_size, d1, d2, d3, d4 in self.test_configs: - op = core.CreateOperator("Concat", - ["X1", "X2", "X3", "X4"], - ["Y", "Y_dims"], - order="NCHW" - ) - 
Xs = [ - np.random.rand(2, d1, input_size, - input_size).astype(np.float32), - np.random.rand(2, d2, input_size, - input_size).astype(np.float32), - np.random.rand(2, d3, input_size, - input_size).astype(np.float32), - np.random.rand(2, d4, input_size, input_size).astype(np.float32) - ] - for i in range(4): - res = device_checker.CheckSimple(op, Xs, [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, Xs, i, - [0]) - self.assertTrue(res) - - -class TestRelu(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # input size - # (0, 1), - (1, 1), - (2, 1), - (1, 3, 3, 1), - (2, 3, 3, 1), - (1, 5, 5, 3), - (2, 5, 5, 3), - ] - - def testRelu(self): - for input_size in self.test_configs: - op = core.CreateOperator("Relu", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - # go away from the origin point to avoid kink problems - X += 0.01 * np.sign(X) - X[X == 0] = 0.01 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestTanh(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # (0, 1), - (1, 1), - (2, 1), - (1, 2, 3, 4), - ] - - def testTanh(self): - for input_size in self.test_configs: - op = core.CreateOperator("Tanh", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestAbs(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - (1, 1), - (2, 3), - (2, 3, 4), - (2, 3, 4, 5), - ] - - def testAbs(self): - for input_size in self.test_configs: - op = core.CreateOperator("Abs", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - # go away from the origin point to avoid kink problems - X += 0.01 * np.sign(X) - X[X == 0] = 0.01 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - -class TestExp(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # (0, 1), - (1, 1), - (2, 1), - (1, 2, 3, 4), - ] - - def testExp(self): - for input_size in self.test_configs: - op = core.CreateOperator("Exp", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - -class TestCos(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - (1, 1), - (2, 3), - (2, 3, 4), - (2, 3, 4, 5), - ] - - def testCos(self): - for input_size in self.test_configs: - op = core.CreateOperator("Cos", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - -class TestSin(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - (1, 1), - (2, 3), - (2, 3, 4), - (2, 3, 4, 5), - ] - - def testSin(self): - for input_size in self.test_configs: - op = core.CreateOperator("Sin", ["X"], ["Y"]) - X = 
np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - -class TestSigmoid(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # (0, 1), - (1, 1), - (2, 1), - (1, 2, 3, 4), - ] - - def testSigmoid(self): - for input_size in self.test_configs: - op = core.CreateOperator("Sigmoid", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestSum(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # ((0, 1), False), - ((1, 2, 3, 4), True), - ((1, 2, 3, 4), False)] - - def testSum(self): - for (input_size, in_place) in self.test_configs: - op = core.CreateOperator("Sum", ["X1", "X2"], - ["Y" if not in_place else "X1"]) - X1 = np.random.rand(*input_size).astype(np.float32) - 0.5 - X2 = np.random.rand(*input_size).astype(np.float32) - 0.5 - res = device_checker.CheckSimple(op, [X1, X2], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple( - op, [X1, X2], 0, [0]) - self.assertTrue(res) - res, grad, grad_estimated = checker.CheckSimple( - op, [X1, X2], 1, [0]) - self.assertTrue(res) - - -class TestMakeTwoClass(test_util.TestCase): - - def setUp(self): - self.test_configs = [ - # input size - # (0, 1), - (1,), - (7,), - (1, 3), - (2, 5), - ] - - def testMakeTwoClass(self): - for input_size in self.test_configs: - op = core.CreateOperator("MakeTwoClass", ["X"], ["Y"]) - X = np.random.rand(*input_size).astype(np.float32) - # step a little to avoid gradient problems - X[X < 0.01] += 0.01 - X[X > 0.99] -= 0.01 - res = device_checker.CheckSimple(op, [X], [0]) - self.assertTrue(res) - for checker in gradient_checkers: - res, grad, grad_estimated = checker.CheckSimple(op, [X], 0, [0]) - self.assertTrue(res) - - -class TestNetGradientChecker(test_util.TestCase): - def test_net_gradient_checker(self): - model = model_helper.ModelHelper(name="test") - const = model.net.AddExternalInputs("const1", "const2") - fc = brew.fc(model, dim_in=3, dim_out=4, blob_in="X", blob_out="Y", axis=0) - dist = [model.net.SquaredL2Distance([fc, c]) for c in const] - losses = [model.net.AveragedLoss(d) for d in dist] # using two losses here - - workspace.RunNetOnce(model.param_init_net) - NetGradientChecker.Check( - model.net, - outputs_with_grad=losses, - input_values={"X": np.array([1, 2, 3], dtype="float32"), - const[0]: np.array([1, 1, 1, 1], dtype="float32"), - const[1]: np.array([2, 2, 2, 2], dtype="float32")}, - input_to_check="X", - ) - - def test_net_comparison(self): - # (a + b) * (c + d) == a * c + a * d + b * c + b * d - net1 = core.Net("net1") - a, b, c, d = net1.AddExternalInputs("a", "b", "c", "d") - a_b = net1.Sum([a, b], "a+b") - c_d = net1.Sum([c, d], "c+d") - x = net1.Mul([a_b, c_d], "x") - - net2 = core.Net("net2") - ac = net2.Mul([a, c], "ac") - ad = net2.Mul([a, d], "ad") - bc = net2.Mul([b, c], "bc") - bd = net2.Mul([b, d], "bd") - y = net2.Sum([ac, ad, bc, bd], "y") - - input_values = {blob: np.array([i], dtype=np.float32) - for i, blob in enumerate([a, b, c, d])} - - NetGradientChecker.CompareNets( - [net1, net2], [[x], [y]], [0], - inputs_with_grads=[a, b, c, d], - input_values=input_values, - ) - 
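Editor's note: every test above reduces to the same recipe: build the op, run a device check, then compare the registered analytic gradient against a symmetric finite difference of the checker's implicit sum-of-squares loss. A standalone sketch of that numeric check (our helper, not the deleted GradientChecker):

```python
import numpy as np

def numeric_grad(f, x, h=1e-4):
    # Central differences: dL/dx_i ~ (f(x + h*e_i) - f(x - h*e_i)) / (2h).
    g = np.zeros_like(x)
    flat_x, flat_g = x.reshape(-1), g.reshape(-1)
    for i in range(flat_x.size):
        orig = flat_x[i]
        flat_x[i] = orig + h
        pos = f(x)
        flat_x[i] = orig - h
        neg = f(x)
        flat_x[i] = orig  # restore before moving on
        flat_g[i] = (pos - neg) / (2.0 * h)
    return g

# The checker's loss is 0.5 * sum(y ** 2); for y = relu(x) with x > 0
# the analytic gradient is simply x itself.
x = np.random.rand(5) + 0.5  # stay away from the kink at 0, as the tests do
loss = lambda t: 0.5 * np.sum(np.maximum(t, 0.0) ** 2)
assert np.allclose(numeric_grad(loss, x), x, atol=1e-5)
```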
- -class TestIf(test_util.TestCase): - def testIf(self): - W_a_values = [2.0, 1.5] - B_a_values = [0.5] - W_b_values = [7.0, 3.5] - B_b_values = [1.5] - - with NetBuilder(_use_control_ops=True) as init_nb: - W_a = ops.UniformFill([], "W_a", shape=[1, 2], min=-1., max=1.) - B_a = ops.ConstantFill([], "B_a", shape=[1], value=0.0) - W_b = ops.UniformFill([], "W_b", shape=[1, 2], min=-1., max=1.) - B_b = ops.ConstantFill([], "B_b", shape=[1], value=0.0) - - W_gt_a = ops.GivenTensorFill( - [], "W_gt_a", shape=[1, 2], values=W_a_values) - B_gt_a = ops.GivenTensorFill([], "B_gt_a", shape=[1], values=B_a_values) - W_gt_b = ops.GivenTensorFill( - [], "W_gt_b", shape=[1, 2], values=W_b_values) - B_gt_b = ops.GivenTensorFill([], "B_gt_b", shape=[1], values=B_b_values) - - params = [W_gt_a, B_gt_a, W_a, B_a, W_gt_b, B_gt_b, W_b, B_b] - - with NetBuilder(_use_control_ops=True, initial_scope=params) as train_nb: - Y_pred = ops.ConstantFill([], "Y_pred", shape=[1], value=0.0) - Y_noise = ops.ConstantFill([], "Y_noise", shape=[1], value=0.0) - - switch = ops.UniformFill( - [], "switch", shape=[1], min=-1., max=1., run_once=0) - zero = ops.ConstantFill([], "zero", shape=[1], value=0.0) - X = ops.GaussianFill( - [], "X", shape=[4096, 2], mean=0.0, std=1.0, run_once=0) - noise = ops.GaussianFill( - [], "noise", shape=[4096, 1], mean=0.0, std=1.0, run_once=0) - - with ops.IfNet(ops.LT([switch, zero])): - Y_gt = ops.FC([X, W_gt_a, B_gt_a], "Y_gt") - ops.Add([Y_gt, noise], Y_noise) - ops.FC([X, W_a, B_a], Y_pred) - with ops.Else(): - Y_gt = ops.FC([X, W_gt_b, B_gt_b], "Y_gt") - ops.Add([Y_gt, noise], Y_noise) - ops.FC([X, W_b, B_b], Y_pred) - - dist = ops.SquaredL2Distance([Y_noise, Y_pred], "dist") - loss = dist.AveragedLoss([], ["loss"]) - - assert len(init_nb.get()) == 1, "Expected a single init net produced" - assert len(train_nb.get()) == 1, "Expected a single train net produced" - - train_net = train_nb.get()[0] - gradient_map = train_net.AddGradientOperators([loss]) - - init_net = init_nb.get()[0] - ITER = init_net.ConstantFill( - [], "ITER", shape=[1], value=0, dtype=core.DataType.INT64) - train_net.Iter(ITER, ITER) - LR = train_net.LearningRate(ITER, "LR", base_lr=-0.1, - policy="step", stepsize=20, gamma=0.9) - ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.) 
- train_net.WeightedSum([W_a, ONE, gradient_map[W_a], LR], W_a) - train_net.WeightedSum([B_a, ONE, gradient_map[B_a], LR], B_a) - train_net.WeightedSum([W_b, ONE, gradient_map[W_b], LR], W_b) - train_net.WeightedSum([B_b, ONE, gradient_map[B_b], LR], B_b) - - workspace.RunNetOnce(init_net) - workspace.CreateNet(train_net) - # print("Before training, W_a is: {}".format(workspace.FetchBlob("W_a"))) - # print("Before training, B_a is: {}".format(workspace.FetchBlob("B_a"))) - # print("Before training, W_b is: {}".format(workspace.FetchBlob("W_b"))) - # print("Before training, B_b is: {}".format(workspace.FetchBlob("B_b"))) - - for _epoch in range(1000): - workspace.RunNet(train_net.Proto().name) - - # print("After training, W_a is: {}".format(workspace.FetchBlob("W_a"))) - # print("After training, B_a is: {}".format(workspace.FetchBlob("B_a"))) - # print("After training, W_b is: {}".format(workspace.FetchBlob("W_b"))) - # print("After training, B_b is: {}".format(workspace.FetchBlob("B_b"))) - # print("Ground truth W_a is: {}".format(workspace.FetchBlob("W_gt_a"))) - # print("Ground truth B_a is: {}".format(workspace.FetchBlob("B_gt_a"))) - # print("Ground truth W_b is: {}".format(workspace.FetchBlob("W_gt_b"))) - # print("Ground truth B_b is: {}".format(workspace.FetchBlob("B_gt_b"))) - - values_map = { - "W_a": W_a_values, - "B_a": B_a_values, - "W_b": W_b_values, - "B_b": B_b_values, - } - - train_eps = 0.01 - - for blob_name, values in values_map.items(): - trained_values = workspace.FetchBlob(blob_name) - if trained_values.ndim == 2: - self.assertEqual(trained_values.shape[0], 1) - trained_values = trained_values[0][:] - else: - self.assertEqual(trained_values.ndim, 1) - - self.assertEqual(trained_values.size, len(values)) - for idx in range(len(trained_values)): - self.assertTrue(abs(trained_values[idx] - values[idx]) < train_eps) - - -class TestWhile(test_util.TestCase): - @unittest.skip("Skip flaky test.") - def testWhile(self): - with NetBuilder(_use_control_ops=True) as nb: - ops.Copy(ops.Const(0), "i") - ops.Copy(ops.Const(1), "one") - ops.Copy(ops.Const(2), "two") - ops.Copy(ops.Const(2.0), "x") - ops.Copy(ops.Const(3.0), "y") - ops.Copy(ops.Const(2.0), "z") - # raises x to the power of 4 and y to the power of 2 - # and z to the power of 3 - with ops.WhileNet(): - with ops.Condition(): - ops.Add(["i", "one"], "i") - ops.LE(["i", "two"]) - ops.Pow("x", "x", exponent=2.0) - with ops.IfNet(ops.LT(["i", "two"])): - ops.Pow("y", "y", exponent=2.0) - with ops.Else(): - ops.Pow("z", "z", exponent=3.0) - - ops.Add(["x", "y"], "x_plus_y") - ops.Add(["x_plus_y", "z"], "s") - - assert len(nb.get()) == 1, "Expected a single net produced" - net = nb.get()[0] - - net.AddGradientOperators(["s"]) - workspace.RunNetOnce(net) - # (x^4)' = 4x^3 - self.assertAlmostEqual(workspace.FetchBlob("x_grad"), 32) - self.assertAlmostEqual(workspace.FetchBlob("x"), 16) - # (y^2)' = 2y - self.assertAlmostEqual(workspace.FetchBlob("y_grad"), 6) - self.assertAlmostEqual(workspace.FetchBlob("y"), 9) - # (z^3)' = 3z^2 - self.assertAlmostEqual(workspace.FetchBlob("z_grad"), 12) - self.assertAlmostEqual(workspace.FetchBlob("z"), 8) - - -if __name__ == '__main__': - workspace.GlobalInit(["python"]) - unittest.main() diff --git a/caffe2/python/gradient_checker.py b/caffe2/python/gradient_checker.py deleted file mode 100644 index f4eabaa274f8..000000000000 --- a/caffe2/python/gradient_checker.py +++ /dev/null @@ -1,387 +0,0 @@ -## @package gradient_checker -# Module caffe2.python.gradient_checker - - - - - -import os 
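Editor's note: the four WeightedSum ops above are the entire optimizer of this test: each computes param <- 1.0 * param + LR * grad, and because base_lr is negative (-0.1) the update descends. A sketch of the schedule and step, assuming the "step" LR policy means base_lr * gamma^(iter // stepsize):

```python
base_lr, gamma, stepsize = -0.1, 0.9, 20  # the values used in the test above

def lr_at(iteration):
    # Assumed "step" policy: decay by gamma every `stepsize` iterations.
    return base_lr * gamma ** (iteration // stepsize)

def weighted_sum_update(w, grad_w, iteration):
    # WeightedSum([w, ONE, grad, LR], w): w <- 1.0 * w + lr * grad
    return 1.0 * w + lr_at(iteration) * grad_w
```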
-import numpy as np - -from caffe2.python import core, workspace, net_drawer -from caffe2.proto import caffe2_pb2 - - -def getGradientForOp(op): - return core.GradientRegistry.GetGradientForOp( - op, [s + '_grad' for s in op.output]) - - -def _get_grad_blob(grad_map, input_to_check): - grad_blob = grad_map[input_to_check] - - if isinstance(grad_blob, core.BlobReference): - return workspace.blobs[grad_blob] - - # If grad_blob is not a single blob, it should be a gradient slice. - # To make it comparable with the estimated gradient which is dense, - # we need to first convert grad_blob to dense gradient. - assert isinstance(grad_blob, core.GradientSlice) - dense_grad = 'tmp_dense_grad' - sparse_to_dense_op = core.CreateOperator( - 'SparseToDense', - [grad_blob.indices, grad_blob.values, input_to_check], - dense_grad, - ) - workspace.RunOperatorOnce(sparse_to_dense_op) - return workspace.blobs[dense_grad] - - -def _get_grad(net, outputs, outputs_with_grad, input_values, inputs_with_grads): - grad_net = net.Clone(net.Name() + "_copy") - grad_map = grad_net.AddGradientOperators(outputs_with_grad) - - for name, value in (input_values or {}).items(): - workspace.blobs[name] = value - - for input_to_check in inputs_with_grads: - assert input_to_check in grad_map, ( - '{} has no gradient, cannot check net gradient.'.format( - input_to_check)) - assert str(input_to_check) in workspace.blobs - - workspace.RunNetOnce(grad_net) - forward_results = [(output, workspace.blobs[output]) for output in outputs] - grads = {input_to_check: _get_grad_blob(grad_map, input_to_check) - for input_to_check in inputs_with_grads} - - return forward_results, grads, grad_net - - -def _assert_close(value1, value2, threshold, err_msg=''): - np.testing.assert_allclose( - value1, value2, - atol=threshold, rtol=threshold, - err_msg=err_msg, - ) - - delta = np.abs(value1 - value2).flatten() - return np.mean(delta), max(delta) - - -class NetGradientChecker: - @staticmethod - def CompareNets(nets, outputs, outputs_with_grad_ids, - inputs_with_grads, input_values=None, - threshold=0.0000001, print_net_images=False): - def _get_output_with_grad_names(net_outputs): - return [net_outputs[i] for i in outputs_with_grad_ids] - - if print_net_images: - for i, net in enumerate(nets): - png = net_drawer.GetPydotGraph(net).create_png() - with open("caffe2_net_forward_" + str(i) + net.Name() + ".png", - 'wb') \ - as f: - f.write(png) - - results = [ - _get_grad(net, net_outputs, - _get_output_with_grad_names(net_outputs), - input_values, inputs_with_grads) - for net, net_outputs in zip(nets, outputs) - ] - - if print_net_images: - _, _, backward_nets = zip(*results) - for i, net in enumerate(backward_nets): - png = net_drawer.GetPydotGraph(net).create_png() - with open("caffe2_net_" + str(i) + net.Name() + ".png", 'wb') \ - as f: - f.write(png) - - first_net_results, first_net_grads, _ = results[0] - for net_results, net_grads, _ in results[1:]: - assert len(net_results) == len(first_net_results) - for idx, ((blob1, blob_value1), (blob2, blob_value2)) in enumerate( - zip(first_net_results, net_results)): - _assert_close( - blob_value1, blob_value2, threshold, - err_msg="Different forward pass results for output id {}. 
" - "Corresponding output blobs: {} and {}".format( - idx, blob1, blob2)) - - assert net_grads.keys() == first_net_grads.keys() - for blob, blob_grad_value in net_grads.items(): - _assert_close( - first_net_grads[blob], blob_grad_value, threshold, - err_msg="Different gradients for input {}".format(blob)) - - @staticmethod - def Check(net, outputs_with_grad, input_values, - input_to_check, step_size=0.0001, - threshold=0.05, print_net=True): - - net_results, net_grads, full_net = _get_grad( - net, [], outputs_with_grad, input_values, [input_to_check]) - analytic_grad = net_grads[input_to_check] - - def GetLoss(new_value): - workspace.blobs[input_to_check] = new_value - workspace.RunNetOnce(full_net) - return sum([ - workspace.blobs[output] - for output in outputs_with_grad - ]).sum() - - def GetValue(dim, delta): - input_value = input_values[input_to_check].copy() - input_value.flat[dim] += delta - return input_value - - grad_estimate = np.zeros_like(input_values[input_to_check]) - for dim in range(input_values[input_to_check].size): - pos_loss = GetLoss(GetValue(dim, step_size)) - neg_loss = GetLoss(GetValue(dim, -step_size)) - grad_estimate.flat[dim] = (pos_loss - neg_loss) / step_size / 2 - - err_msg = "Error in gradient check for net_copy {}".format( - net.Name()) - if print_net: - err_msg += ": {}".format(net.Proto()) - - return _assert_close(analytic_grad, grad_estimate, threshold, err_msg) - - -class GradientChecker: - """A gradient checker in Python. - - This is not the most efficient way to check gradients, as the Python - interface will involve a lot of copies back and forth operations. Use at your - own risk. - """ - - def __init__( - self, - stepsize, - threshold, - device_option=None, - workspace_name="gradient_check", - input_device_options=None, - ): - self._stepsize = stepsize - self._threshold = threshold - self._device_option = device_option or caffe2_pb2.DeviceOption() - self._workspace_name = workspace_name - if input_device_options is None: - self._input_device_options = {} - else: - self._input_device_options = input_device_options - - def GetLossAndGrad( - self, op, grad_ops, inputs, input_names, input_to_check, grad_name, - outputs_with_grads - ): - for i in range(len(inputs)): - workspace.FeedBlob(input_names[i], inputs[i], - self._input_device_options.get( - input_names[i], self._device_option)) - x = inputs[input_to_check] - # Run. - workspace.RunOperatorOnce(op) - loss = 0. - # Get Loss and feed in the gradients, run gradient ops. - for idx in outputs_with_grads: - name = op.output[idx] - arr = workspace.FetchBlob(name) - loss += (arr**2).sum() - workspace.FeedBlob(name + '_grad', arr, self._device_option) - loss /= 2. 
- # Run gradient ops - workspace.RunOperatorsOnce(grad_ops) - # Get gradients - if isinstance(grad_name, core.GradientSlice): - workspace.FeedBlob('zeros', np.zeros_like(x, dtype=np.float32)) - workspace.FeedBlob('ones', np.ones(1, dtype=np.float32)) - gv_cpu_op = core.CreateOperator( - 'EnsureCPUOutput', grad_name.values, grad_name.values + '_cpu', - device_option=self._device_option - ) - gi_cpu_op = core.CreateOperator( - 'EnsureCPUOutput', grad_name.indices, grad_name.indices + '_cpu', - device_option=self._device_option - ) - sparse_to_dense_op = core.CreateOperator( - 'ScatterWeightedSum', - [ - 'zeros', 'ones', grad_name.indices + '_cpu', - grad_name.values + '_cpu', 'ones' - ], - 'zeros', - ) - workspace.RunOperatorOnce(gv_cpu_op) - workspace.RunOperatorOnce(gi_cpu_op) - workspace.RunOperatorOnce(sparse_to_dense_op) - grad = workspace.FetchBlob('zeros') - else: - grad = workspace.FetchBlob(grad_name) - return loss, grad - - def CheckSimple( - self, - op, - inputs, - input_to_check, - outputs_with_grads, - grad_ops=None, - input_device_options=None, - ensure_outputs_are_inferred=False, - ): - """Checks the operator in a very simple fashion by stacking a sum of - squares on the top. - - Inputs: - op: the operator to be checked. - inputs: the input data in numpy arrays. - input_to_check: an index specifying which input blob we should - check. - outputs_with_grads: indices specifying which output blobs will we - need to check gradients with. For these outputs, we will collect a - squared sum and also feed in their gradients. - grad_operator: the gradient operator. If not given, we will get the - gradient operator from the gradient registry. - input_device_options: an optional mapping from input names to - DeviceOptions (to override the default DeviceOption) - ensure_outputs_are_inferred: if set will assert that the gradient output - shapes matches the inferred shapes - Outputs: - boolean: True if it passes, False if it does not pass. - """ - # Entering the checker workspace - old_ws_name = workspace.CurrentWorkspace() - if self._workspace_name != old_ws_name: - workspace.SwitchWorkspace(self._workspace_name, True) - - op.device_option.CopyFrom(self._device_option) - if grad_ops is None: - # TODO(jiayq): use the gradient registration instead of the old - # hack. - grad_ops, g_input = getGradientForOp(op) - - - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - # First, feed in the input. - for i, arr in enumerate(inputs): - workspace.FeedBlob( - op.input[i], arr, - _input_device_options.get( - op.input[i], self._device_option)) - - # Get the loss and gradient for the original. - grad_name = g_input[input_to_check] - loss, grad = self.GetLossAndGrad( - op, grad_ops, inputs, op.input, input_to_check, grad_name, - outputs_with_grads, - ) - grad_estimate = np.zeros_like(inputs[input_to_check]) - if grad_estimate.shape != grad.shape: - raise Exception( - "Mismatched gradient shapes: estimated ({}), grad ({})".format( - grad_estimate.shape, grad.shape)) - - if ensure_outputs_are_inferred: - self._assertInferTensorChecks(op, grad_ops) - - full_grad_check = os.getenv('CAFFE2_FULL_GRAD_CHECK') == '1' - - dims_to_check = inputs[input_to_check].size - for current_dim in range(dims_to_check): - # Grad check is very expensive (as it involves running the op from - # scratch for each of the input tensor elements). Thus, let's - # run it by default only on a small subset of dimensions. 
Here we - # apply a very scientific approach: the first and the last 3 elements - # of each tensor. Pass CAFFE2_FULL_GRAD_CHECK=1 env var to enable - # the full check - if not full_grad_check and current_dim >= 3 and \ - current_dim + 3 < dims_to_check: - grad_estimate.flat[current_dim] = grad.flat[current_dim] - continue - # Positive gradient - inputs[input_to_check].flat[current_dim] += self._stepsize - pos_loss, _ = self.GetLossAndGrad( - op, grad_ops, inputs, op.input, input_to_check, grad_name, - outputs_with_grads - ) - # Negative gradient - inputs[input_to_check].flat[current_dim] -= self._stepsize * 2 - neg_loss, _ = self.GetLossAndGrad( - op, grad_ops, inputs, op.input, input_to_check, grad_name, - outputs_with_grads - ) - # Recover the value - inputs[input_to_check].flat[current_dim] += self._stepsize - grad_estimate.flat[current_dim] = ( - pos_loss - neg_loss) / self._stepsize / 2 - # Now, check correctness - fail_mat = ~np.isclose( - grad, grad_estimate, atol=self._threshold, rtol=self._threshold) - if np.any(fail_mat): - idx = np.flatnonzero(fail_mat) - print('Failed. [idx, grad, grad_estimate] are:') - print(np.vstack([idx, grad.flat[idx], grad_estimate.flat[idx]]).T) - ret = False - else: - ret = True - # After finishing, cleaning up things. - if self._workspace_name != old_ws_name: - # We reset the workspace to make sure everything intermediate is - # cleaned up. Note that there is no need to delete a workspace - - # when empty it takes a very limited amount of memory. - workspace.ResetWorkspace() - workspace.SwitchWorkspace(old_ws_name) - return ret, grad, grad_estimate - - def _assertInferTensorChecks(self, op, grad_ops): - tmp_net = caffe2_pb2.NetDef() - tmp_net.op.extend([op]) - tmp_net.op.extend(grad_ops) - inferred_shapes, inferred_types = workspace.InferShapesAndTypes( - [tmp_net], - nets_proto=True, - ) - - outputs = set() - for grad_op in grad_ops: - outputs.update(grad_op.output) - - for output in outputs: - if output not in inferred_shapes: - raise Exception( - "expected output {} to be inferred".format(output)) - blob = workspace.FetchBlob(output) - correct_shape = list(blob.shape) - inferred_shape = list(inferred_shapes[output]) - if correct_shape != inferred_shape: - raise Exception( - "Mismatched inferred shape: want({}), got({})".format( - correct_shape, inferred_shape)) - - if type(blob) is np.ndarray: - if blob.dtype == np.dtype('float64'): - correct_type = caffe2_pb2.TensorProto.DOUBLE - elif blob.dtype == np.dtype('float32'): - correct_type = caffe2_pb2.TensorProto.FLOAT - elif blob.dtype == np.dtype('int32'): - correct_type = caffe2_pb2.TensorProto.INT32 - elif blob.dtype == np.dtype('int64'): - correct_type = caffe2_pb2.TensorProto.INT64 - else: - correct_type = "unknown {}".format(blob.dtype) - else: - correct_type = str(type(blob)) - inferred_type = inferred_types[output] - if correct_type != inferred_type: - raise Exception( - "Mismatched inferred type: want({}), got({})".format( - correct_type, inferred_type)) diff --git a/caffe2/python/gru_cell.py b/caffe2/python/gru_cell.py deleted file mode 100644 index f5bb71abc657..000000000000 --- a/caffe2/python/gru_cell.py +++ /dev/null @@ -1,172 +0,0 @@ - - - - - -import functools -from caffe2.python import brew, rnn_cell - - -class GRUCell(rnn_cell.RNNCell): - - def __init__( - self, - input_size, - hidden_size, - forget_bias, # Currently unused! Values here will be ignored.
- memory_optimization, - drop_states=False, - linear_before_reset=False, - **kwargs - ): - super().__init__(**kwargs) - self.input_size = input_size - self.hidden_size = hidden_size - self.forget_bias = float(forget_bias) - self.memory_optimization = memory_optimization - self.drop_states = drop_states - self.linear_before_reset = linear_before_reset - - # Unlike LSTMCell, GRUCell needs the output of one gate to feed into another. - # (reset gate -> output_gate) - # So, much of the logic to calculate the reset gate output and modified - # output gate input is set here, in the graph definition. - # The remaining logic lives in gru_unit_op.{h,cc}. - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev = states[0] - - # Split input tensors to get inputs for each gate. - input_t_reset, input_t_update, input_t_output = model.net.Split( - [ - input_t, - ], - [ - self.scope('input_t_reset'), - self.scope('input_t_update'), - self.scope('input_t_output'), - ], - axis=2, - ) - - # Fully connected layers for reset and update gates. - reset_gate_t = brew.fc( - model, - hidden_t_prev, - self.scope('reset_gate_t'), - dim_in=self.hidden_size, - dim_out=self.hidden_size, - axis=2, - ) - update_gate_t = brew.fc( - model, - hidden_t_prev, - self.scope('update_gate_t'), - dim_in=self.hidden_size, - dim_out=self.hidden_size, - axis=2, - ) - - # Calculating the modified hidden state going into output gate. - reset_gate_t = model.net.Sum( - [reset_gate_t, input_t_reset], - self.scope('reset_gate_t') - ) - reset_gate_t_sigmoid = model.net.Sigmoid( - reset_gate_t, - self.scope('reset_gate_t_sigmoid') - ) - - # `self.linear_before_reset = True` matches cudnn semantics - if self.linear_before_reset: - output_gate_fc = brew.fc( - model, - hidden_t_prev, - self.scope('output_gate_t'), - dim_in=self.hidden_size, - dim_out=self.hidden_size, - axis=2, - ) - output_gate_t = model.net.Mul( - [reset_gate_t_sigmoid, output_gate_fc], - self.scope('output_gate_t_mul') - ) - else: - modified_hidden_t_prev = model.net.Mul( - [reset_gate_t_sigmoid, hidden_t_prev], - self.scope('modified_hidden_t_prev') - ) - output_gate_t = brew.fc( - model, - modified_hidden_t_prev, - self.scope('output_gate_t'), - dim_in=self.hidden_size, - dim_out=self.hidden_size, - axis=2, - ) - - # Add input contributions to update and output gate. - # We already (in-place) added input contributions to the reset gate. 
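Editor's note: the `linear_before_reset` branch above is the one behavioral fork in this cell, and it mirrors the two standard GRU candidate-state formulations. A plain-numpy sketch of the difference (names and shapes are ours; the deleted code defers the final nonlinearity to the GRUUnit op):

```python
import numpy as np

def gru_candidate(x, h, r, W, U, b_x, b_h, linear_before_reset):
    # r is the sigmoid reset gate; W ~ (d_h, d_in), U ~ (d_h, d_h).
    if linear_before_reset:
        # cudnn-style: the reset gate scales the *projected* hidden state.
        return np.tanh(W @ x + b_x + r * (U @ h + b_h))
    # classic GRU: the reset gate scales the hidden state *before* projection.
    return np.tanh(W @ x + b_x + U @ (r * h) + b_h)
```

The two variants agree only when U is diagonal, which is why cudnn compatibility needs the explicit flag.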
- update_gate_t = model.net.Sum( - [update_gate_t, input_t_update], - self.scope('update_gate_t'), - ) - output_gate_t = model.net.Sum( - [output_gate_t, input_t_output], - self.scope('output_gate_t_summed'), - ) - - # Join gate outputs and add input contributions - gates_t, _gates_t_concat_dims = model.net.Concat( - [ - reset_gate_t, - update_gate_t, - output_gate_t, - ], - [ - self.scope('gates_t'), - self.scope('_gates_t_concat_dims'), - ], - axis=2, - ) - - if seq_lengths is not None: - inputs = [hidden_t_prev, gates_t, seq_lengths, timestep] - else: - inputs = [hidden_t_prev, gates_t, timestep] - - hidden_t = model.net.GRUUnit( - inputs, - list(self.get_state_names()), - forget_bias=self.forget_bias, - drop_states=self.drop_states, - sequence_lengths=(seq_lengths is not None), - ) - model.net.AddExternalOutputs(hidden_t) - return (hidden_t,) - - def prepare_input(self, model, input_blob): - return brew.fc( - model, - input_blob, - self.scope('i2h'), - dim_in=self.input_size, - dim_out=3 * self.hidden_size, - axis=2, - ) - - def get_state_names(self): - return (self.scope('hidden_t'),) - - def get_output_dim(self): - return self.hidden_size - - -GRU = functools.partial(rnn_cell._LSTM, GRUCell) diff --git a/caffe2/python/helpers/__init__.py b/caffe2/python/helpers/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/helpers/algebra.py b/caffe2/python/helpers/algebra.py deleted file mode 100644 index 2b626677b029..000000000000 --- a/caffe2/python/helpers/algebra.py +++ /dev/null @@ -1,48 +0,0 @@ -## @package algebra -# Module caffe2.python.helpers.algebra - - - - - - -def transpose(model, blob_in, blob_out, use_cudnn=False, **kwargs): - """Transpose.""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - return model.net.Transpose(blob_in, blob_out, **kwargs) - - -def sum(model, blob_in, blob_out, **kwargs): - """Sum""" - return model.net.Sum(blob_in, blob_out, **kwargs) - - -def reduce_sum(model, blob_in, blob_out, **kwargs): - """ReduceSum""" - return model.net.ReduceSum(blob_in, blob_out, **kwargs) - - -def sub(model, blob_in, blob_out, **kwargs): - """Subtract""" - return model.net.Sub(blob_in, blob_out, **kwargs) - - -def mat_mul(model, blob_in, blob_out, **kwargs): - """Matrix multiplication""" - return model.net.MatMul(blob_in, blob_out, **kwargs) - - -def arg_min(model, blob_in, blob_out, **kwargs): - """ArgMin""" - return model.net.ArgMin(blob_in, blob_out, **kwargs) - -def batch_mat_mul(model, blob_in, blob_out, - enable_tensor_core=False, **kwargs): - if enable_tensor_core: - kwargs['engine'] = 'TENSORCORE' - - return model.net.BatchMatMul(blob_in, blob_out, **kwargs) - -def sparse_lengths_sum_4bit_rowwise_sparse(model, blob_in, blob_out, **kwargs): - return model.net.SparseLengthsSum4BitRowwiseSparse(blob_in, blob_out, **kwargs) diff --git a/caffe2/python/helpers/arg_scope.py b/caffe2/python/helpers/arg_scope.py deleted file mode 100644 index a112e9b84c5d..000000000000 --- a/caffe2/python/helpers/arg_scope.py +++ /dev/null @@ -1,35 +0,0 @@ - - - -import contextlib -import copy -import threading - -_threadlocal_scope = threading.local() - - -@contextlib.contextmanager -def arg_scope(single_helper_or_list, **kwargs): - global _threadlocal_scope - if not isinstance(single_helper_or_list, list): - assert callable(single_helper_or_list), \ - "arg_scope is only supporting single or a list of helper functions." 
single_helper_or_list = [single_helper_or_list] - old_scope = copy.deepcopy(get_current_scope()) - for helper in single_helper_or_list: - assert callable(helper), \ - "arg_scope only supports callable helper functions." - helper_key = helper.__name__ - if helper_key not in old_scope: - _threadlocal_scope.current_scope[helper_key] = {} - _threadlocal_scope.current_scope[helper_key].update(kwargs) - - yield - _threadlocal_scope.current_scope = old_scope - - -def get_current_scope(): - global _threadlocal_scope - if not hasattr(_threadlocal_scope, "current_scope"): - _threadlocal_scope.current_scope = {} - return _threadlocal_scope.current_scope diff --git a/caffe2/python/helpers/array_helpers.py b/caffe2/python/helpers/array_helpers.py deleted file mode 100644 index fae0011bf1f6..000000000000 --- a/caffe2/python/helpers/array_helpers.py +++ /dev/null @@ -1,25 +0,0 @@ -## @package array_helpers -# Module caffe2.python.helpers.array_helpers - - - - - - -def concat(model, blobs_in, blob_out, **kwargs): - """Depth Concat.""" - if kwargs.get('order') and kwargs.get('axis'): - # The backend throws an error if both are given - kwargs.pop('order') - - return model.net.Concat( - blobs_in, - [blob_out, "_" + blob_out + "_concat_dims"], - **kwargs - )[0] - - -def depth_concat(model, blobs_in, blob_out, **kwargs): - """The old depth concat function - we should move to use concat.""" - print("DepthConcat is deprecated. Use Concat instead.") - return concat(model, blobs_in, blob_out, **kwargs) diff --git a/caffe2/python/helpers/control_ops.py b/caffe2/python/helpers/control_ops.py deleted file mode 100644 index c6f71d0761a5..000000000000 --- a/caffe2/python/helpers/control_ops.py +++ /dev/null @@ -1,28 +0,0 @@ -## @package control_ops -# Module caffe2.python.helpers.control_ops - - - - - -from caffe2.python.control_ops_util import add_if_op, add_while_op - - -def cond(model, cond_blob, external_blobs, then_model, else_model=None): - """Condition""" - add_if_op( - model.net, - cond_blob, - external_blobs, - then_model.net, - else_model.net if else_model else None) - - -def loop(model, cond_blob, external_blobs, loop_model, cond_model=None): - """Loop""" - add_while_op( - model.net, - cond_blob, - external_blobs, - loop_model.net, - cond_model.net if cond_model else None) diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py deleted file mode 100644 index dfca165084df..000000000000 --- a/caffe2/python/helpers/conv.py +++ /dev/null @@ -1,362 +0,0 @@ -## @package conv -# Module caffe2.python.helpers.conv - - - - - -from caffe2.python import core -from caffe2.python.modeling import initializers -from caffe2.python.modeling.parameter_info import ParameterTags - -def _ConvBase( - model, - is_nd, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - WeightInitializer=None, - BiasInitializer=None, - group=1, - transform_inputs=None, - use_cudnn=False, - order="NCHW", - cudnn_exhaustive_search=False, - ws_nbytes_limit=None, - float16_compute=False, - **kwargs -): - kernels = [] - if is_nd: - if not isinstance(kernel, list): - kernels = [kernel] - else: - kernels = kernel - else: - if isinstance(kernel, list): - assert len(kernel) == 2, "Conv supports only a 2D kernel."
- kernels = kernel - else: - kernels = [kernel] * 2 - - requested_engine = kwargs.get('engine') - if requested_engine is not None: - if use_cudnn and requested_engine != 'CUDNN': - raise ValueError( - 'When use_cudnn=True, the only engine you can specify is ' - '"CUDNN"') - elif not use_cudnn and requested_engine == 'CUDNN': - raise ValueError( - 'When use_cudnn=False, the only engine you can specify is ' - '""') - - if use_cudnn: - kwargs['engine'] = 'CUDNN' - kwargs['exhaustive_search'] = cudnn_exhaustive_search - if ws_nbytes_limit: - kwargs['ws_nbytes_limit'] = ws_nbytes_limit - - use_bias =\ - False if ("no_bias" in kwargs and kwargs["no_bias"]) else True - blob_out = blob_out or model.net.NextName() - weight_shape = [dim_out] - if order == "NCHW": - weight_shape.append(int(dim_in / group)) - weight_shape.extend(kernels) - else: - weight_shape.extend(kernels) - weight_shape.append(int(dim_in / group)) - - WeightInitializer = initializers.update_initializer( - WeightInitializer, weight_init, ("XavierFill", {}) - ) - BiasInitializer = initializers.update_initializer( - BiasInitializer, bias_init, ("ConstantFill", {}) - ) - if not model.init_params: - WeightInitializer = initializers.ExternalInitializer() - BiasInitializer = initializers.ExternalInitializer() - - weight = model.create_param( - param_name=blob_out + '_w', - shape=weight_shape, - initializer=WeightInitializer, - tags=ParameterTags.WEIGHT - ) - if use_bias: - bias = model.create_param( - param_name=blob_out + '_b', - shape=[dim_out, ], - initializer=BiasInitializer, - tags=ParameterTags.BIAS - ) - - if use_bias: - inputs = [blob_in, weight, bias] - else: - inputs = [blob_in, weight] - - if transform_inputs is not None: - transform_inputs(model, blob_out, inputs) - - # Enable float 16 compute kernel (relevant for CUDA) - if float16_compute: - kwargs['float16_compute'] = True - - # For the operator, we no longer need to provide the no_bias field - # because it can automatically figure this out from the number of - # inputs. - if 'no_bias' in kwargs: - del kwargs['no_bias'] - if group != 1: - kwargs['group'] = group - if is_nd: - return model.net.Conv( - inputs, - blob_out, - kernels=kernels, - order=order, - **kwargs) - else: - if isinstance(kernel, list): - return model.net.Conv( - inputs, - blob_out, - kernel_h=kernel[0], - kernel_w=kernel[1], - order=order, - **kwargs) - else: - return model.net.Conv( - inputs, - blob_out, - kernel=kernel, - order=order, - **kwargs) - - - -def conv_nd( - model, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - WeightInitializer=None, - BiasInitializer=None, - group=1, - transform_inputs=None, - order="NCHW", - **kwargs -): - """N-dimensional convolution for inputs with NCHW storage order. - """ - assert order == "NCHW", "ConvNd only supported for NCHW storage." - return _ConvBase(model, True, blob_in, blob_out, dim_in, dim_out, kernel, - weight_init, bias_init, WeightInitializer, BiasInitializer, - group, transform_inputs, order=order, **kwargs) - - -def conv( - model, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - WeightInitializer=None, - BiasInitializer=None, - group=1, - transform_inputs=None, - **kwargs -): - """2-dimensional convolution. 
- """ - return _ConvBase(model, False, blob_in, blob_out, dim_in, dim_out, kernel, - weight_init, bias_init, WeightInitializer, BiasInitializer, - group, transform_inputs, **kwargs) - - -def conv_transpose( - model, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - use_cudnn=False, - order="NCHW", - cudnn_exhaustive_search=False, - ws_nbytes_limit=None, - **kwargs -): - """ConvTranspose. - """ - weight_init = weight_init if weight_init else ('XavierFill', {}) - bias_init = bias_init if bias_init else ('ConstantFill', {}) - blob_out = blob_out or model.net.NextName() - weight_shape = ( - [dim_in, dim_out, kernel, kernel] - if order == "NCHW" else [dim_in, kernel, kernel, dim_out] - ) - if model.init_params: - weight = model.param_init_net.__getattr__(weight_init[0])( - [], - blob_out + '_w', - shape=weight_shape, - **weight_init[1] - ) - bias = model.param_init_net.__getattr__(bias_init[0])( - [], - blob_out + '_b', - shape=[dim_out, ], - **bias_init[1] - ) - else: - weight = core.ScopedBlobReference( - blob_out + '_w', model.param_init_net) - bias = core.ScopedBlobReference( - blob_out + '_b', model.param_init_net) - model.AddParameter(weight, ParameterTags.WEIGHT) - model.AddParameter(bias, ParameterTags.BIAS) - if use_cudnn: - kwargs['engine'] = 'CUDNN' - kwargs['exhaustive_search'] = cudnn_exhaustive_search - if ws_nbytes_limit: - kwargs['ws_nbytes_limit'] = ws_nbytes_limit - return model.net.ConvTranspose( - [blob_in, weight, bias], - blob_out, - kernel=kernel, - order=order, - **kwargs - ) - - -def group_conv( - model, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - group=1, - **kwargs -): - """Group Convolution. - - This is essentially the same as Conv with a group argument passed in. - We specialize this for backward interface compatibility. - """ - return conv(model, blob_in, blob_out, dim_in, dim_out, kernel, - weight_init=weight_init, bias_init=bias_init, - group=group, **kwargs) - - -def group_conv_deprecated( - model, - blob_in, - blob_out, - dim_in, - dim_out, - kernel, - weight_init=None, - bias_init=None, - group=1, - use_cudnn=False, - order="NCHW", - cudnn_exhaustive_search=False, - ws_nbytes_limit=None, - **kwargs -): - """GroupConvolution's deprecated interface. - - This is used to simulate a group convolution via split and concat. You - should always use the new group convolution in your new code. - """ - weight_init = weight_init if weight_init else ('XavierFill', {}) - bias_init = bias_init if bias_init else ('ConstantFill', {}) - use_bias = False if ("no_bias" in kwargs and kwargs["no_bias"]) else True - if use_cudnn: - kwargs['engine'] = 'CUDNN' - kwargs['exhaustive_search'] = cudnn_exhaustive_search - if ws_nbytes_limit: - kwargs['ws_nbytes_limit'] = ws_nbytes_limit - if dim_in % group: - raise ValueError("dim_in should be divisible by group.") - if dim_out % group: - raise ValueError("dim_out should be divisible by group.") - splitted_blobs = model.net.DepthSplit( - blob_in, - ['_' + blob_out + '_gconv_split_' + str(i) for i in range(group)], - dimensions=[int(dim_in / group) for i in range(group)], - order=order - ) - weight_shape = ( - [dim_out / group, dim_in / group, kernel, kernel] - if order == "NCHW" else - [dim_out / group, kernel, kernel, dim_in / group] - ) - # Make sure that the shapes are of int format. Especially for py3 where - # int division gives float output. 
- weight_shape = [int(v) for v in weight_shape] - conv_blobs = [] - for i in range(group): - if model.init_params: - weight = model.param_init_net.__getattr__(weight_init[0])( - [], - blob_out + '_gconv_%d_w' % i, - shape=weight_shape, - **weight_init[1] - ) - if use_bias: - bias = model.param_init_net.__getattr__(bias_init[0])( - [], - blob_out + '_gconv_%d_b' % i, - shape=[int(dim_out / group)], - **bias_init[1] - ) - else: - weight = core.ScopedBlobReference( - blob_out + '_gconv_%d_w' % i, model.param_init_net) - if use_bias: - bias = core.ScopedBlobReference( - blob_out + '_gconv_%d_b' % i, model.param_init_net) - model.AddParameter(weight, ParameterTags.WEIGHT) - if use_bias: - model.AddParameter(bias, ParameterTags.BIAS) - if use_bias: - inputs = [weight, bias] - else: - inputs = [weight] - if 'no_bias' in kwargs: - del kwargs['no_bias'] - conv_blobs.append( - splitted_blobs[i].Conv( - inputs, - blob_out + '_gconv_%d' % i, - kernel=kernel, - order=order, - **kwargs - ) - ) - concat, concat_dims = model.net.Concat( - conv_blobs, - [blob_out, - "_" + blob_out + "_concat_dims"], - order=order - ) - return concat diff --git a/caffe2/python/helpers/db_input.py b/caffe2/python/helpers/db_input.py deleted file mode 100644 index d5772cb7653e..000000000000 --- a/caffe2/python/helpers/db_input.py +++ /dev/null @@ -1,17 +0,0 @@ -## @package db_input -# Module caffe2.python.helpers.db_input - - - - - -def db_input(model, blobs_out, batch_size, db, db_type): - dbreader_name = "dbreader_" + db - dbreader = model.param_init_net.CreateDB( - [], - dbreader_name, - db=db, - db_type=db_type, - ) - return model.net.TensorProtosDBInput( - dbreader, blobs_out, batch_size=batch_size) diff --git a/caffe2/python/helpers/dropout.py b/caffe2/python/helpers/dropout.py deleted file mode 100644 index d7280318f60d..000000000000 --- a/caffe2/python/helpers/dropout.py +++ /dev/null @@ -1,17 +0,0 @@ -## @package dropout -# Module caffe2.python.helpers.dropout - - - - - - -def dropout(model, blob_in, blob_out, use_cudnn=False, **kwargs): - """dropout""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - else: - kwargs['engine'] = 'DEFAULT' - assert 'is_test' in kwargs, "Argument 'is_test' is required" - return model.net.Dropout( - blob_in, [blob_out, "_" + blob_out + "_mask"], **kwargs)[0] diff --git a/caffe2/python/helpers/elementwise_linear.py b/caffe2/python/helpers/elementwise_linear.py deleted file mode 100644 index ef9184d00dd2..000000000000 --- a/caffe2/python/helpers/elementwise_linear.py +++ /dev/null @@ -1,46 +0,0 @@ -## @package elementwise_linear -# Module caffe2.python.helpers.elementwise_linear - - - - - -from caffe2.python import core -from caffe2.python.modeling.parameter_info import ParameterTags - - -def _elementwise_linear( - model, op_call, blob_in, blob_out, dim, - weight_init=None, bias_init=None, **kwargs -): - """Elementwise_Linear""" - weight_init = weight_init or ('ConstantFill', {'value': 1.0}) - bias_init = bias_init or ('ConstantFill', {'value': 0.0}) - blob_out = blob_out or model.net.NextName() - if model.init_params: - weight = model.param_init_net.__getattr__(weight_init[0])( - [], - blob_out + '_w', - shape=[dim], - **weight_init[1] - ) - bias = model.param_init_net.__getattr__(bias_init[0])( - [], - blob_out + '_b', - shape=[dim], - **bias_init[1] - ) - else: - weight = core.ScopedBlobReference( - blob_out + '_w', model.param_init_net) - bias = core.ScopedBlobReference( - blob_out + '_b', model.param_init_net) - - model.AddParameter(weight, ParameterTags.WEIGHT) - 
model.AddParameter(bias, ParameterTags.BIAS) - return op_call([blob_in, weight, bias], blob_out, **kwargs) - - -def elementwise_linear(model, *args, **kwargs): - return _elementwise_linear( - model, model.net.ElementwiseLinear, *args, **kwargs) diff --git a/caffe2/python/helpers/fc.py b/caffe2/python/helpers/fc.py deleted file mode 100644 index 0feb2b65745e..000000000000 --- a/caffe2/python/helpers/fc.py +++ /dev/null @@ -1,197 +0,0 @@ -## @package fc -# Module caffe2.python.helpers.fc - - - - - -from caffe2.python import core -from caffe2.python.modeling import initializers -from caffe2.python.modeling.parameter_info import ParameterTags - - -def _FC_or_packed_FC( - model, op_call, blob_in, blob_out, dim_in, dim_out, weight_init=None, - bias_init=None, WeightInitializer=None, BiasInitializer=None, - enable_tensor_core=False, float16_compute=False, **kwargs -): - WeightInitializer = initializers.update_initializer( - WeightInitializer, weight_init, ("XavierFill", {}) - ) - BiasInitializer = initializers.update_initializer( - BiasInitializer, bias_init, ("ConstantFill", {}) - ) - if not model.init_params: - WeightInitializer = initializers.ExternalInitializer() - BiasInitializer = initializers.ExternalInitializer() - - blob_out = blob_out or model.net.NextName() - bias_tags = [ParameterTags.BIAS] - if 'freeze_bias' in kwargs: - bias_tags.append(ParameterTags.COMPUTED_PARAM) - - weight = model.create_param( - param_name=blob_out + '_w', - shape=[dim_out, dim_in], - initializer=WeightInitializer, - tags=ParameterTags.WEIGHT - ) - bias = model.create_param( - param_name=blob_out + '_b', - shape=[dim_out, ], - initializer=BiasInitializer, - tags=bias_tags - ) - - # enable TensorCore by setting appropriate engine - if enable_tensor_core: - kwargs['engine'] = 'TENSORCORE' - - # Enable float 16 compute kernel (relevant for CUDA) - if float16_compute: - kwargs['float16_compute'] = True - - return op_call([blob_in, weight, bias], blob_out, **kwargs) - - -def fc(model, *args, **kwargs): - return _FC_or_packed_FC(model, model.net.FC, *args, **kwargs) - - -def packed_fc(model, *args, **kwargs): - return _FC_or_packed_FC(model, model.net.PackedFC, *args, **kwargs) - - -def fc_decomp( - model, blob_in, blob_out, dim_in, dim_out, - rank_approx=5, weight_init=None, bias_init=None, - WeightInitializer=None, BiasInitializer=None, **kwargs -): - """FC_Decomp version - Here we assume that the rank of original input is bigger than 5. - """ - WeightInitializer = initializers.update_initializer( - WeightInitializer, weight_init, ("XavierFill", {}) - ) - BiasInitializer = initializers.update_initializer( - BiasInitializer, bias_init, ("ConstantFill", {}) - ) - blob_out = blob_out or model.net.NextName() - u = model.create_param( - param_name=blob_out + '_u', - shape=[dim_out, rank_approx], - initializer=WeightInitializer, - ) - v = model.create_param( - param_name=blob_out + '_v', - shape=[dim_in, rank_approx], - initializer=WeightInitializer, - ) - bias = model.create_param( - param_name=blob_out + '_b', - shape=[dim_out, ], - initializer=BiasInitializer, - ) - return model.net.FC_Decomp([blob_in, u, v, bias], blob_out, **kwargs) - - -def fc_prune( - model, blob_in, blob_out, dim_in, dim_out, - weight_init=None, bias_init=None, mask_init=None, - threshold=0.00001, need_compress_rate=False, - comp_lb=0.05, - **kwargs -): - """FC_Prune version - Runnable so far. 
Great!:) - """ - weight_init = weight_init if weight_init else ('XavierFill', {}) - bias_init = bias_init if bias_init else ('ConstantFill', {}) - mask_init = mask_init if mask_init else ('ConstantFill', {}) - blob_out = blob_out or model.net.NextName() - compress_rate = blob_out + '_compress_rate' - if model.init_params: - compress_lb = model.param_init_net.ConstantFill( - [], - blob_out + '_lb', - shape=[1], - value=comp_lb - ) - weight = model.param_init_net.__getattr__(weight_init[0])( - [], - blob_out + '_w', - shape=[dim_out, dim_in], - **weight_init[1] - ) - mask = model.param_init_net.ConstantFill( - [], - blob_out + '_m', - shape=[dim_out, dim_in], - value=1.0 - ) - ag_dw = model.param_init_net.__getattr__(mask_init[0])( - [], - blob_out + '_ag_dw', - shape=[dim_out, dim_in], - **mask_init[1] - ) - bias = model.param_init_net.__getattr__(bias_init[0])( - [], - blob_out + '_b', - shape=[dim_out, ], - **bias_init[1] - ) - mask_seq = model.param_init_net.__getattr__(mask_init[0])( - [], - blob_out + '_mask_seq', - shape=[dim_out, dim_in], - **mask_init[1] - ) - thres = model.param_init_net.ConstantFill( - [], - blob_out + '_thres', - shape=[1], - value=threshold - ) - else: - compress_lb = core.ScopedBlobReference( - blob_out + '_lb', model.param_init_net) - weight = core.ScopedBlobReference( - blob_out + '_w', model.param_init_net) - bias = core.ScopedBlobReference( - blob_out + '_b', model.param_init_net) - mask = core.ScopedBlobReference( - blob_out + '_m', model.param_init_net) - ag_dw = core.ScopedBlobReference( - blob_out + '_ag_dw', model.param_init_net) - mask_seq = core.ScopedBlobReference( - blob_out + '_mask_seq', model.param_init_net) - thres = core.ScopedBlobReference( - blob_out + '_thres', model.param_init_net) - - model.AddParameter(weight) - model.AddParameter(bias) - if need_compress_rate: - return model.net.FC_Prune([blob_in, weight, mask, bias, ag_dw, mask_seq, - thres, compress_lb], - [blob_out, compress_rate], **kwargs) - else: - return model.net.FC_Prune([blob_in, weight, mask, - bias, ag_dw, mask_seq, - thres, compress_lb], - blob_out, **kwargs) - - -def fc_sparse( - model, blob_in, blob_out, w_csr, iw, jw, bias, - **kwargs -): - """FC_Sparse: Only takes in allocated weights""" - if not (w_csr and iw and jw and bias): - print("Warning...") - model.AddParameter(w_csr) - model.AddParameter(iw) - model.AddParameter(jw) - model.AddParameter(bias) - return model.net.FC_Sparse([blob_in, w_csr, iw, jw, bias], - blob_out, **kwargs) diff --git a/caffe2/python/helpers/nonlinearity.py b/caffe2/python/helpers/nonlinearity.py deleted file mode 100644 index 3a8be3bb056a..000000000000 --- a/caffe2/python/helpers/nonlinearity.py +++ /dev/null @@ -1,43 +0,0 @@ -## @package nonlinearity -# Module caffe2.python.helpers.nonlinearity - - - - - -from caffe2.python import core - - -def prelu(model, blob_in, blob_out, num_channels=1, slope_init=None, - **kwargs): - """PRelu""" - slope_init = ( - slope_init if slope_init else ('ConstantFill', {'value': 0.25})) - if model.init_params: - slope = model.param_init_net.__getattr__(slope_init[0])( - [], - blob_out + '_slope', - shape=[num_channels], - **slope_init[1] - ) - else: - slope = core.ScopedBlobReference( - blob_out + '_slope', model.param_init_net) - - model.AddParameter(slope) - - return model.net.PRelu([blob_in, slope], [blob_out]) - - -def relu(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): - """Relu.""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - return model.net.Relu(blob_in, blob_out, order=order, 
**kwargs) - - -def tanh(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): - """Tanh.""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - return model.net.Tanh(blob_in, blob_out, order=order, **kwargs) diff --git a/caffe2/python/helpers/normalization.py b/caffe2/python/helpers/normalization.py deleted file mode 100644 index b13b43f6859a..000000000000 --- a/caffe2/python/helpers/normalization.py +++ /dev/null @@ -1,322 +0,0 @@ -## @package normalization -# Module caffe2.python.helpers.normalization - - - - - -from caffe2.python import scope -from caffe2.python.modeling.parameter_info import ParameterTags -from caffe2.proto import caffe2_pb2 -from caffe2.python.modeling import initializers - - -def lrn(model, blob_in, blob_out, order="NCHW", use_cudnn=False, **kwargs): - """LRN""" - dev = kwargs['device_option'] if 'device_option' in kwargs \ - else scope.CurrentDeviceScope() - is_cpu = dev is None or dev.device_type == caffe2_pb2.CPU - if use_cudnn and (not is_cpu): - kwargs['engine'] = 'CUDNN' - blobs_out = blob_out - else: - blobs_out = [blob_out, "_" + blob_out + "_scale"] - lrn = model.net.LRN( - blob_in, - blobs_out, - order=order, - **kwargs - ) - - if use_cudnn and (not is_cpu): - return lrn - else: - return lrn[0] - - -def softmax(model, blob_in, blob_out=None, use_cudnn=False, **kwargs): - """Softmax.""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - if blob_out is not None: - return model.net.Softmax(blob_in, blob_out, **kwargs) - else: - return model.net.Softmax(blob_in, **kwargs) - - -def instance_norm(model, blob_in, blob_out, dim_in, order="NCHW", **kwargs): - blob_out = blob_out or model.net.NextName() - # Input: input, scale, bias - # Output: output, saved_mean, saved_inv_std - # scale: initialize with ones - # bias: initialize with zeros - - def init_blob(value, suffix): - return model.param_init_net.ConstantFill( - [], blob_out + "_" + suffix, shape=[dim_in], value=value) - scale, bias = init_blob(1.0, "s"), init_blob(0.0, "b") - - model.AddParameter(scale, ParameterTags.WEIGHT) - model.AddParameter(bias, ParameterTags.BIAS) - blob_outs = [blob_out, blob_out + "_sm", blob_out + "_siv"] - if 'is_test' in kwargs and kwargs['is_test']: - blob_outputs = model.net.InstanceNorm( - [blob_in, scale, bias], [blob_out], - order=order, **kwargs) - return blob_outputs - else: - blob_outputs = model.net.InstanceNorm( - [blob_in, scale, bias], blob_outs, - order=order, **kwargs) - # Return the output - return blob_outputs[0] - - -def spatial_bn(model, blob_in, blob_out, dim_in, - init_scale=1., init_bias=0., - ScaleInitializer=None, BiasInitializer=None, - RunningMeanInitializer=None, RunningVarianceInitializer=None, - order="NCHW", **kwargs): - blob_out = blob_out or model.net.NextName() - # Input: input, scale, bias, est_mean, est_inv_var - # Output: output, running_mean, running_inv_var, saved_mean, - # saved_inv_var - # scale: initialize with init_scale (default 1.) - # bias: initialize with init_bias (default 0.) 
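For reference, the test-mode computation SpatialBN performs per channel, as a numpy sketch. The running second-moment blob is treated as a variance here; caffe2 has historically stored it under an inverse-variance name, so take that convention as an assumption:

import numpy as np

def spatial_bn_ref(x, scale, bias, mean, var, eps=1e-5):
    # x: (N, C, H, W); scale, bias, mean, var: (C,)
    inv_std = 1.0 / np.sqrt(var + eps)
    return (x - mean[None, :, None, None]) * (scale * inv_std)[None, :, None, None] \
        + bias[None, :, None, None]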
- # est mean: zero - # est var: ones - - if model.init_params: - scale_init = ("ConstantFill", {'value': init_scale}) - bias_init = ("ConstantFill", {'value': init_bias}) - rm_init = ("ConstantFill", {'value': 0.0}) - riv_init = ("ConstantFill", {'value': 1.0}) - - ScaleInitializer = initializers.update_initializer( - ScaleInitializer, scale_init, ("ConstantFill", {}) - ) - BiasInitializer = initializers.update_initializer( - BiasInitializer, bias_init, ("ConstantFill", {}) - ) - RunningMeanInitializer = initializers.update_initializer( - RunningMeanInitializer, rm_init, ("ConstantFill", {}) - ) - RunningVarianceInitializer = initializers.update_initializer( - RunningVarianceInitializer, riv_init, ("ConstantFill", {}) - ) - else: - ScaleInitializer = initializers.ExternalInitializer() - BiasInitializer = initializers.ExternalInitializer() - RunningMeanInitializer = initializers.ExternalInitializer() - RunningVarianceInitializer = initializers.ExternalInitializer() - - scale = model.create_param( - param_name=blob_out + '_s', - shape=[dim_in], - initializer=ScaleInitializer, - tags=ParameterTags.WEIGHT - ) - - bias = model.create_param( - param_name=blob_out + '_b', - shape=[dim_in], - initializer=BiasInitializer, - tags=ParameterTags.BIAS - ) - - running_mean = model.create_param( - param_name=blob_out + '_rm', - shape=[dim_in], - initializer=RunningMeanInitializer, - tags=ParameterTags.COMPUTED_PARAM - ) - - running_inv_var = model.create_param( - param_name=blob_out + '_riv', - shape=[dim_in], - initializer=RunningVarianceInitializer, - tags=ParameterTags.COMPUTED_PARAM - ) - - blob_outs = [blob_out, running_mean, running_inv_var, - blob_out + "_sm", blob_out + "_siv"] - if 'is_test' in kwargs and kwargs['is_test']: - blob_outputs = model.net.SpatialBN( - [blob_in, scale, bias, blob_outs[1], blob_outs[2]], [blob_out], - order=order, **kwargs) - return blob_outputs - else: - blob_outputs = model.net.SpatialBN( - [blob_in, scale, bias, blob_outs[1], blob_outs[2]], blob_outs, - order=order, **kwargs) - # Return the output - return blob_outputs[0] - - -def spatial_gn(model, blob_in, blob_out, dim_in, - init_scale=1., init_bias=0., - ScaleInitializer=None, BiasInitializer=None, - RunningMeanInitializer=None, RunningVarianceInitializer=None, - order="NCHW", **kwargs): - ''' - Group normalizes the input, cf. https://arxiv.org/abs/1803.08494. - ''' - - blob_out = blob_out or model.net.NextName() - # Input: input, scale, bias - # Output: output, group_mean, group_inv_std - # scale: initialize with init_scale (default 1.) - # [recommendation: set init_scale = 0. in the last layer for each res block] - # bias: initialize with init_bias (default 0.) 
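The computation GroupNorm performs, as a numpy sketch; in the helper above the group count arrives via **kwargs, so `num_groups` here is an assumed name:

import numpy as np

def group_norm_ref(x, scale, bias, num_groups, eps=1e-5):
    # x: (N, C, H, W); statistics are taken per sample, per group of channels
    N, C, H, W = x.shape
    g = x.reshape(N, num_groups, C // num_groups, H, W)
    mean = g.mean(axis=(2, 3, 4), keepdims=True)
    var = g.var(axis=(2, 3, 4), keepdims=True)
    y = ((g - mean) / np.sqrt(var + eps)).reshape(N, C, H, W)
    return y * scale[None, :, None, None] + bias[None, :, None, None]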
- - if model.init_params: - scale_init = ("ConstantFill", {'value': init_scale}) - bias_init = ("ConstantFill", {'value': init_bias}) - - ScaleInitializer = initializers.update_initializer( - ScaleInitializer, scale_init, ("ConstantFill", {}) - ) - BiasInitializer = initializers.update_initializer( - BiasInitializer, bias_init, ("ConstantFill", {}) - ) - else: - ScaleInitializer = initializers.ExternalInitializer() - BiasInitializer = initializers.ExternalInitializer() - - scale = model.create_param( - param_name=blob_out + '_s', - shape=[dim_in], - initializer=ScaleInitializer, - tags=ParameterTags.WEIGHT - ) - - bias = model.create_param( - param_name=blob_out + '_b', - shape=[dim_in], - initializer=BiasInitializer, - tags=ParameterTags.BIAS - ) - - blob_outs = [blob_out, - blob_out + "_mean", blob_out + "_std"] - - blob_outputs = model.net.GroupNorm( - [blob_in, scale, bias], - blob_outs, - **kwargs) - # Return the output - return blob_outputs[0] - - -def layer_norm( - model, - blob_in, - blob_out, - dim_in, - axis=1, - epsilon=1e-4, - initial_scale=1.0, - initial_bias=0.0, -): - ''' - Layer normalizes the input, cf. https://arxiv.org/pdf/1607.06450.pdf. - - Args: - blob_in: The input blob to layer normalize. - blob_out: The layer normalized output blob. - dim_in: The dimension of the scale and bias. For example, if blob_in is - a 2D design matrix and axis is 1, this would be the number of - columns. - axis: (optional) The axis to normalize. Typically the feature axis. - Defaults to 1. - epsilon: (optional) A small value used for numerical stability in - calculation. Defaults to 1e-4. - initial_scale: (optional) The initial value for the learned scale - parameter. Defaults to 1.0. - initial_bias: (optional) The initial value for the learned bias - parameter applied after normalization. Defaults to 0.0. - - Returns: - A 3-tuple consisting of: - - The layer normalized input blob. - - The mean of the input blob across the given axis. - - The standard deviation of the input blob across the given axis. - ''' - - # The learned multiplicative scale or "gain". - scale = model.create_param( - param_name='{}_scale'.format(blob_out), - shape=[dim_in] if isinstance(dim_in, int) else dim_in, - initializer=initializers.Initializer( - 'ConstantFill', - value=initial_scale, - ), - tags=ParameterTags.WEIGHT, - ) - - # The learned additive bias or "shift".
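What the LayerNorm op computes, matching the helper's 3-tuple of outputs, as a numpy sketch; normalization runs over every dimension from `axis` onward:

import numpy as np

def layer_norm_ref(x, scale, bias, axis=1, eps=1e-4):
    dims = tuple(range(axis, x.ndim))
    mean = x.mean(axis=dims, keepdims=True)
    std = np.sqrt(x.var(axis=dims, keepdims=True) + eps)
    return (x - mean) / std * scale + bias, mean, std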
- bias = model.create_param( - param_name='{}_bias'.format(blob_out), - shape=[dim_in] if isinstance(dim_in, int) else dim_in, - initializer=initializers.Initializer( - 'ConstantFill', - value=initial_bias, - ), - tags=ParameterTags.BIAS, - ) - - normalized, mean, std = model.net.LayerNorm( - [blob_in, scale, bias], - [blob_out, blob_out + "_mean", blob_out + "_std"], - axis=axis, - epsilon=epsilon, - elementwise_affine=True, - ) - - return normalized, mean, std - -def moments_with_running_stats(model, blob_in, blob_out, dim_in, - RunningMeanInitializer=None, RunningVarianceInitializer=None, - order="NCHW", **kwargs): - - if model.init_params: - rm_init = ("ConstantFill", {'value': 0.0}) - riv_init = ("ConstantFill", {'value': 1.0}) - - RunningMeanInitializer = initializers.update_initializer( - RunningMeanInitializer, rm_init, ("ConstantFill", {}) - ) - RunningVarianceInitializer = initializers.update_initializer( - RunningVarianceInitializer, riv_init, ("ConstantFill", {}) - ) - else: - RunningMeanInitializer = initializers.ExternalInitializer() - RunningVarianceInitializer = initializers.ExternalInitializer() - - running_mean = model.create_param( - param_name=blob_out + '_rm', - shape=[dim_in], - initializer=RunningMeanInitializer, - tags=ParameterTags.COMPUTED_PARAM - ) - - # this is just running variance - running_inv_var = model.create_param( - param_name=blob_out + '_riv', - shape=[dim_in], - initializer=RunningVarianceInitializer, - tags=ParameterTags.COMPUTED_PARAM - ) - - blob_outs = [blob_out + "_sm", blob_out + "_sv"] - if order == 'NCHW': - blob_outputs = model.net.Moments( - [blob_in], blob_outs, - axes=[0, 2, 3], - order=order, keepdims=False, **kwargs) - elif order == 'NHWC': - blob_outputs = model.net.Moments( - [blob_in], blob_outs, - axes=[0, 1, 2], - order=order, keepdims=False, **kwargs) - return blob_outputs diff --git a/caffe2/python/helpers/pooling.py b/caffe2/python/helpers/pooling.py deleted file mode 100644 index 9e6fc784f289..000000000000 --- a/caffe2/python/helpers/pooling.py +++ /dev/null @@ -1,38 +0,0 @@ -## @package pooling -# Module caffe2.python.helpers.pooling -## @package fc -# Module caffe2.python.helpers.pooling - - - - - - -def max_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", **kwargs): - """Max pooling""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - return model.net.MaxPool(blob_in, blob_out, order=order, **kwargs) - - -def average_pool(model, blob_in, blob_out, use_cudnn=False, order="NCHW", - **kwargs): - """Average pooling""" - if use_cudnn: - kwargs['engine'] = 'CUDNN' - return model.net.AveragePool( - blob_in, - blob_out, - order=order, - **kwargs - ) - - -def max_pool_with_index(model, blob_in, blob_out, order="NCHW", **kwargs): - """Max pooling with an explicit index of max position""" - return model.net.MaxPoolWithIndex( - blob_in, - [blob_out, blob_out + "_index"], - order=order, - **kwargs - )[0] diff --git a/caffe2/python/helpers/quantization.py b/caffe2/python/helpers/quantization.py deleted file mode 100644 index 4e7a6da32436..000000000000 --- a/caffe2/python/helpers/quantization.py +++ /dev/null @@ -1,9 +0,0 @@ -# @package quantization -# Module caffe2.python.helpers.quantization - - -def fused_8bit_rowwise_quantized_to_float( - model, blob_in, blob_out -): - """Fused8BitRowwiseQuantizedToFloat""" - return model.net.Fused8BitRowwiseQuantizedToFloat(blob_in, blob_out) diff --git a/caffe2/python/helpers/tools.py b/caffe2/python/helpers/tools.py deleted file mode 100644 index 178620eab593..000000000000 --- 
a/caffe2/python/helpers/tools.py +++ /dev/null @@ -1,34 +0,0 @@ -## @package tools -# Module caffe2.python.helpers.tools - - - - - - -def image_input( - model, blob_in, blob_out, order="NCHW", use_gpu_transform=False, **kwargs -): - assert 'is_test' in kwargs, "Argument 'is_test' is required" - if order == "NCHW": - if use_gpu_transform: - kwargs['use_gpu_transform'] = 1 if use_gpu_transform else 0 - # GPU transform will handle NHWC -> NCHW - outputs = model.net.ImageInput(blob_in, blob_out, **kwargs) - else: - outputs = model.net.ImageInput( - blob_in, [blob_out[0] + '_nhwc'] + blob_out[1:], **kwargs - ) - outputs_list = list(outputs) - outputs_list[0] = model.net.NHWC2NCHW(outputs_list[0], blob_out[0]) - outputs = tuple(outputs_list) - else: - outputs = model.net.ImageInput(blob_in, blob_out, **kwargs) - return outputs - - -def video_input(model, blob_in, blob_out, **kwargs): - # size of outputs can vary depending on kwargs - outputs = model.net.VideoInput(blob_in, blob_out, **kwargs) - return outputs diff --git a/caffe2/python/helpers/train.py b/caffe2/python/helpers/train.py deleted file mode 100644 index 02883af7402d..000000000000 --- a/caffe2/python/helpers/train.py +++ /dev/null @@ -1,78 +0,0 @@ -## @package train -# Module caffe2.python.helpers.train - - - - - -from caffe2.python import core, scope -from caffe2.proto import caffe2_pb2 - - -def _get_weights(model, namescope=None): - if namescope is None: - namescope = scope.CurrentNameScope() - - if namescope == '': - return model.weights[:] - else: - return [w for w in model.weights if w.GetNameScope() == namescope] - - -def iter(model, blob_out, **kwargs): - if 'device_option' in kwargs: - del kwargs['device_option'] - model.param_init_net.ConstantFill( - [], - blob_out, - shape=[1], - value=0, - dtype=core.DataType.INT64, - device_option=core.DeviceOption(caffe2_pb2.CPU, 0), - **kwargs - ) - return model.net.Iter(blob_out, blob_out, **kwargs) - - -def accuracy(model, blob_in, blob_out, **kwargs): - dev = kwargs['device_option'] if 'device_option' in kwargs \ - else scope.CurrentDeviceScope() - is_cpu = dev is None or dev.device_type == caffe2_pb2.CPU - - # We support top_k > 1 only on CPU - if not is_cpu and 'top_k' in kwargs and kwargs['top_k'] > 1: - pred_host = model.net.CopyGPUToCPU(blob_in[0], blob_in[0] + "_host") - label_host = model.net.CopyGPUToCPU(blob_in[1], blob_in[1] + "_host") - - # Now use the Host version of the accuracy op - model.net.Accuracy( - [pred_host, label_host], - blob_out, - device_option=core.DeviceOption(caffe2_pb2.CPU, 0), - **kwargs - ) - else: - model.net.Accuracy(blob_in, blob_out, **kwargs) - - -def add_weight_decay(model, weight_decay): - """Adds a decay to weights in the model. - - This is a form of L2 regularization.
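In update form, the WeightedSum emitted below computes grad <- 1.0 * grad + weight_decay * param for every weight; a one-step numpy sketch:

import numpy as np

grad, param, wd = np.ones(3), np.full(3, 2.0), 0.1
grad = 1.0 * grad + wd * param   # what WeightedSum([grad, ONE, param, wd]) does
assert np.allclose(grad, 1.2)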
- - Args: - weight_decay: strength of the regularization - """ - if weight_decay <= 0.0: - return - wd = model.param_init_net.ConstantFill( - [], 'wd', shape=[1], value=weight_decay - ) - ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0) - for param in _get_weights(model): - # Equivalent to: grad += wd * param - grad = model.param_to_grad[param] - model.net.WeightedSum( - [grad, ONE, param, wd], - grad, - ) diff --git a/caffe2/python/hip_test_util.py b/caffe2/python/hip_test_util.py deleted file mode 100644 index beab3be1c40a..000000000000 --- a/caffe2/python/hip_test_util.py +++ /dev/null @@ -1,18 +0,0 @@ -## @package hip_test_util -# Module caffe2.python.hip_test_util -""" -The HIP test utils is a small addition on top of the hypothesis test utils -under caffe2/python, which allows one to more easily test HIP/ROCm related -operators. -""" - - - - - - -from caffe2.proto import caffe2_pb2 - -def run_in_hip(gc, dc): - return (gc.device_type == caffe2_pb2.HIP) or ( - caffe2_pb2.HIP in {d.device_type for d in dc}) diff --git a/caffe2/python/hsm_util.py b/caffe2/python/hsm_util.py deleted file mode 100644 index ec465c12240e..000000000000 --- a/caffe2/python/hsm_util.py +++ /dev/null @@ -1,70 +0,0 @@ -## @package hsm_util -# Module caffe2.python.hsm_util - - - - - -from caffe2.proto import hsm_pb2 - -''' - Hierarchical softmax utility methods that can be used to: - 1) create TreeProto structure given list of word_ids or NodeProtos - 2) create HierarchyProto structure using the user-inputted TreeProto -''' - - -def create_node_with_words(words, name='node'): - node = hsm_pb2.NodeProto() - node.name = name - for word in words: - node.word_ids.append(word) - return node - - -def create_node_with_nodes(nodes, name='node'): - node = hsm_pb2.NodeProto() - node.name = name - for child_node in nodes: - new_child_node = node.children.add() - new_child_node.MergeFrom(child_node) - return node - - -def create_hierarchy(tree_proto): - max_index = 0 - - def create_path(path, word): - path_proto = hsm_pb2.PathProto() - path_proto.word_id = word - for entry in path: - new_path_node = path_proto.path_nodes.add() - new_path_node.index = entry[0] - new_path_node.length = entry[1] - new_path_node.target = entry[2] - return path_proto - - def recursive_path_builder(node_proto, path, hierarchy_proto, max_index): - node_proto.offset = max_index - path.append([max_index, - len(node_proto.word_ids) + len(node_proto.children), 0]) - max_index += len(node_proto.word_ids) + len(node_proto.children) - if hierarchy_proto.size < max_index: - hierarchy_proto.size = max_index - for target, node in enumerate(node_proto.children): - path[-1][2] = target - max_index = recursive_path_builder(node, path, hierarchy_proto, - max_index) - for target, word in enumerate(node_proto.word_ids): - path[-1][2] = target + len(node_proto.children) - path_entry = create_path(path, word) - new_path_entry = hierarchy_proto.paths.add() - new_path_entry.MergeFrom(path_entry) - del path[-1] - return max_index - - node = tree_proto.root_node - hierarchy_proto = hsm_pb2.HierarchyProto() - path = [] - max_index = recursive_path_builder(node, path, hierarchy_proto, max_index) - return hierarchy_proto diff --git a/caffe2/python/hypothesis_test.py b/caffe2/python/hypothesis_test.py deleted file mode 100644 index 894e88623531..000000000000 --- a/caffe2/python/hypothesis_test.py +++ /dev/null @@ -1,2802 +0,0 @@ -import copy -import threading -import time -import unittest -from functools import partial, reduce - -import 
caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.proto import caffe2_pb2 - -from caffe2.python import core, dyndep, tt_core, workspace -from hypothesis import assume, given, HealthCheck, settings - -dyndep.InitOpsLibrary('@/caffe2/caffe2/fb/optimizers:sgd_simd_ops') - -if workspace.has_gpu_support: - # NOTE: During GPU stress tests, the number of workers exceeds the number - # of GPUs which results in flakiness from GPU contention. As a - # result, deadlines are not enforced on CUDA runs. - _hypothesis_settings = settings - - def settings(**kwargs): - if 'deadline' in kwargs: - kwargs['deadline'] = None - kwargs.setdefault('max_examples', 50) - - def wrapped(f): - return _hypothesis_settings(**kwargs)(f) - return wrapped - - -def sigmoid(x): - return 1.0 / (1.0 + np.exp(-x)) - - -@st.composite -def _tensor_and_prefix(draw, dtype, elements, min_dim=1, max_dim=4, **kwargs): - dims_ = draw( - st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim)) - extra_ = draw( - st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim)) - assume(len(dims_) + len(extra_) < max_dim) - return (draw(hu.arrays(dims_ + extra_, dtype, elements)), - draw(hu.arrays(extra_, dtype, elements))) - - -def _tensor_and_indices(min_dim=1, max_dim=4, dtype=np.float32, - elements=None, **kwargs): - """ generates a tensor and a list of indices of larger tensor of same dim""" - data_dims_ = st.lists(hu.dims(**kwargs), min_size=min_dim, max_size=max_dim) - original_dim = st.integers(min_value=2, max_value=10) - return st.tuples(data_dims_, original_dim).flatmap(lambda pair: st.tuples( - st.just(pair[1]), # original dimension - hu.arrays(pair[0], dtype, elements), # data tensor - hu.arrays(pair[0][0], dtype=np.int64, elements=st.integers( - min_value=0, max_value=pair[1] - 1)), - )) - - -_NUMPY_TYPE_TO_ENUM = { - np.float32: core.DataType.FLOAT, - np.int32: core.DataType.INT32, - bool: core.DataType.BOOL, - np.uint8: core.DataType.UINT8, - np.int8: core.DataType.INT8, - np.uint16: core.DataType.UINT16, - np.int16: core.DataType.INT16, - np.int64: core.DataType.INT64, - np.float64: core.DataType.DOUBLE, -} - - -def _dtypes(dtypes=None): - dtypes = dtypes if dtypes else [np.int32, np.int64, np.float32] - return st.sampled_from(dtypes) - - -def _test_binary(name, ref, filter_=None, gcs=hu.gcs, - test_gradient=False, allow_inplace=False, dtypes=_dtypes): - @given( - inputs=dtypes().flatmap( - lambda dtype: hu.tensors( - n=2, dtype=dtype, - elements=hu.elements_of_type(dtype, filter_=filter_))), - out=st.sampled_from(('Y', 'X1', 'X2') if allow_inplace else ('Y',)), - **gcs) - @settings( - max_examples=20, - deadline=None, - suppress_health_check=[HealthCheck.filter_too_much]) - def test_binary(self, inputs, out, gc, dc): - op = core.CreateOperator(name, ["X1", "X2"], [out]) - X1, X2 = inputs - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - # We only do gradient check with float32 types. 
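The float32 guard just below exists because the gradient checker uses central finite differences, which are meaningless on integer tensors and too coarse at lower precision; the underlying idea in a few self-contained lines:

import numpy as np

f = lambda x: (x ** 2).sum()
x = np.array([1.0, -2.0], dtype=np.float32)
eps = np.float32(1e-2)
num = np.array([(f(x + d) - f(x - d)) / (2 * eps)
                for d in eps * np.eye(2, dtype=np.float32)])
assert np.allclose(num, 2 * x, atol=1e-2)  # analytic gradient of sum(x^2) is 2x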
- if test_gradient and X1.dtype == np.float32: - self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) - self.assertReferenceChecks(gc, op, [X1, X2], ref) - - return test_binary - - -def _test_binary_broadcast(name, ref, filter_=None, - gcs=hu.gcs, allow_inplace=False, dtypes=_dtypes): - @given( - inputs=dtypes().flatmap(lambda dtype: _tensor_and_prefix( - dtype=dtype, - elements=hu.elements_of_type(dtype, filter_=filter_))), - in_place=(st.booleans() if allow_inplace else st.just(False)), - **gcs) - @settings( - max_examples=3, - deadline=100, - suppress_health_check=[HealthCheck.filter_too_much]) - def test_binary_broadcast(self, inputs, in_place, gc, dc): - op = core.CreateOperator( - name, ["X1", "X2"], ["X1" if in_place else "Y"], broadcast=1) - X1, X2 = inputs - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - - def cast_ref(x, y): - return (np.array(ref(x, y)[0], dtype=x.dtype), ) - - # gradient not implemented yet - # self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) - self.assertReferenceChecks(gc, op, [X1, X2], cast_ref) - - return test_binary_broadcast - - -class TestOperators(hu.HypothesisTestCase): - - def test_comparison_ops(self): - ops = {"LT": lambda x1, x2: [x1 < x2], - "LE": lambda x1, x2: [x1 <= x2], - "GT": lambda x1, x2: [x1 > x2], - "GE": lambda x1, x2: [x1 >= x2]} - for name, ref in ops.items(): - _test_binary(name, ref, gcs=hu.gcs_cpu_only)(self) - _test_binary_broadcast(name, ref, gcs=hu.gcs_cpu_only)(self) - - @given(inputs=hu.tensors(n=2), in_place=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_sum(self, inputs, in_place, gc, dc): - op = core.CreateOperator("Sum", ["X1", "X2"], - ["Y" if not in_place else "X1"]) - X1, X2 = inputs - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) - - @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_row_mul(self, inputs, gc, dc): - op = core.CreateOperator("RowMul", ["X1", "X2"], ["Y"]) - X1, Xtmp = inputs - X2 = Xtmp[:, 0] - - def ref(x, y): - ret = np.zeros(shape=x.shape, dtype=x.dtype) - for i in range(y.size): - ret[i, ] = x[i, ] * y[i] - return [ret] - - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - for i in range(2): - self.assertGradientChecks(gc, op, [X1, X2], i, [0]) - self.assertReferenceChecks(gc, op, [X1, X2], ref) - - @given(inputs=hu.tensors(n=2), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_max(self, inputs, gc, dc): - op = core.CreateOperator("Max", ["X1", "X2"], ["Y"]) - - X1, X2 = inputs - # Make X1 and X2 far from each other, since X1=X2 is not differentiable - # and the step size of gradient checker is 0.05 - X1[np.logical_and(X1 >= X2 - 0.05, X1 <= X2)] -= 0.05 - X1[np.logical_and(X1 <= X2 + 0.05, X1 >= X2)] += 0.05 - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - for i in range(2): - self.assertGradientChecks(gc, op, [X1, X2], i, [0]) - - def elementwise_max(X, Y): - return [np.maximum(X, Y)] - self.assertReferenceChecks(gc, op, [X1, X2], elementwise_max) - - def test_add(self): - def not_overflow(x): - if not isinstance(x, float): - return abs(x) < (1 << 30) - 1 - return True - - def ref(x, y): - return (x + y, ) - _test_binary("Add", ref, filter_=not_overflow, test_gradient=True)(self) - _test_binary_broadcast("Add", ref, filter_=not_overflow)(self) - - def test_sub(self): - def ref(x, y): - return (x - y, ) - # TODO(jiayq): enable gradient test when implemented. 
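The not_overflow filter in test_add above keeps integer inputs below (1 << 30) - 1 because the sum of two individually in-range int32 values can still wrap around; a numpy illustration:

import numpy as np

a = np.array([2 ** 30], dtype=np.int32)
assert (a + a)[0] == -(2 ** 31)  # 2**31 does not fit in int32 and wraps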
- _test_binary("Sub", ref, test_gradient=True)(self) - _test_binary_broadcast("Sub", ref)(self) - - def test_mul(self): - def not_overflow(x): - if not isinstance(x, float): - return abs(x) < (1 << 15) - 1 - return True - - def ref(x, y): - return (x * y, ) - _test_binary("Mul", ref, filter_=not_overflow, test_gradient=True)(self) - _test_binary_broadcast("Mul", ref, filter_=not_overflow)(self) - - @settings(suppress_health_check=[HealthCheck.too_slow]) - def test_div(self): - def ref(x, y): - return (x / y, ) - - def non_zero(x): - return abs(x) > 1e-2 - - def div_dtypes(): - return st.sampled_from([np.float32, np.float64]) - - _test_binary( - "Div", ref, filter_=non_zero, test_gradient=True, - dtypes=div_dtypes, gcs=hu.gcs_cpu_only - )(self) - _test_binary( - "Div", ref, filter_=non_zero, test_gradient=False, - dtypes=div_dtypes - )(self) - _test_binary_broadcast( - "Div", ref, filter_=non_zero, dtypes=div_dtypes)(self) - - @given(X=hu.tensor(), in_place=st.booleans(), **hu.gcs) - @settings(deadline=1000) - def test_negative(self, X, in_place, gc, dc): - op = core.CreateOperator("Negative", ["X"], - ["Y" if not in_place else "X"]) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(), **hu.gcs) - @settings(deadline=1000) - def test_tanh(self, X, gc, dc): - op = core.CreateOperator("Tanh", "X", "Y") - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(), **hu.gcs) - @settings(deadline=10000) - def test_averaged_loss(self, X, gc, dc): - op = core.CreateOperator("AveragedLoss", ["X"], ["loss"]) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_softsign(self, X, inplace, gc, dc): - op = core.CreateOperator("Softsign", ["X"], ["X" if inplace else "Y"]) - - def softsign(X): - return (X / (1 + np.abs(X)),) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertReferenceChecks(gc, op, [X], softsign) - if inplace: - with self.assertRaises(Exception): - self.assertGradientChecks(gc, op, [X], 0, [0]) - else: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given( - device_options=st.lists( - min_size=2, - max_size=4, - elements=st.sampled_from(hu.expanded_device_options)), - set_seed=st.booleans()) - @settings(deadline=10000) - def test_random_seed_behaviour(self, device_options, set_seed): - # Assume we are always operating on CUDA or CPU, since RNG is - # inconsistent between CPU and GPU. 
- device_options = copy.deepcopy(device_options) - assume(len({do.device_type for do in device_options}) == 1) - if set_seed: - for do in device_options: - do.random_seed = 1000 - - def run(do): - # Reset each time because 'Y' may already exist in the workspace - # on a different device - workspace.ResetWorkspace() - ws = workspace.C.Workspace() - op = core.CreateOperator( - "XavierFill", [], ["Y"], - device_option=do, - shape=[2]) - ws.run(op) - return ws.blobs["Y"].fetch() - - ys = [run(do) for do in device_options] - for y in ys[1:]: - if set_seed: - np.testing.assert_array_equal(ys[0], y) - else: - with self.assertRaises(AssertionError): - np.testing.assert_array_equal(ys[0], y) - - @given(axis=st.integers(min_value=1, max_value=4), - num_output=st.integers(min_value=4, max_value=8), - engine=st.sampled_from(["", "PACKED"]), - **hu.gcs) - @settings(deadline=10000) - def test_fully_connected_axis(self, axis, num_output, engine, gc, dc): - np.random.seed(1) - X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32) - - def prod(xs): - p = 1 - for x in xs: - p *= x - return p - - K = prod(list(X.shape)[axis:]) - N = num_output - W = np.random.randn(N, K).astype(np.float32) - b = np.random.randn(N).astype(np.float32) - - op = core.CreateOperator( - "FC", - ["X", "W", "b"], - ["Y"], - engine=engine, - axis=axis) - for name, param in [("X", X), ("W", W), ("b", b)]: - self.ws.create_blob(name).feed(param) - self.ws.run(op) - Y = self.ws.blobs["Y"].fetch() - self.assertEqual(list(Y.shape), list(X.shape)[:axis] + [N]) - - inputs = [X, W, b] - self.assertDeviceChecks(dc, op, inputs, [0]) - for param, _ in enumerate(inputs): - self.assertGradientChecks(gc, op, inputs, param, [0]) - - @unittest.skipIf(not workspace.has_gpu_support, - "Skipping test due to no gpu present.") - @settings(deadline=None) - @given(hidden_size=st.integers(min_value=1, max_value=3), - num_layers=st.integers(min_value=1, max_value=3), - bidirectional=st.booleans(), - rnn_mode=st.sampled_from(["lstm"]), # TODO: "gru" - input_mode=st.sampled_from(["linear"]), - dropout=hu.floats(min_value=1.0, max_value=1.0), - T=st.integers(min_value=2, max_value=6), - N=st.integers(min_value=1, max_value=4), - D=st.integers(min_value=1, max_value=4)) - def test_recurrent(self, hidden_size, num_layers, bidirectional, rnn_mode, - input_mode, dropout, T, N, D): - #there's a bug in miopen for N=1 which would be resolved in the next release. 
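A worked check of the weight-sizing arithmetic below for a unidirectional LSTM with hidden_size=3, D=2, num_layers=2; the leading 4 is the LSTM's gate count:

H, D, L = 3, 2, 2
first = H * D + H * H + H + H   # input weights, recurrent weights, two biases
upper = H * H + H * H + H + H   # upper layers consume the previous layer's H
total = 4 * (first + (L - 1) * upper)
assert total == 180             # 4 * 21 + 4 * 24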
- if workspace.has_hip_support: - assume(N>1) - # Random seed, this one happens to pass - seed = 1234 - np.random.seed(seed) - # set device option - if workspace.has_hip_support: - device_option = hu.hip_do - engine = 'MIOPEN' - else: - device_option = hu.gpu_do - engine = 'CUDNN' - input_weight_size = hidden_size * D - upper_layer_input_weight_size = hidden_size * hidden_size - if bidirectional: - upper_layer_input_weight_size *= 2 - recurrent_weight_size = hidden_size * hidden_size - input_bias_size = hidden_size - recurrent_bias_size = hidden_size - num_directions = 2 if bidirectional else 1 - first_layer_sz = input_weight_size + recurrent_weight_size + \ - input_bias_size + recurrent_bias_size - upper_layer_sz = upper_layer_input_weight_size + \ - recurrent_weight_size + input_bias_size + \ - recurrent_bias_size - total_sz = 4 * (first_layer_sz + (num_layers - 1) * upper_layer_sz) - total_sz *= num_directions - - W = np.random.rand(total_sz).astype(np.float32) - self.ws.create_blob("WEIGHT").feed(W, device_option=device_option) - - op = core.CreateOperator( - "Recurrent", - ["INPUT", "HIDDEN_INPUT", "CELL_INPUT", "WEIGHT"], - ["OUTPUT", "HIDDEN_OUTPUT", "CELL_OUTPUT", - "RNN_SCRATCH", "DROPOUT_STATES"], - hidden_size=hidden_size, - bidirectional=bidirectional, - rnn_mode=rnn_mode, - dropout=dropout, - input_mode=input_mode, - num_layers=num_layers, - seed=seed, - engine=engine) - X = np.random.randn(T, N, D).astype(np.float32) - self.ws.create_blob("INPUT").feed(X, device_option=device_option) - W = self.ws.blobs["WEIGHT"].fetch() - H = np.random.randn( - num_layers, N, hidden_size * num_directions).astype( - np.float32) - C = np.random.randn( - num_layers, N, hidden_size * num_directions).astype( - np.float32) if rnn_mode == "lstm" else \ - np.empty((1,)).astype(np.float32) # unused in GRU - inputs = [X, H, C, W] - input_idxs = [i for (i, _) in enumerate(inputs)] \ - if rnn_mode == "lstm" else [0, 1, 3] # ignore C - for input_idx in input_idxs: - self.assertGradientChecks( - device_option, op, inputs, input_idx, [0], - stepsize=0.01, threshold=0.01) - - @given(ndim=st.integers(1, 4), - axis=st.integers(0, 3), - add_axis=st.integers(0, 1), - num_inputs=st.integers(2, 4), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_depth_concat(self, ndim, axis, add_axis, num_inputs, gc, dc): - assume(axis < ndim) - input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs] - shape = [2, 3, 5, 7][:ndim] - individual_dims = [1, 2, 3, 4, 5][:num_inputs] - inputs = [] - for i in range(num_inputs): - if add_axis == 0: - # Sets a unique dim and create the input. 
- shape[axis] = individual_dims[i] - inputs.append(np.random.randn(*shape).astype(np.float32)) - op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"], - axis=axis, add_axis=add_axis) - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(num_inputs): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - # Reference - def depth_concat(*inputs): - inputs = list(inputs) - if add_axis: - for i in range(len(inputs)): - inputs[i] = np.expand_dims(inputs[i], axis) - input_dims = np.array([np.shape(x)[axis] for x in inputs]) - return [np.concatenate(inputs, axis=axis), input_dims] - - self.assertReferenceChecks(gc, op, inputs, depth_concat) - - @given(num_inputs=st.integers(2, 4), - order=st.sampled_from([("NCHW", 1), ("NHWC", 3)]), - **hu.gcs) - @settings(deadline=10000) - def test_depth_concat_with_order(self, num_inputs, order, gc, dc): - input_names = ['X0', 'X1', 'X2', 'X3'][:num_inputs] - shape = [2, 3, 5, 7] - individual_dims = [1, 2, 3, 4][:num_inputs] - inputs = [] - for i in range(num_inputs): - # Sets a unique dim and create the input. - shape[order[1]] = individual_dims[i] - inputs.append(np.random.rand(*shape).astype(np.float32)) - op = core.CreateOperator("Concat", input_names, ["Y", "Y_dims"], - order=order[0]) - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(num_inputs): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - # Reference - def depth_concat_with_order(*inputs): - inputs = list(inputs) - axis = order[1] - input_dims = np.array([np.shape(x)[axis] for x in inputs]) - return [np.concatenate(inputs, axis=axis), input_dims] - - self.assertReferenceChecks(gc, op, inputs, depth_concat_with_order) - - @given(X=hu.arrays(dims=[5, 2], - elements=hu.floats( - min_value=1.0, - max_value=10.0) - ), - **hu.gcs_cpu_only) - @settings(deadline=1000) - def test_last_n_windows(self, X, gc, dc): - workspace.FeedBlob('input', X) - workspace.FeedBlob('next', np.array(0, dtype=np.int32)) - workspace.CreateBlob('output') - collect_net = core.Net('collect_net') - collect_net.LastNWindowCollector( - ['output', 'next', 'input'], - ['output', 'next'], - num_to_collect=7, - ) - plan = core.Plan('collect_data') - plan.AddStep(core.execution_step('collect_data', - [collect_net], num_iter=2)) - workspace.RunPlan(plan) - output = workspace.FetchBlob('output') - inputs = workspace.FetchBlob('input') - new_output = np.zeros([7, inputs.shape[1]]) - for i in range(inputs.shape[0] * 2): - new_output[i % 7] = inputs[i % inputs.shape[0]] - import numpy.testing as npt - npt.assert_almost_equal(output, new_output, decimal=5) - - @given(dtype=st.sampled_from([np.float32, np.float64, np.int32, bool])) - @settings(deadline=1000) - def test_print(self, dtype): - data = np.random.permutation(6).astype(dtype) - self.ws.create_blob("data").feed(data) - op = core.CreateOperator("Print", "data", []) - self.ws.run(op) - - @given(inputs=hu.tensors(n=2), - in_place=st.booleans(), - momentum=hu.floats(min_value=0.1, max_value=0.9), - nesterov=st.booleans(), - lr=hu.floats(min_value=0.1, max_value=0.9), - **hu.gcs) - @settings(deadline=10000) - def test_momentum_sgd( - self, inputs, in_place, momentum, nesterov, lr, gc, dc): - grad, m = inputs - lr = np.asarray([lr], dtype=np.float32) - op = core.CreateOperator( - "MomentumSGD", - ["grad", "m", "lr"], - ["grad" if in_place else "grad_o", - "m" if in_place else "m_o"], - momentum=momentum, - nesterov=int(nesterov), - device_option=gc) - self.assertDeviceChecks( - dc, op, [grad, m, lr], [0]) - - # Reference - def momentum_sgd(grad, m, 
lr): - lr = lr[0] - if not nesterov: - adjusted_gradient = lr * grad + momentum * m - return (adjusted_gradient, adjusted_gradient) - else: - m_new = momentum * m + lr * grad - return ((1 + momentum) * m_new - momentum * m, m_new) - - self.assertReferenceChecks(gc, op, [grad, m, lr], momentum_sgd) - - @given(inputs=hu.tensors(n=3), - in_place=st.booleans(), - decay=hu.floats(min_value=0.1, max_value=0.9), - momentum=hu.floats(min_value=0.1, max_value=0.9), - lr=hu.floats(min_value=0.1, max_value=0.9), - epsilon=hu.floats(min_value=1e-5, max_value=1e-2), - **hu.gcs) - @settings(deadline=10000) - def test_rmsprop_sgd(self, inputs, in_place, decay, momentum, lr, epsilon, - gc, dc): - grad, ms, mom = inputs - ms = np.abs(ms) + 0.01 - lr = np.asarray([lr], dtype=np.float32) - op = core.CreateOperator( - "RmsProp", - ["grad", "ms", "mom", "lr"], - ["grad" if in_place else "grad_o", - "ms" if in_place else "ms_o", - "mom" if in_place else "mom_o"], - momentum=momentum, decay=decay, epsilon=epsilon, device_option=gc) - self.assertDeviceChecks(dc, op, [grad, ms, mom, lr], [0]) - - def rmsprop(grad, ms, mom, lr): - lr = lr[0] - ms_o = ms + (1. - decay) * (np.square(grad) - ms) - mom_o = momentum * mom + lr * grad / np.sqrt(epsilon + ms_o) - grad_o = mom_o - return (grad_o, ms_o, mom_o) - self.assertReferenceChecks(gc, op, [grad, ms, mom, lr], rmsprop) - - # Reference - @staticmethod - def _dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, g): - if isinstance(alpha, np.ndarray): - alpha = alpha.item() - n = np.take(nz, 0, axis=-1) - z = np.take(nz, 1, axis=-1) - # python port of Sigrid's implementation - g2 = g * g - sigma = (np.sqrt(n + g2) - np.sqrt(n)) / alpha - z += g - sigma * w - n += g2 - w = (np.sign(z) * lambda1 - z) / ( - (beta + np.sqrt(n)) / alpha + lambda2) - w[np.abs(z) <= lambda1] = 0 - return (w, np.stack([n, z], axis=-1)) - - @given(inputs=hu.tensors(n=4), - in_place=st.booleans(), - alpha=hu.floats(min_value=0.01, max_value=0.1), - beta=hu.floats(min_value=0.1, max_value=0.9), - lambda1=hu.floats(min_value=0.001, max_value=0.1), - lambda2=hu.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), - **hu.gcs_cpu_only) - @settings(deadline=1000) - def test_ftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, - engine, gc, dc): - var, n, z, grad = inputs - n = np.abs(n) - nz = np.stack([n, z], axis=-1) - op = core.CreateOperator( - "Ftrl", - ["var", "nz", "grad"], - ["var" if in_place else "var_o", - "nz" if in_place else "nz_o"], - alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, - engine=engine, - device_option=gc) - self.assertDeviceChecks( - dc, op, [var, nz, grad], [0]) - - self.assertReferenceChecks( - gc, op, [var, nz, grad], - partial(self._dense_ftrl, alpha, beta, lambda1, lambda2)) - - # Reference - @staticmethod - def _dense_gftrl(alpha, beta, lambda1, lambda2, w, nz, g): - if isinstance(alpha, np.ndarray): - alpha = alpha.item() - - old_shape = g.shape - - n = np.take(nz, 0, axis=-1) - z = np.take(nz, 1, axis=-1) - - output_dim = g.shape[0] - - w = w.reshape(output_dim, -1) - g = g.reshape(output_dim, -1) - - n = n.reshape(output_dim, -1) - z = z.reshape(output_dim, -1) - - input_dim = g.shape[1] - - g2 = g * g - sigma = (np.sqrt(n + g2) - np.sqrt(n)) / alpha - z += g - sigma * w - n += g2 - - z_norms = np.linalg.norm(z, 2, axis=0) - - z_norms = z_norms + 1e-6 - w = z * ((lambda1 * np.sqrt(output_dim)) / z_norms - 1) / \ - ((beta + np.sqrt(n)) / alpha + lambda2) - for i in range(input_dim): - if z_norms[i] <= lambda1 * 
np.sqrt(output_dim): - w[:, i] = 0 - - w = w.reshape(old_shape) - n = n.reshape(old_shape) - z = z.reshape(old_shape) - return (w, np.stack([n, z], axis=-1)) - - @given(inputs=hu.tensors(n=4), - in_place=st.booleans(), - alpha=hu.floats(min_value=0.01, max_value=0.1), - beta=hu.floats(min_value=0.1, max_value=0.9), - lambda1=hu.floats(min_value=0.001, max_value=0.1), - lambda2=hu.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_gftrl_sgd(self, inputs, in_place, alpha, beta, lambda1, lambda2, - engine, gc, dc): - var, n, z, grad = inputs - n = np.abs(n) - nz = np.stack([n, z], axis=-1) - op = core.CreateOperator( - "GFtrl", - ["var", "nz", "grad"], - ["var" if in_place else "var_o", - "nz" if in_place else "nz_o"], - alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, - engine=engine, - device_option=gc) - self.assertDeviceChecks( - dc, op, [var, nz, grad], [0]) - - self.assertReferenceChecks( - gc, op, [var, nz, grad], - partial(self._dense_gftrl, alpha, beta, lambda1, lambda2)) - - @given(inputs=hu.tensors(n=4), - alpha=hu.floats(min_value=0.01, max_value=0.1), - beta=hu.floats(min_value=0.1, max_value=0.9), - lambda1=hu.floats(min_value=0.001, max_value=0.1), - lambda2=hu.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_ftrl_sgd(self, inputs, alpha, beta, lambda1, lambda2, - engine, gc, dc): - var, n, z, grad = inputs - # generate fake subset manually because hypothesis is too complicated :) - indices = np.arange(var.shape[0]) - indices = indices[indices % 2 == 0] - grad = grad[indices] - n = np.abs(n) - nz = np.stack([n, z], axis=-1) - op = core.CreateOperator( - "SparseFtrl", - ["var", "nz", "indices", "grad"], - ["var", "nz"], - alpha=alpha, beta=beta, lambda1=lambda1, lambda2=lambda2, - engine=engine, - device_option=gc) - self.assertDeviceChecks( - dc, op, [var, nz, indices, grad], [0]) - - # Reference - def ftrl(w, nz, i, g): - sw, snz = self._dense_ftrl(alpha, beta, lambda1, lambda2, - w[i], nz[i], g) - w[i] = sw - nz[i] = snz - return (w, nz) - - self.assertReferenceChecks(gc, op, [var, nz, indices, grad], ftrl) - - # Reference - @staticmethod - def _dense_ftrl_send_alpha_by_input(beta, lambda1, lambda2, w, nz, g, alpha): - return TestOperators._dense_ftrl(alpha, beta, lambda1, lambda2, w, nz, - g) - - @given(inputs=hu.tensors(n=4), - in_place=st.booleans(), - alpha=hu.floats(min_value=0.01, max_value=0.1), - beta=hu.floats(min_value=0.1, max_value=0.9), - lambda1=hu.floats(min_value=0.001, max_value=0.1), - lambda2=hu.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_ftrl_sgd_send_alpha_by_input(self, inputs, in_place, alpha, beta, - lambda1, lambda2, engine, gc, dc): - var, n, z, grad = inputs - n = np.abs(n) - nz = np.stack([n, z], axis=-1) - alpha = np.array(alpha).astype(np.float32) - op = core.CreateOperator( - "Ftrl", - ["var", "nz", "grad", "alpha"], - ["var" if in_place else "var_o", - "nz" if in_place else "nz_o"], - beta=beta, lambda1=lambda1, lambda2=lambda2, - engine=engine, - device_option=gc) - self.assertDeviceChecks( - dc, op, [var, nz, grad, alpha], [0]) - - self.assertReferenceChecks( - gc, op, [var, nz, grad, alpha], - partial(self._dense_ftrl_send_alpha_by_input, beta, lambda1, lambda2)) - - @given(inputs=hu.tensors(n=4), - alpha=hu.floats(min_value=0.01, max_value=0.1), - 
beta=hu.floats(min_value=0.1, max_value=0.9), - lambda1=hu.floats(min_value=0.001, max_value=0.1), - lambda2=hu.floats(min_value=0.001, max_value=0.1), - engine=st.sampled_from([None, "SIMD"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_ftrl_sgd_send_alpha_by_input(self, inputs, alpha, beta, - lambda1, lambda2, engine, gc, - dc): - var, n, z, grad = inputs - # generate fake subset manually because hypothesis is too complicated :) - indices = np.arange(var.shape[0]) - indices = indices[indices % 2 == 0] - grad = grad[indices] - n = np.abs(n) - nz = np.stack([n, z], axis=-1) - alpha = np.array(alpha).astype(np.float32) - op = core.CreateOperator( - "SparseFtrl", - ["var", "nz", "indices", "grad", "alpha"], - ["var", "nz"], - beta=beta, lambda1=lambda1, lambda2=lambda2, - engine=engine, - device_option=gc) - self.assertDeviceChecks( - dc, op, [var, nz, indices, grad, alpha], [0]) - - # Reference - def ftrl(w, nz, i, g, alpha): - sw, snz = self._dense_ftrl_send_alpha_by_input(beta, lambda1, - lambda2, w[i], nz[i], - g, alpha) - w[i] = sw - nz[i] = snz - return (w, nz) - - self.assertReferenceChecks(gc, op, [var, nz, indices, grad, alpha], - ftrl) - - @given(input=hu.tensor(max_value=20, - max_dim=1, - dtype=np.int32, - elements=st.integers(min_value=0, max_value=10)), - with_remapping=st.booleans(), - **hu.gcs_no_hip) - @settings(deadline=10000) - def test_unique(self, input, with_remapping, gc, dc): - op = core.CreateOperator( - "Unique", - ["input"], - ["unique"] + (["remapping"] if with_remapping else []), - device_option=gc) - self.assertDeviceChecks(dc, op, [input], [0]) - - # Validator - def unique_valid(input, unique, remapping=None): - self.assertEqual(unique.size, len(set(input))) - self.assertEqual(sorted(unique), sorted(set(input))) - if with_remapping: - self.assertEqual(remapping.shape, input.shape) - remapped = [unique[remapping[i]] for i in range(len(input))] - np.testing.assert_array_equal(remapped, input) - - self.assertValidationChecks(gc, op, [input], unique_valid) - - @given(prediction=hu.arrays(dims=[10, 3], - elements=hu.floats(allow_nan=False, - allow_infinity=False, - min_value=0, - max_value=1)), - labels=hu.arrays(dims=[10], - dtype=np.int32, - elements=st.integers(min_value=0, - max_value=3 - 1)), - top_k=st.integers(min_value=1, max_value=3), - **hu.gcs) - @settings(deadline=1000) - def test_accuracy(self, prediction, labels, top_k, gc, dc): - if(top_k > 1): - gc = hu.cpu_do - - op = core.CreateOperator( - "Accuracy", - ["prediction", "labels"], - ["accuracy"], - top_k=top_k, - device_option=gc - ) - - def op_ref(prediction, labels, top_k): - N = prediction.shape[0] - correct = 0 - for i in range(0, len(prediction)): - pred_sorted = sorted( - ([item, j] for j, item in enumerate(prediction[i])), - key=lambda x: x[0], - reverse=True - ) - max_ids = [x[1] for x in pred_sorted[0:top_k]] - for m in max_ids: - if m == labels[i]: - correct += 1 - accuracy = correct / N - return (accuracy,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[prediction, labels, top_k], - reference=op_ref) - - @given(target_probabilities=hu.arrays( - dims=[10], elements=hu.floats(allow_nan=False, - allow_infinity=False, - min_value=0.01, - max_value=1)), - **hu.gcs) - @settings(deadline=1000) - def test_perplexity(self, target_probabilities, gc, dc): - op = core.CreateOperator( - "Perplexity", - ["target_probabilities"], - ["perplexity"] - ) - - def op_ref(target_probabilities): - N = target_probabilities.shape[0] - perplexities = 
np.power(target_probabilities, -1.0 / N) - perplexity = reduce(lambda x, y: x * y, perplexities) - return (perplexity,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[target_probabilities], - reference=op_ref) - - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_to_segment_ids(self, lengths, gc, dc): - op = core.CreateOperator( - "LengthsToSegmentIds", - ["lengths"], - ["segment_ids"]) - - def op_ref(lengths): - sids = [] - for i, l in enumerate(lengths): - sids.extend(l * [i]) - return (np.array(sids, dtype=np.int32), ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=op_ref) - - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_range_fill(self, lengths, gc, dc): - op = core.CreateOperator( - "LengthsRangeFill", - ["lengths"], - ["increasing_seq"]) - - def op_ref(lengths): - sids = [] - for _, l in enumerate(lengths): - sids.extend(list(range(l))) - return (np.array(sids, dtype=np.int32), ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=op_ref) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_segment_ids_to_ranges(self, gc, dc): - lengths = [4, 6, 3, 2, 0, 4] - op = core.CreateOperator( - "SegmentIdsToRanges", - ["segment_ids"], - ["ranges"]) - - def op_ref(segment_ids): - ranges = [np.array([0, 0], dtype=np.int32)] - prev = 0 - for i, sid in enumerate(segment_ids): - while sid != prev: - prev += 1 - ranges.append(np.array([i, 0], dtype=np.int32)) - ranges[-1][1] += 1 - return (np.array(ranges, dtype=np.int32), ) - - def lengths_to_segment_ids(lengths): - sids = [] - for i, l in enumerate(lengths): - sids.extend(l * [i]) - return (np.array(sids, dtype=np.int32), ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=np.array(lengths_to_segment_ids(lengths), dtype=np.int32), - reference=op_ref) - - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_to_ranges(self, lengths, gc, dc): - op = core.CreateOperator( - "LengthsToRanges", - ["lengths"], - ["ranges"]) - - def op_ref(x): - if not x.size: - return (x.reshape((0, 2)), ) - return (np.column_stack((np.concatenate(([0], np.cumsum(x)[:-1])), - x)), ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=op_ref) - - @given( - lengths=st.lists( - st.integers(min_value=0, max_value=10), min_size=0, max_size=10 - ), - include_last_offset=st.booleans(), - **hu.gcs_cpu_only - ) - @settings(deadline=None) - def test_lengths_to_offsets(self, lengths, include_last_offset, gc, dc): - op = core.CreateOperator( - "LengthsToOffsets", - ["lengths"], - ["ranges"], - include_last_offset=include_last_offset, - ) - - def op_ref(x): - if not x.size: - arr = [x.reshape(0)] - else: - arr = [np.concatenate(([0], np.cumsum(x)[:-1]))] - if include_last_offset: - arr[0] = np.concatenate((arr[0], np.array([np.sum(x)]))) - return tuple(arr) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=op_ref, - ) - - @given(prediction=hu.arrays(dims=[10, 3], - elements=hu.floats(allow_nan=False, - 
allow_infinity=False, - min_value=0, - max_value=1)), - labels=hu.arrays(dims=[10], - dtype=np.int32, - elements=st.integers(min_value=0, - max_value=3 - 1)), - **hu.gcs) - @settings(deadline=10000) - def test_multi_class_accuracy(self, prediction, labels, gc, dc): - op = core.CreateOperator( - "MultiClassAccuracy", - ["prediction", "labels"], - ["accuracies", "amounts"] - ) - - def op_ref(prediction, labels): - N = prediction.shape[0] - D = prediction.shape[1] - accuracies = np.empty(D, dtype=float) - accuracies.fill(0) - amounts = np.empty(D, dtype=int) - amounts.fill(0) - max_ids = np.argmax(prediction, axis=1) - for i in range(0, N): - max_id = max_ids[i] - label_id = labels[i] - if max_id == label_id: - accuracies[label_id] += 1 - amounts[label_id] += 1 - for i in range(0, D): - amount = amounts[i] - if amount: - accuracies[i] /= amount - return (accuracies, amounts,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[prediction, labels], - reference=op_ref) - - @given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_segment_ids_to_lengths(self, lengths, gc, dc): - op = core.CreateOperator( - "SegmentIdsToLengths", - ["segment_ids"], - ["lengths"]) - - def lengths_to_ids(lengths): - sids = [] - for i, l in enumerate(lengths): - sids.extend(l * [i]) - return sids - - segment_ids = lengths_to_ids(lengths) - - def ids_to_lengths(ids): - ids_length = len(ids) - if ids_length == 0: - return (np.array([], dtype=np.int32),) - - lengths = [] - # segment id starts with 0 - prev_id = -1 - tmp_length = 0 - for idx in range(ids_length): - cur_id = ids[idx] - if cur_id != prev_id: - if idx != 0: - lengths.append(tmp_length) - while prev_id + 1 != cur_id: - lengths.append(0) - prev_id += 1 - prev_id = cur_id - tmp_length = 0 - tmp_length += 1 - lengths.append(tmp_length) - return (np.array(lengths, dtype=np.int32),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(segment_ids, dtype=np.int32)], - reference=ids_to_lengths) - - @given(lengths=st.lists(st.integers(min_value=1, max_value=10), - min_size=0, - max_size=10), - power=st.sampled_from([0.5, 1.0, 1.5, 2.0]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_to_weights(self, lengths, power, gc, dc): - op = core.CreateOperator( - "LengthsToWeights", - ["lengths"], - ["weights"], - power=power) - - def lengths_to_weights(lengths): - weighted_length = [] - for l in lengths: - weighted_length.extend(l * [1 / pow(l, power)]) - - return (np.array(weighted_length, dtype=float),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=lengths_to_weights) - - @given(input_tensor=hu.arrays( - dims=[10], elements=hu.floats(allow_nan=False, - allow_infinity=False)), - **hu.gcs) - @settings(deadline=10000) - def test_abs(self, input_tensor, gc, dc): - op = core.CreateOperator( - "Abs", - ["input"], - ["output"] - ) - - def abs_ref(input_tensor): - return (np.abs(input_tensor),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_tensor], - reference=abs_ref) - - @given(input_tensor=hu.arrays( - dims=[10], elements=hu.floats(min_value=-10, - max_value=10)), - **hu.gcs) - @settings(deadline=10000) - def test_cos(self, input_tensor, gc, dc): - op = core.CreateOperator( - "Cos", - ["input"], - ["output"] - ) - - def cos_ref(input_tensor): - return (np.cos(input_tensor),) - - self.assertReferenceChecks( - 
device_option=gc, - op=op, - inputs=[input_tensor], - reference=cos_ref) - - @given(input_tensor=hu.arrays( - dims=[10], elements=hu.floats(min_value=-10, - max_value=10)), - **hu.gcs) - @settings(deadline=1000) - def test_sin(self, input_tensor, gc, dc): - op = core.CreateOperator( - "Sin", - ["input"], - ["output"] - ) - - def sin_ref(input_tensor): - return (np.sin(input_tensor),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_tensor], - reference=sin_ref) - - @given(input_tensor=hu.arrays( - dims=[10], elements=hu.floats(allow_nan=False, - allow_infinity=False)), - **hu.gcs) - @settings(deadline=10000) - def test_exp(self, input_tensor, gc, dc): - op = core.CreateOperator( - "Exp", - ["input"], - ["output"] - ) - - def exp_ref(input_tensor): - return (np.exp(input_tensor),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_tensor], - reference=exp_ref) - - @given(input_tensor=hu.arrays( - dims=[10], elements=hu.floats(min_value=1, - max_value=10000)), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_log(self, input_tensor, gc, dc): - op = core.CreateOperator( - "Log", - ["input"], - ["output"] - ) - - def log_ref(input_tensor): - return (np.log(input_tensor),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_tensor], - reference=log_ref) - self.assertGradientChecks(gc, op, [input_tensor], 0, [0]) - - def test_blobs_dequeue_timeout(self): - op = core.CreateOperator( - "CreateBlobsQueue", - [], - ["queue"], - capacity=5, - num_blobs=1) - self.ws.run(op) - t = time.time() - op = core.CreateOperator( - "DequeueBlobs", - ["queue"], - ["out"], - timeout_secs=0.2) - self.assertRaises(RuntimeError, lambda: self.ws.run(op)) - t = time.time() - t - self.assertGreater(t, 0.19) - - @given(num_threads=st.integers(1, 10), # noqa - num_elements=st.integers(1, 100), - capacity=st.integers(1, 5), - num_blobs=st.integers(1, 3), - do=st.sampled_from(hu.device_options)) - @settings(deadline=10000) - def test_blobs_queue_threading(self, num_threads, num_elements, - capacity, num_blobs, do): - """ - - Construct matrices of size N x D - - Start K threads - - Push all N rows into the queue of capacity C - - Pull all N rows out of the queue. - - Verify that the output matrices are permutation of the rows of the - original matrices. 
- """ - import threading - import queue - op = core.CreateOperator( - "CreateBlobsQueue", - [], - ["queue"], - capacity=capacity, - num_blobs=num_blobs, - device_option=do) - self.ws.run(op) - - xs = [np.random.randn(num_elements, 5).astype(np.float32) - for _ in range(num_blobs)] - q = queue.Queue() - for i in range(num_elements): - q.put([x[i] for x in xs]) - - def enqueue(t): - while True: - feed_blobs = ["x_{}_{}".format(i, t) for i in range(num_blobs)] - op = core.CreateOperator( - "EnqueueBlobs", - ["queue"] + feed_blobs, - feed_blobs, - device_option=do) - try: - elems = q.get_nowait() - for elem, feed_blob in zip(elems, feed_blobs): - self.ws.create_blob(feed_blob).feed( - elem, device_option=do) - self.ws.run(op) - except queue.Empty: - return - - # Create all blobs before racing on multiple threads - # (blob creation is not threadsafe) - for t in range(num_threads): - for i in range(num_blobs): - self.ws.create_blob("x_{}_{}".format(i, t)) - - threads = [threading.Thread(target=enqueue, args=(t,)) - for t in range(num_threads)] - for thread in threads: - thread.start() - - for n in range(num_elements): - dequeue_blobs = ["y_{}_{}".format(i, n) for i in range(num_blobs)] - op = core.CreateOperator( - "DequeueBlobs", - ["queue"], - dequeue_blobs, - device_option=do) - self.ws.run(op) - for thread in threads: - thread.join() - op = core.CreateOperator("CloseBlobsQueue", ["queue"], []) - self.ws.run(op) - ys = [np.vstack([self.ws.blobs["y_{}_{}".format(i, n)].fetch() - for n in range(num_elements)]) - for i in range(num_blobs)] - for i in range(num_blobs): - self.assertEqual(ys[i].shape, xs[i].shape) - for j in range(num_elements): - # Verify that the rows of the returned blob are a - # permutation. The order may be different due to - # different threads racing. 
- self.assertTrue( - any(np.array_equal(xs[i][j], ys[i][k]) - for k in range(num_elements))) - - @given(num_producers=st.integers(1, 10), - num_consumers=st.integers(1, 10), - capacity=st.integers(1, 5), - num_blobs=st.integers(1, 3), - do=st.sampled_from(hu.device_options)) - @settings(deadline=None, max_examples=50) - def test_safe_blobs_queue(self, num_producers, num_consumers, - capacity, num_blobs, do): - init_net = core.Net('init_net') - queue = init_net.CreateBlobsQueue( - [], 1, capacity=capacity, num_blobs=num_blobs) - producer_steps = [] - truth = 0 - for i in range(num_producers): - name = 'producer_%d' % i - net = core.Net(name) - blobs = [net.ConstantFill([], 1, value=1.0, run_once=False) - for times in range(num_blobs)] - status = net.NextName() - net.SafeEnqueueBlobs([queue] + blobs, blobs + [status]) - count = (i + 1) * 10 - step = core.execution_step(name, net, num_iter=count) - truth += count - producer_steps.append(step) - producer_exit_net = core.Net('producer_exit_net') - producer_exit_net.CloseBlobsQueue([queue], 0) - producer_step = core.execution_step('producer', [ - core.execution_step( - 'producers', producer_steps, concurrent_substeps=True), - core.execution_step('producer_exit', producer_exit_net)] - ) - - consumer_steps = [] - counters = [] - const_1 = init_net.ConstantFill([], 1, value=1.0) - for i in range(num_consumers): - name = 'consumer_%d' % i - net1 = core.Net(name) - blobs = net1.SafeDequeueBlobs([queue], num_blobs + 1) - status = blobs[-1] - - net2 = core.Net(name + '_counter') - counter = init_net.ConstantFill([], 1, value=0.0) - counters.append(counter) - net2.Add([counter, const_1], counter) - consumer_steps.append(core.execution_step( - name, [net1, net2], should_stop_blob=status)) - consumer_step = core.execution_step( - 'consumer', consumer_steps, concurrent_substeps=True) - - init_step = core.execution_step('init', init_net) - worker_step = core.execution_step( - 'worker', [consumer_step, producer_step], concurrent_substeps=True) - - plan = core.Plan('test') - plan.AddStep(init_step) - plan.AddStep(worker_step) - - self.ws.run(plan) - v = 0 - for counter in counters: - v += self.ws.blobs[str(counter)].fetch().tolist() - self.assertEqual(v, truth) - - @given(num_queues=st.integers(1, 5), - num_iter=st.integers(5, 10), - capacity=st.integers(1, 5), - num_blobs=st.integers(1, 3)) - @settings(deadline=None, max_examples=50) - def test_weighted_sample_blobs_queue( - self, num_queues, num_iter, capacity, num_blobs - ): - # Create BlobsQueue for each input queue - print("num_queues", num_queues) - init_net = core.Net('init_net') - queues = [ - init_net.CreateBlobsQueue( - [], 1, capacity=capacity, num_blobs=num_blobs - ) for _ in range(num_queues) - ] - - # Create multiple producer nets and one producer exist net - producer_steps = [] - producer_exit_nets = [] - for i in range(num_queues): - name = 'producer_%d' % i - net = core.Net(name) - blobs = [net.ConstantFill([], 1, value=1.0, run_once=False) - for _ in range(num_blobs)] - status = net.NextName() - net.SafeEnqueueBlobs([queues[i]] + blobs, blobs + [status]) - - exit_net = core.Net('producer_exit_%d' % i) - exit_net.CloseBlobsQueue(queues[i], 0) - producer_exit_nets.append(exit_net) - - step = core.execution_step( - name, [ - core.execution_step( - 'producer_%d' % i, [net], num_iter=num_iter - ), - core.execution_step('producer_exit_%d' % i, [exit_net]), - ] - ) - producer_steps.append(step) - - producer_step = core.execution_step( - 'producer', [ - core.execution_step( - 'producers', - 
producer_steps, - concurrent_substeps=True, - ), - ] - ) - - status_lst = [] - - def append(ins, outs): - status_lst.append(ins) - - # Create one consumer dequeue net and one consumer exist net - consumer_net = core.Net('weight_sample_dequeue_net') - table_idx_blob = np.random.randint(low=-1, high=num_blobs, size=1) - blobs = consumer_net.WeightedSampleDequeueBlobs( - queues, - num_blobs + 1, - weights=np.random.uniform(low=0.0, high=1.0, size=(num_queues,)), - table_idx_blob=table_idx_blob[0], - ) - status = blobs[-1] - consumer_net.Python(append)(status) - - consumer_step = core.execution_step( - 'consumer', - [ - core.execution_step( - 'consumer', [consumer_net], should_stop_blob=status - ), - core.execution_step('producer_exit', producer_exit_nets) - ] - ) - - init_step = core.execution_step('init', init_net) - worker_step = core.execution_step( - 'worker', [producer_step, consumer_step], concurrent_substeps=True) - - plan = core.Plan('test') - plan.AddStep(init_step) - plan.AddStep(worker_step) - - self.ws.run(plan) - assert len(status_lst) >= num_iter + 1 - assert len(status_lst) <= num_iter * num_queues + 1 - - @given( - data=hu.tensor(), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_squeeze_expand_dims(self, data, gc, dc): - dims = [0, 0] - if len(data.shape) > 2: - dims.append(2) - op = core.CreateOperator( - "ExpandDims", - ["data"], - ["expanded"], - dims=dims) - - def expand_dims_ref(data, *args, **kw): - inc_dims = list(set(dims)) - inc_dims.sort() - r = data - for dim in inc_dims: - r = np.expand_dims(r, axis=dim) - return (r, ) - - def squeeze_ref(data, *args, **kw): - dec_dims = list(set(dims)) - dec_dims.sort(reverse=True) - r = data - for dim in dec_dims: - r = np.squeeze(r, axis=dim) - return (r, ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data], - reference=expand_dims_ref, - output_to_grad='expanded', - grad_reference=squeeze_ref) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_tt_layer(self, gc, dc): - seed = 1234 - np.random.seed(seed) - - inp_sizes = [2, 2, 2, 2] - out_sizes = [2, 2, 2, 2] - tt_ranks = [1, 3, 3, 3, 1] - - op = core.CreateOperator( - "TT", - ["X", "b", "cores"], - ["Y"], - inp_sizes=inp_sizes, - out_sizes=out_sizes, - tt_ranks=tt_ranks, - ) - - X = np.expand_dims( - np.random.rand(16).astype(np.float32), axis=0) - b = np.array([0] * 16).astype(np.float32) - cores = tt_core.init_tt_cores(inp_sizes, out_sizes, tt_ranks) - - self.ws.create_blob("X").feed(X) - self.ws.create_blob("b").feed(b) - self.ws.create_blob("cores").feed(cores) - self.ws.run(op) - - Y = self.ws.blobs[("Y")].fetch() - Y = Y.reshape([16]) - - golden = np.array([-9.51763490e-07, -1.28442286e-06, - -2.86281141e-07, 2.28865644e-07, - -1.96180017e-06, -1.78920531e-06, - 9.31094666e-07, -2.04273989e-07, - 1.70017107e-06, 1.64845711e-06, - -1.06099132e-06, -4.69111137e-07, - 6.57552358e-08, -1.28942040e-08, - -2.29114004e-07, -1.04262714e-06]) - - # This golden array is dependent on the specified inp_sizes, out_sizes, - # tt_ranks, and seed. Changing these will cause the test to fail. 
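# (For the all-ones TTSparseLengthsSum tests below, by contrast, the goldens
# can be derived by hand: with TT-ranks [1, 16, 16, 1] every embedding entry
# is a sum of 16 * 16 = 256 identical rank products, so a bag of 3 ids sums
# to 3 * 256 = 768 and a bag of 2 ids to 2 * 256 = 512, matching the golden
# rows asserted there.)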
- self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=1e-10) - - @given(**hu.gcs_cpu_only) - def test_tt_sls_layer(self, gc, dc): - seed = 1234 - np.random.seed(seed) - - factor_voc = [10, 10, 10] - factor_width = [2, 2, 2] - - op = core.CreateOperator( - "TTSparseLengthsSum", - ["core0", "core1", "core2", "index", "lengths"], - ["Y", "core0_output", "core1_output", "indices"], - factor_i=factor_voc, - factor_j=factor_width, - ranks=[1, 16, 16, 1], - emb_size=8 - ) - c0 = np.ones([10, 1, 2, 16]).astype(np.float32) - c1 = np.ones([10, 16, 2, 16]).astype(np.float32) - c2 = np.ones([10, 16, 2, 1]).astype(np.float32) - # index = np.array([0, 1, 2, 1, 4], dtype=int) - # lengths = np.array([3, 2], dtype=int) - index = np.array([0, 1, 2, 1, 4], np.int64) - lengths = np.array([3, 2], np.int32) - - self.ws.create_blob("core0").feed(c0) - self.ws.create_blob("core1").feed(c1) - self.ws.create_blob("core2").feed(c2) - self.ws.create_blob("index").feed(index) - self.ws.create_blob("lengths").feed(lengths) - - self.ws.run(op) - Y = self.ws.blobs[("Y")].fetch() - self.assertEqual(list(Y.shape), [2, 8]) - - golden = np.array([[768, 768, 768, 768, 768, 768, 768, 768], - [512, 512, 512, 512, 512, 512, 512, 512]]) - - self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=0) - - @given(**hu.gcs_cpu_only) - def test_tt_sls_gradientop(self, gc, dc): - - op = core.CreateOperator( - "TTSparseLengthsSumGradient", - ["core0", "core1", "core2", "lengths", - "core0_out", "core1_out", "indices", "dY"], - ["dCore0", "dCore1", "dCore2"] - ) - - c0 = np.ones([10, 1, 4, 16]).astype(np.float32) - c1 = np.ones([10, 16, 4, 16]).astype(np.float32) - c2 = np.ones([10, 16, 4, 1]).astype(np.float32) - lengths = np.array([3, 2], np.int32) - - c0_out = np.ones([5, 4, 16]).astype(np.float32) - c1_out = np.ones([5, 16, 16]).astype(np.float32) - - indices = np.array([[0, 0, 0], - [1, 0, 0], - [2, 0, 0], - [1, 0, 0], - [4, 0, 0]], np.int64) - - dY = np.ones([2, 64]).astype(np.float32) - - self.ws.create_blob("core0").feed(c0) - self.ws.create_blob("core1").feed(c1) - self.ws.create_blob("core2").feed(c2) - self.ws.create_blob("lengths").feed(lengths) - self.ws.create_blob("core0_out").feed(c0_out) - self.ws.create_blob("core1_out").feed(c1_out) - self.ws.create_blob("indices").feed(indices) - self.ws.create_blob("dY").feed(dY) - - self.ws.run(op) - dCore0 = self.ws.blobs[("dCore0")].fetch() - dCore1 = self.ws.blobs[("dCore1")].fetch() - dCore2 = self.ws.blobs[("dCore2")].fetch() - self.assertEqual(list(dCore0.shape), list(c0.shape)) - self.assertEqual(list(dCore1.shape), list(c1.shape)) - self.assertEqual(list(dCore2.shape), list(c2.shape)) - - - @given(**hu.gcs_cpu_only) - def test_tt_sls_gradientop1(self, gc, dc): - - op = core.CreateOperator( - "TTSparseLengthsSumGradient", - ["core0", "core1", "core2", "lengths", - "core0_out", "core1_out", "indices", "dY"], - ["dCore0", "dCore1", "dCore2"] - ) - - c0 = np.ones([101, 1, 2, 16]).astype(np.float32) - c1 = np.ones([102, 16, 2, 16]).astype(np.float32) - c2 = np.ones([153, 16, 4, 1]).astype(np.float32) - lengths = np.array([3, 2], np.int32) - - c0_out = np.ones([5, 2, 16]).astype(np.float32) - c1_out = np.ones([5, 4, 16]).astype(np.float32) - - indices = np.array([[0, 0, 0], - [1, 0, 0], - [2, 0, 0], - [1, 0, 0], - [4, 0, 0]], np.int64) - - dY = np.ones([2, 16]).astype(np.float32) - - self.ws.create_blob("core0").feed(c0) - self.ws.create_blob("core1").feed(c1) - self.ws.create_blob("core2").feed(c2) - self.ws.create_blob("lengths").feed(lengths) - 
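# The gradient op consumes the activations saved by the forward pass
# (core0_out, core1_out, indices) together with the upstream gradient dY;
# the assertions that follow only check that each dCore has the same shape
# as its corresponding core.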
self.ws.create_blob("core0_out").feed(c0_out) - self.ws.create_blob("core1_out").feed(c1_out) - self.ws.create_blob("indices").feed(indices) - self.ws.create_blob("dY").feed(dY) - - self.ws.run(op) - dCore0 = self.ws.blobs[("dCore0")].fetch() - dCore1 = self.ws.blobs[("dCore1")].fetch() - dCore2 = self.ws.blobs[("dCore2")].fetch() - self.assertEqual(list(dCore0.shape), list(c0.shape)) - self.assertEqual(list(dCore1.shape), list(c1.shape)) - self.assertEqual(list(dCore2.shape), list(c2.shape)) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_tt_sls(self, gc, dc): - factor_voc = [10, 10, 10] - factor_width = [2, 2, 2] - - op = core.CreateOperator( - "TTSparseLengthsSum", - ["core0", "core1", "core2", "index", "lengths"], - ["Y", "core0_output", "core1_output", "indices"], - factor_i=factor_voc, - factor_j=factor_width, - ranks=[1, 16, 16, 1], - emb_size=8 - ) - c0 = np.ones([10, 1, 2, 16]).astype(np.float32) - c1 = np.ones([10, 16, 2, 16]).astype(np.float32) - c2 = np.ones([10, 16, 2, 1]).astype(np.float32) - index = np.array([0, 1, 2, 1, 4], np.int64) - lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) - self.assertGradientChecks(gc, op, [c0, c1, c2, index, lengths], 0, [0]) - - - @given(**hu.gcs_cpu_only) - def test_tt_sls_repro(self, gc, dc): - factor_voc = [125, 160, 200] - factor_width = [4, 4, 4] - - op = core.CreateOperator( - "TTSparseLengthsSum", - ["core0", "core1", "core2", "index", "lengths"], - ["Y", "core0_output", "core1_output", "indices"], - factor_i=factor_voc, - factor_j=factor_width, - ranks=[1, 16, 16, 1], - emb_size=64 - ) - c0 = np.ones([125, 1, 4, 16]).astype(np.float32) - c1 = np.ones([160, 16, 4, 16]).astype(np.float32) - c2 = np.ones([200, 16, 4, 1]).astype(np.float32) - index = np.array([0, 4000000 - 1, 20000, 1000000, 4000000 - 1], np.int64) - lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) - - self.ws.create_blob("core0").feed(c0) - self.ws.create_blob("core1").feed(c1) - self.ws.create_blob("core2").feed(c2) - self.ws.create_blob("index").feed(index) - self.ws.create_blob("lengths").feed(lengths) - - self.ws.run(op) - Y = self.ws.blobs[("Y")].fetch() - self.assertEqual(list(Y.shape), [7, 64]) - - golden = np.array([[0] * 64, [768] * 64, [0] * 64, [0] * 64, [512] * 64, [0] * 64, [0] * 64]) - - self.assertAlmostEqual(np.linalg.norm(golden - Y), 0, delta=0) - - - @given(**hu.gcs_cpu_only) - def test_tt_sls_gradientop2(self, gc, dc): - - op = core.CreateOperator( - "TTSparseLengthsSumGradient", - ["core0", "core1", "core2", "lengths", - "core0_out", "core1_out", "indices", "dY"], - ["dCore0", "dCore1", "dCore2"] - ) - - c0 = np.ones([101, 1, 2, 16]).astype(np.float32) - c1 = np.ones([102, 16, 2, 16]).astype(np.float32) - c2 = np.ones([153, 16, 4, 1]).astype(np.float32) - lengths = np.array([0, 3, 0, 0, 2, 0, 0], np.int32) - - c0_out = np.ones([5, 2, 16]).astype(np.float32) - c1_out = np.ones([5, 4, 16]).astype(np.float32) - - indices = np.array([[0, 0, 0], - [1, 0, 0], - [2, 0, 0], - [1, 0, 0], - [4, 0, 0]], np.int64) - - dY = np.ones([7, 16]).astype(np.float32) - - self.ws.create_blob("core0").feed(c0) - self.ws.create_blob("core1").feed(c1) - self.ws.create_blob("core2").feed(c2) - self.ws.create_blob("lengths").feed(lengths) - self.ws.create_blob("core0_out").feed(c0_out) - self.ws.create_blob("core1_out").feed(c1_out) - self.ws.create_blob("indices").feed(indices) - self.ws.create_blob("dY").feed(dY) - - self.ws.run(op) - dCore0 = self.ws.blobs[("dCore0")].fetch() - dCore1 = self.ws.blobs[("dCore1")].fetch() - dCore2 = 
self.ws.blobs[("dCore2")].fetch() - self.assertEqual(list(dCore0.shape), list(c0.shape)) - self.assertEqual(list(dCore1.shape), list(c1.shape)) - self.assertEqual(list(dCore2.shape), list(c2.shape)) - - @given(num_workers=st.integers(1, 10), - net_type=st.sampled_from( - ["simple", "dag"] + - (["async_dag"] if workspace.has_gpu_support else [])), - **hu.gcs) - @settings(deadline=10000) - def test_dag_net_forking(self, net_type, num_workers, gc, dc): - from caffe2.python.model_helper import ModelHelper - from caffe2.python import brew - m = ModelHelper(name="test_model") - n = 10 - d = 2 - depth = 2 - iters = 5 - np.random.seed(1701) - # Build a binary tree of FC layers, summing at each node. - for i in reversed(range(depth)): - for j in range(2 ** i): - bottom_1 = "{}_{}".format(i + 1, 2 * j) - bottom_2 = "{}_{}".format(i + 1, 2 * j + 1) - mid_1 = "{}_{}_m".format(i + 1, 2 * j) - mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1) - top = "{}_{}".format(i, j) - brew.fc( - m, - bottom_1, mid_1, - dim_in=d, dim_out=d, - weight_init=('ConstantFill', dict(value=np.random.randn())), - bias_init=('ConstantFill', dict(value=np.random.randn()))) - brew.fc( - m, - bottom_2, mid_2, - dim_in=d, dim_out=d, - weight_init=('ConstantFill', dict(value=np.random.randn())), - bias_init=('ConstantFill', dict(value=np.random.randn()))) - m.net.Sum([mid_1, mid_2], top) - m.net.SquaredL2Distance(["0_0", "label"], "xent") - m.net.AveragedLoss("xent", "loss") - input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(gc) - m.param_init_net.Proto().device_option.CopyFrom(gc) - - m.Proto().type = net_type - m.Proto().num_workers = num_workers - - self.ws.run(m.param_init_net) - - print(str(m.Proto())) - - def run(): - import numpy as np - np.random.seed(1701) - input_blobs = ["{}_{}".format(depth, j) for j in range(2 ** depth)] - for input_blob in input_blobs: - self.ws.create_blob(input_blob).feed( - np.random.randn(n, d).astype(np.float32), - device_option=gc) - self.ws.create_blob("label").feed( - np.random.randn(n, d).astype(np.float32), - device_option=gc) - self.ws.run(m.net) - gradients = [ - self.ws.blobs[str(input_to_grad[input_blob])].fetch() - for input_blob in input_blobs] - return gradients - - outputs = [run() for _ in range(iters)] - for output in outputs[1:]: - np.testing.assert_array_equal(outputs[0], output) - self.assertAlmostEqual(np.sum(np.square(output)), 91.81752, - delta=1e-2) - - @given(input=hu.tensor(min_dim=2, max_dim=6), - slice_dim=st.integers(), - a=st.integers(), - b=st.integers(), - is_empty=st.booleans(), - **hu.gcs_cpu_only) - @settings(deadline=None, max_examples=50) - def test_slice(self, input, slice_dim, a, b, is_empty, gc, dc): - slice_dim = slice_dim % len(input.shape) - if (is_empty): - input = np.random.rand(*([0] + list(input.shape))).astype(np.int32) - slice_dim += 1 - - a = a % input.shape[slice_dim] - b = b % input.shape[slice_dim] + 1 - start_vec = np.zeros(len(input.shape), dtype=np.int32) - end_vec = np.ones(len(input.shape), dtype=np.int32) * -1 - start_vec[slice_dim] = min(a, b) - end_vec[slice_dim] = max(a, b) - op = core.CreateOperator( - "Slice", - ["input", "start", "end"], - ["output"]) - - def slice_ref(x, s, e): - if len(s.shape) == 0: - return x - slc = [slice(si, None if ei == -1 else ei) for si, ei in zip(s, e)] - return (x[slc], ) - - self.assertReferenceChecks(gc, op, [input, start_vec, end_vec], - slice_ref) - self.assertGradientChecks(gc, op, [input, start_vec, end_vec], 0, [0]) - - @given(data=hu.tensor(), **hu.gcs_cpu_only) - 
@settings(deadline=10000) - def test_shape(self, data, gc, dc): - op = core.CreateOperator("Shape", ["data"], ["shape"]) - self.assertReferenceChecks(gc, op, [data], lambda x: (x.shape, )) - - @given(data=hu.tensor(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_shape_with_axes(self, data, gc, dc): - def shape_ref(x, y): - return ([x.shape[i] for i in y],) - axes = np.random.randint(len(data.shape), size=10).tolist() - op = core.CreateOperator("Shape", ["data"], ["shape"], axes=axes) - self.assertReferenceChecks(gc, op, [data, axes], shape_ref) - - @given(x=hu.tensor(), y=hu.tensor(), **hu.gcs_cpu_only) - @settings(deadline=1000) - def test_has_elements(self, x, y, gc, dc): - op = core.CreateOperator("HasElements", ["x", "y"], ["has_elements"]) - self.assertReferenceChecks(gc, op, [x, y], lambda x, y: (len(x) > 0 or len(y) > 0, )) - - op = core.CreateOperator("IsEmpty", ["x"], ["is_empty"]) - self.assertReferenceChecks(gc, op, [x], lambda x: (len(x) == 0, )) - - @given(initial_iters=st.integers(0, 100), - max_iters=st.integers(0, 100)) - @settings(deadline=10000) - def test_should_stop_as_criteria_net_execution_step( - self, initial_iters, max_iters): - net = core.Net("net") - net.Iter(["iter"], ["iter"]) - self.ws.create_blob("iter").feed( - np.asarray([initial_iters]).astype(np.int64)) - self.ws.create_blob("num_iters").feed( - np.asarray([max_iters]).astype(np.int64)) - criteria_net = core.Net("criteria") - criteria_net.GE(["iter", "num_iters"], ["stop"]) - criteria_net.Proto().external_output.extend(["stop"]) - - plan = core.Plan('plan') - plan.AddStep(core.execution_step( - 'step', [criteria_net, net], - should_stop_blob=core.BlobReference("stop"))) - self.ws.run(plan) - iters = self.ws.blobs[("iter")].fetch() - self.assertEqual(iters.dtype, np.int64) - self.assertEqual(iters[0], max(initial_iters, max_iters)) - - def test_disabled_execution_step(self): - def createNets(i, disabled): - should_stop = 'should_stop_{}'.format(i) - output = 'output_{}'.format(i) - - # init content and stop signal - init = core.Net("init_{}".format(i)) - init.ConstantFill( - [], - [output], - shape=[1], - value=0.0 - ) - init.Cast([output], [should_stop], to='bool') - - # decide if disabled or not - criterion = core.Net("criterion_{}".format(i)) - tmp = criterion.ConstantFill( - [], - shape=[1], - value=1.0 if disabled else 0.0 - ) - criterion.Cast([tmp], [should_stop], to='bool') - criterion.Proto().external_output.extend([should_stop]) - - # the body net is just to turn a 0 blob to 1 - net = core.Net("net_{}".format(i)) - net.ConstantFill( - [], - [output], - shape=[1], - value=1.0 - ) - - # always end the loop - ender = core.Net("ender_{}".format(i)) - tmp = ender.ConstantFill( - [], - shape=[1], - value=1.0 - ) - ender.Cast([tmp], [should_stop], to='bool') - ender.Proto().external_output.extend([should_stop]) - - return [init, criterion, net, ender] - - nets = [createNets(1, False), - createNets(2, True), - createNets(3, False)] - steps = [ - core.execution_step( - 'step_1', nets[0], - should_stop_blob=core.BlobReference('should_stop_1')), - core.execution_step( - 'step_2', nets[1], - should_stop_blob=core.BlobReference('should_stop_2')), - core.execution_step('step_3', nets[2]) - ] - expected = [1.0, 0.0, 1.0] - - plan = core.Plan('plan') - plan.AddStep(core.execution_step('all_steps', steps, num_iter=3)) - self.ws.run(plan) - - for i, _ in enumerate(nets): - self.assertEqual( - self.ws.blobs['output_{}'.format(i + 1)].fetch()[0], - expected[i]) - - @given(initial_iters=st.integers(0, 
100), - num_iters=st.integers(0, 100)) - @settings(deadline=10000) - def test_iter_count_with_execution_step(self, initial_iters, num_iters): - net = core.Net("net") - net.Iter(["iter"], ["iter"]) - self.ws.create_blob("iter").feed( - np.asarray([initial_iters]).astype(np.int64)) - - step = core.ExecutionStep("step", [net]) - step.SetIter(num_iters) - - plan = core.Plan("plan") - plan.AddStep(step) - self.ws.run(plan) - iters = self.ws.blobs[("iter")].fetch() - self.assertEqual(iters.dtype, np.int64) - self.assertEqual(iters[0], initial_iters + num_iters) - - - @given(initial_iters=st.integers(0, 100), - num_iters=st.integers(0, 100), - num_nets=st.integers(0, 5)) - @settings(deadline=None, max_examples=50) - def test_atomic_iter_with_concurrent_steps(self, initial_iters, num_iters, - num_nets): - init_net = core.Net("init_net") - iter_mutex = init_net.CreateMutex([], ["iter_mutex"]) - self.ws.create_blob("iter").feed( - np.asarray([initial_iters]).astype(np.int64)) - concurrent_steps = core.ExecutionStep("concurrent_steps", - num_iter=num_iters) - for i in range(num_nets): - net = core.Net("net_{}".format(i)) - net.AtomicIter([iter_mutex, "iter"], ["iter"]) - step = core.ExecutionStep("step", [net]) - concurrent_steps.AddSubstep(step) - - concurrent_steps.SetConcurrentSubsteps(True) - plan = core.Plan("plan") - plan.AddStep(concurrent_steps) - - stats_net = core.Net("stats_net") - stats_net.StatRegistryExport([], ["stats_key", "stats_val", "stats_ts"]) - - self.ws.run(init_net) - self.ws.run(plan) - self.ws.run(stats_net) - iters = self.ws.blobs[("iter")].fetch() - self.assertEqual(iters.dtype, np.int64) - self.assertEqual(iters[0], initial_iters + num_iters * num_nets) - - if num_iters * num_nets > 0: - stats_key = self.ws.blobs[("stats_key")].fetch() - atomic_iter_key = b'atomic_iter/stats/iter/num_iter' - self.assertTrue(atomic_iter_key in stats_key) - stat_val = self.ws.blobs[("stats_val")].fetch() - self.assertEqual(num_iters * num_nets, stat_val[list(stats_key).index(atomic_iter_key)]) - - - @given(a=hu.tensor(), - src=st.sampled_from(list(_NUMPY_TYPE_TO_ENUM.keys())), - dst=st.sampled_from(list(_NUMPY_TYPE_TO_ENUM.keys())), - use_name=st.booleans(), - **hu.gcs) - @settings(deadline=1000) - def test_cast(self, a, src, dst, use_name, gc, dc): - a = a.astype(src) - - # Casting from a float type outside the range of the integral - # type is UB. - ftypes = [np.float32, np.float64] - if src in ftypes and dst not in ftypes and dst is not bool: - info = np.iinfo(dst) - a = np.clip(a, info.min, info.max) - - def ref(data): - return [data.astype(dst)] - - to = _NUMPY_TYPE_TO_ENUM[dst] - if use_name: - to = caffe2_pb2.TensorProto.DataType.Name(to).lower() - op = core.CreateOperator('Cast', ["X"], ["Y"], to=to) - self.assertDeviceChecks(dc, op, [a], [0]) - out, = self.assertReferenceChecks(gc, op, [a], ref) - self.assertEqual(dst, out.dtype) - - @given(a=hu.tensor(), - eps=hu.floats(min_value=1e-4, max_value=1e-2), - a_grad=hu.tensor(elements=hu.floats(min_value=0.01, max_value=0.99)), - eps_grad=hu.floats(min_value=1e-4, max_value=1e-3), - **hu.gcs) - @settings(deadline=10000) - def test_logit(self, a, eps, a_grad, eps_grad, gc, dc): - def ref(data): - data = np.clip(data, eps, 1.0 - eps) - return (np.log(data / (1 - data)), ) - # forward testing carried out in the full range of input - # to ensure original test coverage. 
- # gradient test carried out with reduced input range - # because the sharp increase of the logit curve at 0 and 1 - # error increases dramtically when input is close to 0 or 1 - # and it will fail the test. - # So we only run gradient test in the range of (0.01, 0.99) - # very occasionally, test may fail due to random accumulated error - # reduce test range to (0.02, 0.98) will improve test stability - op = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps) - self.assertDeviceChecks(dc, op, [a], [0]) - self.assertReferenceChecks(gc, op, [a], ref) - op_grad = core.CreateOperator('Logit', ["X"], ["Y"], eps=eps_grad) - self.assertGradientChecks(gc, op_grad, [a_grad], 0, [0], - threshold=0.04, stepsize=2e-3) - - @given(a=hu.tensor(elements=hu.floats(allow_nan=True)), - value=hu.floats(min_value=-10, max_value=10), - **hu.gcs) - @settings(deadline=1000) - def test_replace_nan(self, a, value, gc, dc): - def ref(data): - out = np.copy(data) - out[np.isnan(data)] = value - return (out, ) - - op = core.CreateOperator('ReplaceNaN', ["X"], ["Y"], value=value) - self.assertDeviceChecks(dc, op, [a], [0]) - self.assertReferenceChecks(gc, op, [a], ref) - - @given(data=_dtypes(dtypes=[np.int32, np.int64, np.float32, bool]). - flatmap(lambda dtype: hu.tensor( - min_dim=1, dtype=dtype, elements=hu.elements_of_type(dtype))), - has_input=st.booleans(), - has_extra_shape=st.booleans(), - extra_shape=st.lists( - min_size=1, max_size=5, elements=st.integers(1, 5)), - **hu.gcs) - @settings(deadline=10000) - def test_constant_fill(self, data, has_input, has_extra_shape, extra_shape, - gc, dc): - dtype = data.dtype.type - # in opt mode, bool is converted into np.bool_ - if data.dtype == np.dtype(bool): - dtype = bool - - value = data.item(0) - gt_shape = data.shape - inputs = [data] - enum_type = _NUMPY_TYPE_TO_ENUM[dtype] - - if has_input: - if has_extra_shape: - op = core.CreateOperator('ConstantFill', ["X"], ["Y"], - dtype=enum_type, - extra_shape=extra_shape, - value=value) - gt_shape += tuple(extra_shape) - else: - op = core.CreateOperator('ConstantFill', ["X"], ["Y"], - dtype=enum_type, - value=value) - else: - op = core.CreateOperator('ConstantFill', [], ["Y"], - dtype=enum_type, - value=value, - shape=list(gt_shape)) - inputs = [] - - def ref(inputs=None): - outputs = np.full(shape=gt_shape, fill_value=value, dtype=dtype) - return [outputs] - - self.assertDeviceChecks(dc, op, inputs, [0]) - out, = self.assertReferenceChecks(gc, op, inputs, ref) - self.assertEqual(dtype, out.dtype) - - @given(data=_dtypes(dtypes=[np.int32, np.int64, np.float32, bool]). 
- flatmap(lambda dtype: hu.tensor( - min_dim=1, dtype=dtype, elements=hu.elements_of_type(dtype))), - **hu.gcs) - @settings(deadline=1000) - def test_constant_fill_from_tensor(self, data, gc, dc): - dtype = data.dtype.type - if data.dtype == np.dtype(bool): - dtype = bool - - value = np.array([data.item(0)], dtype=dtype) - inputs = [data, value] - enum_type = _NUMPY_TYPE_TO_ENUM[dtype] - - op = core.CreateOperator( - 'ConstantFill', - ["X", "V"], - ["Y"], - dtype=enum_type, - ) - - def ref(x, v): - outputs = np.full(shape=data.shape, fill_value=value[0], dtype=dtype) - return [outputs] - - self.assertDeviceChecks(dc, op, inputs, [0]) - out, = self.assertReferenceChecks(gc, op, inputs, ref) - self.assertEqual(dtype, out.dtype) - - @given(t=st.integers(1, 5), - n=st.integers(1, 5), - d=st.integers(1, 5)) - @settings(deadline=10000) - def test_elman_recurrent_network(self, t, n, d): - from caffe2.python import model_helper, brew - np.random.seed(1701) - step_net = model_helper.ModelHelper(name="Elman") - # TODO: name scope external inputs and outputs - step_net.Proto().external_input.extend( - ["input_t", "seq_lengths", "timestep", - "hidden_t_prev", "gates_t_w", "gates_t_b"]) - step_net.Proto().type = "simple" - step_net.Proto().external_output.extend(["hidden_t", "gates_t"]) - brew.fc(step_net, - "hidden_t_prev", "gates_t", dim_in=d, dim_out=d, axis=2) - step_net.net.Sum(["gates_t", "input_t"], ["gates_t"]) - step_net.net.Sigmoid(["gates_t"], ["hidden_t"]) - - # Initialize params for step net in the parent net - for op in step_net.param_init_net.Proto().op: - workspace.RunOperatorOnce(op) - - backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass( - step_net.Proto().op, {"hidden_t": "hidden_t_grad"}) - backward_mapping = { - str(k): str(v) for k, v in backward_mapping.items() - } - backward_step_net = core.Net("ElmanBackward") - del backward_step_net.Proto().op[:] - backward_step_net.Proto().op.extend(backward_ops) - assert backward_mapping["input_t"] == "gates_t_grad" - links = [ - ("hidden_t_prev", "hidden", 0), - ("hidden_t", "hidden", 1), - ("input_t", "input", 0), - ] - link_internal, link_external, link_offset = zip(*links) - backward_links = [ - ("hidden_t_prev_grad", "hidden_grad", 0), - ("hidden_t_grad", "hidden_grad", 1), - ("gates_t_grad", "input_grad", 0), - ] - backward_link_internal, backward_link_external, backward_link_offset = \ - zip(*backward_links) - backward_step_net.Proto().external_input.extend(["hidden_t_grad"]) - backward_step_net.Proto().external_input.extend( - step_net.Proto().external_input) - backward_step_net.Proto().external_input.extend( - step_net.Proto().external_output) - inputs = ["input", "seq_lengths", "gates_t_w", "gates_t_b", "hidden_input"] - recurrent_inputs = ["hidden_input"] - op = core.CreateOperator( - "RecurrentNetwork", - inputs, - ["output", "hidden", "hidden_output", "step_workspaces"], - alias_src=["hidden", "hidden"], - alias_dst=["output", "hidden_output"], - alias_offset=[1, -1], - recurrent_states=["hidden"], - initial_recurrent_state_ids=[ - inputs.index(i) for i in recurrent_inputs - ], - link_internal=link_internal, - link_external=link_external, - link_offset=link_offset, - backward_link_internal=backward_link_internal, - backward_link_external=backward_link_external, - backward_link_offset=backward_link_offset, - param=[inputs.index(p) for p in step_net.params], - step_net=step_net.Proto(), - backward_step_net=backward_step_net.Proto(), - outputs_with_grads=[0], - ) - workspace.FeedBlob( - "input", 
np.random.randn(t, n, d).astype(np.float32)) - workspace.FeedBlob( - "hidden_input", np.random.randn(1, n, d).astype(np.float32)) - workspace.FeedBlob( - "seq_lengths", np.random.randint(0, t, size=(n,)).astype(np.int32)) - - def reference(input, seq_lengths, gates_w, gates_b, hidden_input): - T = input.shape[0] - N = input.shape[1] - D = input.shape[2] - hidden = np.zeros(shape=(T + 1, N, D)) - assert hidden.shape[0] == T + 1 - assert hidden.shape[1] == N - assert hidden.shape[2] == D - - hidden[0, :, :] = hidden_input - for t in range(T): - input_t = input[t].reshape(1, N, D) - hidden_t_prev = hidden[t].reshape(1, N, D) - gates = np.dot(hidden_t_prev, gates_w.T) - gates = gates.reshape(1, N, D) + input_t.reshape(1, N, D) - hidden[t + 1] = sigmoid(gates) - return hidden[1:], hidden, hidden[-1].reshape(1, N, D) - - self.assertReferenceChecks( - hu.cpu_do, - op, - [workspace.FetchBlob(name) - for name in ["input", "seq_lengths", "gates_t_w", "gates_t_b", - "hidden_input"]], - reference, - outputs_to_check=[0, 1, 2]) - - for param in [0, 2, 3]: - self.assertGradientChecks( - hu.cpu_do, - op, - [workspace.FetchBlob(name) - for name in ["input", "seq_lengths", "gates_t_w", "gates_t_b", - "hidden_input"]], - param, - [0]) - - @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given(n=st.integers(1, 5), - c=st.integers(1, 5), - h=st.integers(1, 5), - w=st.integers(1, 5), - pad=st.integers(0, 2), - block_size=st.integers(2, 3), - **hu.gcs) - def test_space_to_batch(self, n, c, h, w, pad, block_size, gc, dc): - assume((h + 2 * pad) % block_size == 0) - assume((w + 2 * pad) % block_size == 0) - X = np.random.randn(n, c, h, w).astype(np.float32) - op = core.CreateOperator("SpaceToBatch", ["X"], ["Y"], - pad=pad, block_size=block_size) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given(n=st.integers(1, 5), - c=st.integers(1, 5), - h=st.integers(1, 5), - w=st.integers(1, 5), - pad=st.integers(0, 2), - block_size=st.integers(2, 3), - **hu.gcs) - def test_batch_to_space(self, n, c, h, w, pad, block_size, gc, dc): - assume((h + 2 * pad) % block_size == 0) - assume((w + 2 * pad) % block_size == 0) - X = np.random.randn( - n * block_size * block_size, - c, - (h + 2 * pad) // block_size, - (w + 2 * pad) // block_size).astype(np.float32) - op = core.CreateOperator("BatchToSpace", ["X"], ["Y"], - pad=pad, block_size=block_size) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(), - in_place=st.booleans(), - scale=hu.floats(min_value=-2.0, max_value=2.0), - **hu.gcs) - @settings(deadline=10000) - def test_scale(self, X, in_place, scale, gc, dc): - op = core.CreateOperator( - "Scale", ["X"], ["Y" if not in_place else "X"], - scale=scale) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(s=st.text()) - def test_string_serde(self, s): - s = s.encode('ascii', 'ignore') - self.ws.create_blob("a").feed(s) - serialized = self.ws.blobs["a"].serialize("a") - self.ws.create_blob("b").deserialize(serialized) - self.assertEqual(s, self.ws.blobs[("a")].fetch()) - self.assertEqual(s, self.ws.blobs[("b")].fetch()) - - @given(pad=st.integers(0, 3), - size=st.integers(1, 10), - input_channels=st.integers(1, 5), - batch_size=st.integers(1, 5), - order=st.sampled_from(["NCHW", "NHWC"]), - mode=st.sampled_from(["constant", "reflect", "edge"]), - 
**hu.gcs) - @settings(deadline=None, max_examples=50) - def test_same_pad_image(self, pad, size, input_channels, batch_size, order, - mode, gc, dc): - assume(size > pad) - - op = core.CreateOperator( - "PadImage", - ["X"], - ["Y"], - pad=pad, - mode=mode, - order=order, - ) - if order == "NHWC": - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - - def numpy_pad_ref(x): - return (np.pad( - x, ((0, 0), (pad, pad), (pad, pad), (0, 0)), mode),) - - else: - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - def numpy_pad_ref(x): - return (np.pad( - x, ((0, 0), (0, 0), (pad, pad), (pad, pad)), mode),) - - self.assertReferenceChecks(gc, op, [X], numpy_pad_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - size=st.integers(1, 10), - input_channels=st.integers(1, 5), - batch_size=st.integers(1, 5), - order=st.sampled_from(["NCHW", "NHWC"]), - mode=st.sampled_from(["constant", "reflect", "edge"]), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_pad_image(self, pad_t, pad_l, pad_b, pad_r, size, input_channels, - batch_size, order, mode, gc, dc): - assume(size > max(pad_b, pad_r, pad_t, pad_l)) - - op = core.CreateOperator( - "PadImage", - ["X"], - ["Y"], - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - mode=mode, - order=order, - ) - if order == "NHWC": - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - - def numpy_pad_ref(x): - return (np.pad( - x, ((0, 0), (pad_t, pad_b), (pad_l, pad_r), (0, 0)), - mode),) - - else: - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - def numpy_pad_ref(x): - return (np.pad( - x, ((0, 0), (0, 0), (pad_t, pad_b), (pad_l, pad_r)), - mode),) - - self.assertReferenceChecks(gc, op, [X], numpy_pad_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=hu.floats(min_value=1e-4, max_value=1e-2), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_instance_norm(self, size, input_channels, batch_size, order, - epsilon, gc, dc): - op = core.CreateOperator( - "InstanceNorm", - ["X", "scale", "bias"], - ["Y"], - order=order, - epsilon=epsilon, - ) - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - def ref_nchw(x, scale, bias): - x = x.reshape(batch_size * input_channels, size * size) - y = (x - x.mean(1)[:, np.newaxis]) - y /= np.sqrt(x.var(1) + epsilon)[:, np.newaxis] - y = y.reshape(batch_size, input_channels, size, size) - y = y * scale.reshape(1, input_channels, 1, 1) - y = y + bias.reshape(1, input_channels, 1, 1) - return (y, ) - - def ref_nhwc(x, scale, bias): - x = x.swapaxes(2, 3).swapaxes(1, 2) - y = ref_nchw(x, scale, bias)[0] - return (y.swapaxes(1, 2).swapaxes(2, 3), ) - - self.assertReferenceChecks( - gc, op, [X, scale, bias], - ref_nchw if order == "NCHW" else ref_nhwc) - # TODO(jiayq): when there are backward and GPU implementations, enable - # these two. 
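# In the meantime, the loop below re-runs the op 100 times against the same
# fed blobs and then asserts that every input comes back bit-identical,
# i.e. InstanceNorm must not mutate its inputs in place.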
- # self.assertDeviceChecks(dc, op, [X, scale, bias], [0]) - # self.assertGradientChecks(gc, op, [X, scale, bias], 0, [0]) - - ws = workspace.C.Workspace() - feeds = [("X", X), ("scale", scale), ("bias", bias)] - for blob, arr in feeds: - ws.create_blob(blob).feed(arr) - for _ in range(100): - ws.run(op) - for blob, arr in feeds: - np.testing.assert_array_equal(ws.blobs[blob].fetch(), arr) - - @given(inp=_dtypes().flatmap(lambda dt: _tensor_and_indices( - elements=hu.elements_of_type(dt), dtype=dt)), - **hu.gcs) - @settings(deadline=10000) - def test_sparse_to_dense(self, inp, gc, dc): - first_dim, X, I = inp - if X.dtype != np.dtype('float32') and gc.device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP} : - # Cuda only support 32 bit float - print("Bailout {}".format(X.dtype)) - return - if gc.device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP}: - # Cuda version only support int32 - I = I.astype(np.int32) - - if X.dtype in (np.dtype('int64'), np.dtype('int32')): - assume((np.abs(X.ravel()).max() < np.iinfo('int32').max).all()) - assume(np.abs(X.ravel()).astype(np.int64).sum() < np.iinfo('int32').max) - - # values don't matter - D = np.zeros((first_dim,) + X.shape[1:]).astype(X.dtype) - - op = core.CreateOperator("SparseToDense", ["I", "X", "D"], ["Y"]) - op_noshapeinfer = core.CreateOperator("SparseToDense", ["I", "X"], ["Y"]) - - def sparse_to_dense(I, X, D): - O = np.zeros(D.shape, dtype=X.dtype) - for i, p in enumerate(I): - O[p] += X[i] - return [O] - - def sparse_to_dense_noshapeinfer(I, X): - O = np.zeros((np.max(I) + 1,) + X.shape[1:], dtype=X.dtype) - for i, p in enumerate(I): - O[p] += X[i] - return [O] - - self.assertReferenceChecks(gc, op, [I, X, D], sparse_to_dense) - self.assertReferenceChecks(gc, op_noshapeinfer, [I, X], sparse_to_dense_noshapeinfer) - if X.dtype == np.float32: - self.assertGradientChecks(gc, op, [I, X, D], 1, [0]) - - @given(inputs=hu.tensors(n=2, min_dim=2, max_dim=2), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_dot_product(self, inputs, gc, dc): - X, Y = inputs - op = core.CreateOperator("DotProduct", ["X", "Y"], 'out') - - def dotproduct(X, Y): - return (np.sum(X * Y, axis=1), ) - - self.assertReferenceChecks(gc, op, [X, Y], dotproduct) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - - @given(N=st.integers(min_value=2, max_value=10), - M=st.integers(min_value=2, max_value=10), - K=st.integers(min_value=2, max_value=10), - pad_value=hu.floats(min_value=0.1, max_value=1.0), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_dot_product_with_padding(self, N, M, K, pad_value, gc, dc): - X = np.random.rand(N, M).astype(np.float32) - 0.5 - Y = np.random.rand(N, K).astype(np.float32) - 0.5 - op = core.CreateOperator("DotProductWithPadding", ["X", "Y"], 'out', - pad_value=pad_value) - - def dotproduct(X, Y): - Z = np.ones((N, max(M, K))).astype(np.float32) * pad_value - if M < K: - Z[:, :M] = X - return (np.sum(Z * Y, axis=1), ) - else: - Z[:, :K] = Y - return (np.sum(Z * X, axis=1), ) - - self.assertReferenceChecks(gc, op, [X, Y], dotproduct) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - - @given(N=st.integers(min_value=2, max_value=10), - M=st.integers(min_value=2, max_value=10), - pad_value=hu.floats(min_value=0.1, max_value=1.0), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_dot_product_with_rep_padding(self, N, M, 
pad_value, gc, dc): - K = 2 * M - X = np.random.rand(N, M).astype(np.float32) - 0.5 - Y = np.random.rand(N, K).astype(np.float32) - 0.5 - op = core.CreateOperator("DotProductWithPadding", ["X", "Y"], 'out', - replicate=True, - pad_value=pad_value) - - def dotproduct(X, Y): - import numpy.matlib as npm - if M < K: - Z = npm.repmat(X, 1, K // M) - return (np.sum(Z * Y, axis=1), ) - else: - Z = npm.repmat(Y, 1, M // K) - return (np.sum(Z * X, axis=1), ) - - self.assertReferenceChecks(gc, op, [X, Y], dotproduct) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - - @given(N=st.integers(min_value=2, max_value=10), - M=st.integers(min_value=2, max_value=10), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_ensure_dense(self, N, M, gc, dc): - # in place - X = np.random.rand(N, M).astype(np.float32) - 0.5 - op = core.CreateOperator("EnsureDense", ["X"], "X") - self.assertReferenceChecks(gc, op, [X], lambda x: [x]) - self.assertDeviceChecks(dc, op, [X], [0]) - # or not - X = np.random.rand(N, M).astype(np.float32) - 0.5 - op = core.CreateOperator("EnsureDense", ["X"], "out") - self.assertReferenceChecks(gc, op, [X], lambda x: [x]) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(N=st.integers(min_value=10, max_value=100), - M=st.integers(min_value=2, max_value=10), - num_buckets=st.integers(min_value=1, max_value=5), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_accumulate_histogram_op(self, N, M, num_buckets, gc, dc): - X = np.random.rand(N, M).astype(np.float32) - lower_bound, upper_bound = 0.1, 0.9 - op = core.CreateOperator("AccumulateHistogram", ["X"], - ['cur_hist', 'acc_hist'], - lower_bound=lower_bound, - upper_bound=upper_bound, - num_buckets=num_buckets) - - def histogram(X): - hist = np.zeros((num_buckets + 2, ), dtype=np.int32) - segment = (upper_bound - lower_bound) / num_buckets - Y = np.zeros((N, M), dtype=np.int32) - Y[X < lower_bound] = 0 - Y[X >= upper_bound] = num_buckets + 1 - Y[(X >= lower_bound) & (X < upper_bound)] = \ - ((X[(X >= lower_bound) & (X < upper_bound)] - lower_bound) / - segment + 1).astype(np.int32) - - for i in range(Y.shape[0]): - for j in range(Y.shape[1]): - hist[Y[i][j]] += 1 - cur_hist, acc_hist = hist, hist - - return [cur_hist, acc_hist] - - self.assertDeviceChecks(dc, op, [X], [0, 1]) - self.assertReferenceChecks(gc, op, [X], histogram) - - @settings(max_examples=1, deadline=None) - @given( - queue_capacity=st.integers(2, 2), - time_sleep=st.integers(5, 10), - num_blobs_to_equeue=st.integers(1, 1), - num_blobs_to_dequeue=st.integers(2, 2), - ) - def test_safe_dequeue_blob__raises_exception_when_hang( - self, - queue_capacity, - time_sleep, - num_blobs_to_equeue, - num_blobs_to_dequeue, - ): - r""" - Tests SafeDequeueBlobsOp being cancellable. - - Create a queue with the number of BlobsQueue less than the number - SafeDequeueBlobs to cause the hanging behavior when running the Net. - - Then call cancel from the previous sleeping thread to ensure exception - is raised. 
- """ - - def _net_instance_cancel(net_instance): - time.sleep(time_sleep) - net_instance.cancel() - - init_net = core.Net("init_net") - init_net.Proto().type = "async_scheduling" - - queue = init_net.CreateBlobsQueue( - [], - "queue_name", - capacity=queue_capacity, - num_blobs=num_blobs_to_equeue, - ) - - ws = workspace.Workspace() - ws.create_net(init_net).run() - - net = core.Net("net") - net.Proto().type = "async_scheduling" - - blobs = net.SafeDequeueBlobs([queue], num_blobs_to_dequeue) - - net_instance = ws.create_net(net) - - t = threading.Thread(target=_net_instance_cancel, args=[net_instance]) - t.start() - - with self.assertRaises(Exception): - net_instance.run() - t.join() - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py deleted file mode 100644 index 9ddb2d1012ec..000000000000 --- a/caffe2/python/hypothesis_test_util.py +++ /dev/null @@ -1,751 +0,0 @@ -## @package hypothesis_test_util -# Module caffe2.python.hypothesis_test_util -""" -The Hypothesis library uses *property-based testing* to check -invariants about the code under test under a variety of random inputs. - - The key idea here is to express properties of the code under test -(e.g. that it passes a gradient check, that it implements a reference -function, etc), and then generate random instances and verify they -satisfy these properties. - -The main functions of interest are exposed on `HypothesisTestCase`. -You can usually just add a short function in this to generate an -arbitrary number of test cases for your operator. - -The key functions are: - -- `assertDeviceChecks(devices, op, inputs, outputs)`. This asserts that the - operator computes the same outputs, regardless of which device it is executed - on. -- `assertGradientChecks(device, op, inputs, output_, - outputs_with_grads)`. This implements a standard numerical gradient checker - for the operator in question. -- `assertReferenceChecks(device, op, inputs, reference)`. This runs the - reference function (effectively calling `reference(*inputs)`, and comparing - that to the output of output. - -`hypothesis_test_util.py` exposes some useful pre-built samplers. - -- `hu.gcs` - a gradient checker device (`gc`) and device checker devices (`dc`) - -- `hu.gcs_cpu_only` - a CPU-only gradient checker device (`gc`) and - device checker devices (`dc`). Used for when your operator is only - implemented on the CPU. 
-""" - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import ( - workspace, device_checker, gradient_checker, test_util, core) -import contextlib -import copy -import functools -import hypothesis -import hypothesis.extra.numpy -import hypothesis.strategies as st -import logging -import numpy as np -import os -import struct - - -def is_sandcastle(): - return os.getenv('SANDCASTLE') == '1' or os.getenv('TW_JOB_USER') == 'sandcastle' - - -def is_travis(): - return 'TRAVIS' in os.environ - - -def to_float32(x): - return struct.unpack("f", struct.pack("f", float(x)))[0] - - -# "min_satisfying_examples" setting has been deprecated in hypothesis -# 3.56.0 and removed in hypothesis 4.x -def settings(*args, **kwargs): - if 'min_satisfying_examples' in kwargs and hypothesis.version.__version_info__ >= (3, 56, 0): - kwargs.pop('min_satisfying_examples') - - if 'deadline' in kwargs and hypothesis.version.__version_info__ < (4, 44, 0): - kwargs.pop('deadline') - - if 'timeout' in kwargs and hypothesis.version.__version_info__ >= (4, 44, 0): - if 'deadline' not in kwargs: - kwargs['deadline'] = kwargs['timeout'] * 1e3 - kwargs.pop('timeout') - - return hypothesis.settings(*args, **kwargs) - -# This wrapper wraps around `st.floats` and -# sets width parameters to 32 if version is newer than 3.67.0 -def floats(*args, **kwargs): - - width_supported = hypothesis.version.__version_info__ >= (3, 67, 0) - if 'width' in kwargs and not width_supported: - kwargs.pop('width') - - if 'width' not in kwargs and width_supported: - kwargs['width'] = 32 - if kwargs.get('min_value', None) is not None: - kwargs['min_value'] = to_float32(kwargs['min_value']) - if kwargs.get('max_value', None) is not None: - kwargs['max_value'] = to_float32(kwargs['max_value']) - - return st.floats(*args, **kwargs) - - -hypothesis.settings.register_profile( - "sandcastle", - settings( - derandomize=True, - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=50, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.verbose, - deadline=10000)) -hypothesis.settings.register_profile( - "dev", - settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=10, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.verbose, - deadline=10000)) -hypothesis.settings.register_profile( - "debug", - settings( - suppress_health_check=[hypothesis.HealthCheck.too_slow], - database=None, - max_examples=1000, - min_satisfying_examples=1, - verbosity=hypothesis.Verbosity.verbose, - deadline=50000)) - -hypothesis.settings.load_profile( - 'sandcastle' if is_sandcastle() else os.getenv('CAFFE2_HYPOTHESIS_PROFILE', - 'dev') -) - - -def dims(min_value=1, max_value=5): - return st.integers(min_value=min_value, max_value=max_value) - - -def elements_of_type(dtype=np.float32, filter_=None): - elems = None - if dtype is np.float16: - elems = floats(min_value=-1.0, max_value=1.0, width=16) - elif dtype is np.float32: - elems = floats(min_value=-1.0, max_value=1.0, width=32) - elif dtype is np.float64: - elems = floats(min_value=-1.0, max_value=1.0, width=64) - elif dtype is np.int32: - elems = st.integers(min_value=0, max_value=2 ** 31 - 1) - elif dtype is np.int64: - elems = st.integers(min_value=0, max_value=2 ** 63 - 1) - elif dtype is bool: - elems = st.booleans() - else: - raise ValueError("Unexpected dtype without elements provided") - return elems if filter_ is None else elems.filter(filter_) - - -def arrays(dims, dtype=np.float32, elements=None, 
unique=False): - if elements is None: - elements = elements_of_type(dtype) - return hypothesis.extra.numpy.arrays( - dtype, - dims, - elements=elements, - unique=unique, - ) - - -def tensor(min_dim=1, - max_dim=4, - dtype=np.float32, - elements=None, - unique=False, - **kwargs): - dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim) - return dims_.flatmap( - lambda dims: arrays(dims, dtype, elements, unique=unique)) - - -def tensor1d(min_len=1, max_len=64, dtype=np.float32, elements=None): - return tensor(1, 1, dtype, elements, min_value=min_len, max_value=max_len) - - -def segment_ids(size, is_sorted): - if size == 0: - return st.just(np.empty(shape=[0], dtype=np.int32)) - if is_sorted: - return arrays( - [size], - dtype=np.int32, - elements=st.booleans()).map( - lambda x: np.cumsum(x, dtype=np.int32) - x[0]) - else: - return arrays( - [size], - dtype=np.int32, - elements=st.integers(min_value=0, max_value=2 * size)) - - -def lengths(size, min_segments=None, max_segments=None, **kwargs): - # First generate the number of borders between segments. - # Then draw the border values and add 0 and size. - # Sorting and taking differences converts the borders into segment - # lengths, which may be 0. For example, size=5 with borders [2, 2] - # gives sorted boundaries [0, 2, 2, 5] and lengths [2, 0, 3]. - if min_segments is None: - min_segments = 0 - if max_segments is None: - max_segments = size - assert min_segments >= 0 - assert min_segments <= max_segments - if size == 0 and max_segments == 0: - return st.just(np.empty(shape=[0], dtype=np.int32)) - assert max_segments > 0, "size is not 0, need at least one segment" - return st.integers( - min_value=max(min_segments - 1, 0), max_value=max_segments - 1 - ).flatmap( - lambda num_borders: - hypothesis.extra.numpy.arrays( - np.int32, num_borders, elements=st.integers( - min_value=0, max_value=size - ) - ) - ).map( - lambda x: np.append(x, np.array([0, size], dtype=np.int32)) - ).map(sorted).map(np.diff) - - -def segmented_tensor( - min_dim=1, - max_dim=4, - dtype=np.float32, - is_sorted=True, - elements=None, - segment_generator=segment_ids, - allow_empty=False, - **kwargs -): - gen_empty = st.booleans() if allow_empty else st.just(False) - data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim) - data_dims_ = st.tuples( - gen_empty, data_dims_ - ).map(lambda pair: ([0] if pair[0] else []) + pair[1]) - return data_dims_.flatmap(lambda data_dims: st.tuples( - arrays(data_dims, dtype, elements), - segment_generator(data_dims[0], is_sorted=is_sorted), - )) - - -def lengths_tensor(min_segments=None, max_segments=None, *args, **kwargs): - gen = functools.partial( - lengths, min_segments=min_segments, max_segments=max_segments) - return segmented_tensor(*args, segment_generator=gen, **kwargs) - - -def sparse_segmented_tensor(min_dim=1, max_dim=4, dtype=np.float32, - is_sorted=True, elements=None, allow_empty=False, - segment_generator=segment_ids, itype=np.int64, - **kwargs): - gen_empty = st.booleans() if allow_empty else st.just(False) - data_dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim) - all_dims_ = st.tuples(gen_empty, data_dims_).flatmap( - lambda pair: st.tuples( - st.just(pair[1]), - (st.integers(min_value=1, max_value=pair[1][0]) if not pair[0] - else st.just(0)), - )) - return all_dims_.flatmap(lambda dims: st.tuples( - arrays(dims[0], dtype, elements), - arrays(dims[1], dtype=itype, elements=st.integers( - min_value=0, max_value=dims[0][0] - 1)), - segment_generator(dims[1], is_sorted=is_sorted), - )) - - -def sparse_lengths_tensor(**kwargs): - return sparse_segmented_tensor(segment_generator=lengths,
**kwargs) - - -def tensors(n, min_dim=1, max_dim=4, dtype=np.float32, elements=None, **kwargs): - dims_ = st.lists(dims(**kwargs), min_size=min_dim, max_size=max_dim) - return dims_.flatmap( - lambda dims: st.lists( - arrays(dims, dtype, elements), - min_size=n, - max_size=n)) - - -def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): - return tensors( - n, 1, 1, dtype, elements, min_value=min_len, max_value=max_len - ) - - -cpu_do = caffe2_pb2.DeviceOption() -cuda_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.CUDA) -hip_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.HIP) -gpu_do = caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType) # CUDA or ROCm -_cuda_do_list = ([cuda_do] if workspace.has_cuda_support else []) -_hip_do_list = ([hip_do] if workspace.has_hip_support else []) -_gpu_do_list = ([gpu_do] if workspace.has_gpu_support else []) -# (bddppq) Do not rely on this no_hip option! It's just used to -# temporarily skip some flaky tests on ROCM before it's getting more mature. -_device_options_no_hip = [cpu_do] + _cuda_do_list -device_options = _device_options_no_hip + _hip_do_list - -# Include device option for each GPU -expanded_device_options = [cpu_do] + [ - caffe2_pb2.DeviceOption(device_type=workspace.GpuDeviceType, device_id=i) - for i in range(workspace.NumGpuDevices())] - - -def device_checker_device_options(): - return st.just(device_options) - - -def gradient_checker_device_option(): - return st.sampled_from(device_options) - - -gcs = dict( - gc=gradient_checker_device_option(), - dc=device_checker_device_options() -) - -gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_cuda_only = dict(gc=st.sampled_from(_cuda_do_list), dc=st.just(_cuda_do_list)) -gcs_gpu_only = dict(gc=st.sampled_from(_gpu_do_list), dc=st.just(_gpu_do_list)) # CUDA or ROCm -gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) - - -@contextlib.contextmanager -def temp_workspace(name=b"temp_ws"): - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace(name, True) - yield - workspace.ResetWorkspace() - workspace.SwitchWorkspace(old_ws_name) - - -def runOpBenchmark( - device_option, - op, - inputs, - input_device_options=None, - iterations=10, -): - op = copy.deepcopy(op) - op.device_option.CopyFrom(device_option) - net = caffe2_pb2.NetDef() - net.op.extend([op]) - net.name = op.name if op.name else "test" - - with temp_workspace(): - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - for (n, b) in zip(op.input, inputs): - workspace.FeedBlob( - n, - b, - device_option=_input_device_options.get(n, device_option) - ) - workspace.CreateNet(net) - ret = workspace.BenchmarkNet(net.name, 1, iterations, True) - return ret - - -def runOpOnInput( - device_option, - op, - inputs, - input_device_options=None, -): - op = copy.deepcopy(op) - op.device_option.CopyFrom(device_option) - - with temp_workspace(): - if (len(op.input) > len(inputs)): - raise ValueError( - 'must supply an input for each input on the op: %s vs %s' % - (op.input, inputs)) - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - for (n, b) in zip(op.input, inputs): - workspace.FeedBlob( - n, - b, - device_option=_input_device_options.get(n, device_option) - ) - workspace.RunOperatorOnce(op) - outputs_to_check = list(range(len(op.output))) - outs = [] - for output_index in outputs_to_check: - output_blob_name = op.output[output_index] - output = 
workspace.FetchBlob(output_blob_name) - outs.append(output) - return outs - - -class HypothesisTestCase(test_util.TestCase): - """ - A unittest.TestCase subclass with some helper functions for - utilizing the `hypothesis` (hypothesis.readthedocs.io) library. - """ - - def assertDeviceChecks( - self, - device_options, - op, - inputs, - outputs_to_check, - input_device_options=None, - threshold=0.01 - ): - """ - Asserts that the operator computes the same outputs, regardless of - which device it is executed on. - - Useful for checking the consistency of GPU and CPU - implementations of operators. - - Usage example: - - @given(inputs=hu.tensors(n=2), in_place=st.booleans(), **hu.gcs) - def test_sum(self, inputs, in_place, gc, dc): - op = core.CreateOperator("Sum", ["X1", "X2"], - ["Y" if not in_place else "X1"]) - X1, X2 = inputs - self.assertDeviceChecks(dc, op, [X1, X2], [0]) - """ - dc = device_checker.DeviceChecker( - threshold, - device_options=device_options - ) - self.assertTrue( - dc.CheckSimple(op, inputs, outputs_to_check, input_device_options) - ) - - def assertGradientChecks( - self, - device_option, - op, - inputs, - outputs_to_check, - outputs_with_grads, - grad_ops=None, - threshold=0.005, - stepsize=0.05, - input_device_options=None, - ensure_outputs_are_inferred=False, - ): - """ - Implements a standard numerical gradient checker for the operator - in question. - - Useful for checking the consistency of the forward and - backward implementations of operators. - - Usage example: - - @given(inputs=hu.tensors(n=2), in_place=st.booleans(), **hu.gcs) - def test_sum(self, inputs, in_place, gc, dc): - op = core.CreateOperator("Sum", ["X1", "X2"], - ["Y" if not in_place else "X1"]) - X1, X2 = inputs - self.assertGradientChecks(gc, op, [X1, X2], 0, [0]) - """ - gc = gradient_checker.GradientChecker( - stepsize=stepsize, - threshold=threshold, - device_option=device_option, - workspace_name=str(device_option), - input_device_options=input_device_options, - ) - res, grad, grad_estimated = gc.CheckSimple( - op, inputs, outputs_to_check, outputs_with_grads, - grad_ops=grad_ops, - input_device_options=input_device_options, - ensure_outputs_are_inferred=ensure_outputs_are_inferred, - ) - self.assertEqual(grad.shape, grad_estimated.shape) - self.assertTrue( - res, - "Gradient check failed for input " + str(op.input[outputs_to_check]) - ) - - def _assertGradReferenceChecks( - self, - op, - inputs, - ref_outputs, - output_to_grad, - grad_reference, - threshold=1e-4, - ): - grad_blob_name = output_to_grad + '_grad' - grad_ops, grad_map = core.GradientRegistry.GetBackwardPass( - [op], {output_to_grad: grad_blob_name}) - output_grad = workspace.FetchBlob(output_to_grad) - grad_ref_outputs = grad_reference(output_grad, ref_outputs, inputs) - workspace.FeedBlob(grad_blob_name, workspace.FetchBlob(output_to_grad)) - workspace.RunOperatorsOnce(grad_ops) - - self.assertEqual(len(grad_ref_outputs), len(inputs)) - for (n, ref) in zip(op.input, grad_ref_outputs): - grad_names = grad_map.get(n) - if not grad_names: - # no grad for this input - self.assertIsNone(ref) - else: - if isinstance(grad_names, core.BlobReference): - # dense gradient - ref_vals = ref - ref_indices = None - val_name = grad_names - else: - # sparse gradient - ref_vals, ref_indices = ref - val_name = grad_names.values - vals = workspace.FetchBlob(str(val_name)) - np.testing.assert_allclose( - vals, - ref_vals, - atol=threshold, - rtol=threshold, - err_msg='Gradient {0} (x) is not matching the reference (y)' - .format(val_name), - ) - 
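# At this point `vals` is the gradient computed by the actual backward
# pass and `ref_vals` is the one produced by the user-supplied
# grad_reference; for sparse gradients the values blob alone is not
# conclusive, so the accompanying indices blob is compared next.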
if ref_indices is not None: - indices = workspace.FetchBlob(str(grad_names.indices)) - np.testing.assert_allclose(indices, ref_indices, - atol=1e-4, rtol=1e-4) - - def _assertInferTensorChecks(self, name, shapes, types, output, - ensure_output_is_inferred=False): - self.assertTrue( - not ensure_output_is_inferred or (name in shapes), - 'Shape for {0} was not inferred'.format(name)) - - if name not in shapes: - # No inferred shape or type available - return - output = workspace.FetchBlob(name) - if type(output) is np.ndarray: - if output.dtype == np.dtype('float64'): - correct_type = caffe2_pb2.TensorProto.DOUBLE - elif output.dtype == np.dtype('float32'): - correct_type = caffe2_pb2.TensorProto.FLOAT - elif output.dtype == np.dtype('int32'): - correct_type = caffe2_pb2.TensorProto.INT32 - elif output.dtype == np.dtype('int64'): - correct_type = caffe2_pb2.TensorProto.INT64 - else: - correct_type = "unknown {}".format(output.dtype) - else: - correct_type = str(type(output)) - try: - np.testing.assert_array_equal( - np.array(shapes[name]).astype(np.int32), - np.array(output.shape).astype(np.int32), - err_msg='Shape {} mismatch: {} vs. {}'.format( - name, - shapes[name], - output.shape)) - # BUG: Workspace blob type not being set correctly T16121392 - if correct_type != caffe2_pb2.TensorProto.INT32: - return - np.testing.assert_equal( - types[name], - correct_type, - err_msg='Type {} mismatch: {} vs. {}'.format( - name, types[name], correct_type, - ) - ) - except AssertionError as e: - # Temporarily catch these assertion errors when validating - # inferred shape and type info - logging.warning(str(e)) - if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1' or ensure_output_is_inferred: - raise e - - def assertReferenceChecks( - self, - device_option, - op, - inputs, - reference, - input_device_options=None, - threshold=1e-4, - output_to_grad=None, - grad_reference=None, - atol=None, - outputs_to_check=None, - ensure_outputs_are_inferred=False, - ): - """ - This runs the reference Python implementation of the operator - (effectively calling `reference(*inputs)`) and compares that - to the operator's output, with an absolute/relative tolerance - given by the `threshold` parameter. - - Useful for checking that the operator implementation matches the Python - (typically NumPy) implementation of the same functionality.
- - Usage example: - - @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) - def test_softsign(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Softsign", ["X"], ["X" if inplace else "Y"]) - - def softsign(X): - return (X / (1 + np.abs(X)),) - - self.assertReferenceChecks(gc, op, [X], softsign) - """ - op = copy.deepcopy(op) - op.device_option.CopyFrom(device_option) - - with temp_workspace(): - if (len(op.input) > len(inputs)): - raise ValueError( - 'must supply an input for each input on the op: %s vs %s' % - (op.input, inputs)) - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - for (n, b) in zip(op.input, inputs): - workspace.FeedBlob( - n, - b, - device_option=_input_device_options.get(n, device_option) - ) - net = core.Net("opnet") - net.Proto().op.extend([op]) - test_shape_inference = False - try: - (shapes, types) = workspace.InferShapesAndTypes([net]) - test_shape_inference = True - except RuntimeError as e: - # Temporarily catch runtime errors when inferring shape - # and type info - logging.warning(str(e)) - if os.getenv('CAFFE2_ASSERT_SHAPEINFERENCE') == '1' or ensure_outputs_are_inferred: - raise e - workspace.RunNetOnce(net) - reference_outputs = reference(*inputs) - if not (isinstance(reference_outputs, tuple) or - isinstance(reference_outputs, list)): - raise RuntimeError( - "You are providing a wrong reference implementation. A " - "proper one should return a tuple/list of numpy arrays.") - if not outputs_to_check: - self.assertEqual(len(reference_outputs), len(op.output)) - outputs_to_check = list(range(len(op.output))) - outs = [] - for (output_index, ref) in zip(outputs_to_check, reference_outputs): - output_blob_name = op.output[output_index] - output = workspace.FetchBlob(output_blob_name) - if output.dtype.kind in ('S', 'O'): - np.testing.assert_array_equal(output, ref) - else: - if atol is None: - atol = threshold - np.testing.assert_allclose( - output, ref, atol=atol, rtol=threshold, - err_msg=( - 'Output {0} is not matching the reference'.format( - output_blob_name, - )), - ) - if test_shape_inference: - self._assertInferTensorChecks( - output_blob_name, shapes, types, output, - ensure_output_is_inferred=ensure_outputs_are_inferred) - outs.append(output) - if grad_reference is not None: - assert output_to_grad is not None, \ - "If grad_reference is set," \ - "output_to_grad has to be set as well" - - with core.DeviceScope(device_option): - self._assertGradReferenceChecks( - op, inputs, reference_outputs, - output_to_grad, grad_reference, - threshold=threshold) - - return outs - - def assertValidationChecks( - self, - device_option, - op, - inputs, - validator, - input_device_options=None, - as_kwargs=True, - init_net=None, - ): - if as_kwargs: - assert len(set(list(op.input) + list(op.output))) == \ - len(op.input) + len(op.output), \ - "in-place ops are not supported in as_kwargs mode" - op = copy.deepcopy(op) - op.device_option.CopyFrom(device_option) - - with temp_workspace(): - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - for (n, b) in zip(op.input, inputs): - workspace.FeedBlob( - n, - b, - device_option=_input_device_options.get(n, device_option) - ) - if init_net: - workspace.RunNetOnce(init_net) - workspace.RunOperatorOnce(op) - outputs = [workspace.FetchBlob(n) for n in op.output] - if as_kwargs: - validator(**dict(zip( - list(op.input) + list(op.output), inputs + outputs))) - else: - validator(inputs=inputs, outputs=outputs) - - def assertRunOpRaises( - 
self, - device_option, - op, - inputs, - input_device_options=None, - exception=(Exception,), - regexp=None, - ): - op = copy.deepcopy(op) - op.device_option.CopyFrom(device_option) - - with temp_workspace(): - _input_device_options = input_device_options or \ - core.InferOpBlobDevicesAsDict(op)[0] - for (n, b) in zip(op.input, inputs): - workspace.FeedBlob( - n, - b, - device_option=_input_device_options.get(n, device_option) - ) - if regexp is None: - self.assertRaises(exception, workspace.RunOperatorOnce, op) - else: - self.assertRaisesRegex( - exception, regexp, workspace.RunOperatorOnce, op) diff --git a/caffe2/python/ideep/LRN_op_test.py b/caffe2/python/ideep/LRN_op_test.py deleted file mode 100644 index 23ecd79062f7..000000000000 --- a/caffe2/python/ideep/LRN_op_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class LRNTest(hu.HypothesisTestCase): - @given(input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - im_size=st.integers(1, 10), - order=st.sampled_from(["NCHW"]), - **mu.gcs) - @settings(deadline=10000) - def test_LRN(self, input_channels, - batch_size, im_size, order, - gc, dc): - op = core.CreateOperator( - "LRN", - ["X"], - ["Y", "Y_scale"], - size=5, - alpha=0.001, - beta=0.75, - bias=2.0, - order=order, - ) - X = np.random.rand( - batch_size, input_channels, im_size, im_size).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/__init__.py b/caffe2/python/ideep/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/ideep/adam_op_test.py b/caffe2/python/ideep/adam_op_test.py deleted file mode 100644 index 5ac0395bff63..000000000000 --- a/caffe2/python/ideep/adam_op_test.py +++ /dev/null @@ -1,82 +0,0 @@ - - - - - -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestAdamOps(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **mu.gcs) - def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): - param, mom1, mom2, grad = inputs - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - mom2 = np.absolute(mom2) - op = core.CreateOperator( - "Adam", - ["param", "mom1", "mom2", "grad", "lr", "iter"], - ["output_param", "output_mom1", "output_mom2"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do} - - self.assertDeviceChecks( - dc, op, - [param, mom1, mom2, grad, 
LR, ITER], - [0], - input_device_options=input_device_options, - threshold=0.001) - - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **mu.gcs) - def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): - param, mom1, mom2, grad = inputs - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - mom2 = np.absolute(mom2) - - op = core.CreateOperator( - "Adam", - ["param", "mom1", "mom2", "grad", "lr", "iter"], - ["output_param", "output_mom1", "output_mom2", "output_grad"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do, 'lr': hu.cpu_do} - - self.assertDeviceChecks( - dc, op, - [param, mom1, mom2, grad, LR, ITER], - [0], - input_device_options=input_device_options, - threshold=0.001) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/blobs_queue_db_test.py b/caffe2/python/ideep/blobs_queue_db_test.py deleted file mode 100644 index 966fcc23d47d..000000000000 --- a/caffe2/python/ideep/blobs_queue_db_test.py +++ /dev/null @@ -1,109 +0,0 @@ - - - - - -import unittest -import numpy as np - -import caffe2.proto.caffe2_pb2 as caffe2_pb2 -from caffe2.python import core, workspace, timeout_guard - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class BlobsQueueDBTest(unittest.TestCase): - def test_create_blobs_queue_db_string(self): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - def add_blobs(queue, num_samples): - blob = core.BlobReference("blob") - status = core.BlobReference("blob_status") - for i in range(num_samples): - self._add_blob_to_queue( - queue, self._create_test_tensor_protos(i), blob, status - ) - self._test_create_blobs_queue_db(add_blobs) - - def test_create_blobs_queue_db_tensor(self): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - def add_blobs(queue, num_samples): - blob = core.BlobReference("blob") - status = core.BlobReference("blob_status") - for i in range(num_samples): - data = self._create_test_tensor_protos(i) - data = np.array([data], dtype=str) - self._add_blob_to_queue( - queue, data, blob, status - ) - self._test_create_blobs_queue_db(add_blobs) - - def _test_create_blobs_queue_db(self, add_blobs_fun): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - num_samples = 10000 - batch_size = 10 - init_net = core.Net('init_net') - net = core.Net('test_create_blobs_queue_db') - queue = init_net.CreateBlobsQueue([], 'queue', capacity=num_samples) - reader = init_net.CreateBlobsQueueDB( - [queue], - 'blobs_queue_db_reader', - value_blob_index=0, - timeout_secs=0.1, - ) - workspace.RunNetOnce(init_net) - add_blobs_fun(queue, num_samples) - - net.TensorProtosDBInput( - [reader], - ['image', 'label'], - batch_size=batch_size - ) - workspace.CreateNet(net) - - close_net = core.Net('close_net') - close_net.CloseBlobsQueue([queue], []) - - for i in range(int(num_samples / batch_size)): - with timeout_guard.CompleteInTimeOrDie(2.0): - workspace.RunNet(net) - - images = 
workspace.FetchBlob('image') - labels = workspace.FetchBlob('label') - self.assertEqual(batch_size, len(images)) - self.assertEqual(batch_size, len(labels)) - for idx, item in enumerate(images): - self.assertEqual( - "foo{}".format(i * batch_size + idx).encode('utf-8'), item - ) - for item in labels: - self.assertEqual(1, item) - workspace.RunNetOnce(close_net) - - def _add_blob_to_queue(self, queue, data, blob, status): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - workspace.FeedBlob(blob, data, core.DeviceOption(caffe2_pb2.CPU, 0)) - op = core.CreateOperator( - "SafeEnqueueBlobs", - [queue, blob], - [blob, status], - ) - - workspace.RunOperatorOnce(op) - - def _create_test_tensor_protos(self, idx): - item = caffe2_pb2.TensorProtos() - data = item.protos.add() - data.data_type = core.DataType.STRING - data.string_data.append("foo{}".format(idx).encode('utf-8')) - label = item.protos.add() - label.data_type = core.DataType.INT32 - label.int32_data.append(1) - - return item.SerializeToString() - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/ideep/channel_shuffle_op_test.py b/caffe2/python/ideep/channel_shuffle_op_test.py deleted file mode 100644 index b4cedca61061..000000000000 --- a/caffe2/python/ideep/channel_shuffle_op_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ChannelShuffleTest(hu.HypothesisTestCase): - @given(size=st.integers(8, 10), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 32), - group=st.integers(2, 4), - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - **mu.gcs) - @settings(max_examples=10, deadline=None) - def test_channel_shuffle(self, size, input_channels, batch_size, group, stride, pad, kernel, gc, dc): - op = core.CreateOperator( - "ChannelShuffle", - ["X"], - ["Y"], - group=group, - stride=stride, - pad=pad, - kernel=kernel, - ) - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - - self.assertDeviceChecks(dc, op, [X], [0]) - - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/concat_split_op_test.py b/caffe2/python/ideep/concat_split_op_test.py deleted file mode 100644 index 75c9ceeba0e4..000000000000 --- a/caffe2/python/ideep/concat_split_op_test.py +++ /dev/null @@ -1,164 +0,0 @@ - - - - - -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, workspace -from hypothesis import given, settings -import caffe2.python.ideep_test_util as mu - -@st.composite -def _tensor_splits(draw, add_axis=False): - """Generates (axis, split_info, tensor_splits) tuples.""" - tensor = draw(hu.tensor(min_dim=2, min_value=4)) # Each dim has at least 4 elements. - axis = draw(st.integers(-len(tensor.shape), len(tensor.shape) - 1)) - if add_axis: - # Simple case: get individual slices along one axis, where each of them - # is (N-1)-dimensional. The axis will be added back upon concatenation. 
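# Worked example: for tensor.shape == (2, 3) and axis == 1 this yields
# split sizes [1, 1, 1] and three slices of shape (2,), which Concat
# with add_axis=1 stacks back into a (2, 3) result.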
- return ( - axis, - np.ones(tensor.shape[axis], dtype=np.int32), - [ - np.array(tensor.take(i, axis=axis)) - for i in range(tensor.shape[axis]) - ] - ) - else: - # General case: pick some (possibly consecutive, even non-unique) - # indices at which we will split the tensor, along the given axis. - splits = sorted(draw( - st.lists(elements=st.integers(0, tensor.shape[axis]), max_size=4) - ) + [0, tensor.shape[axis]]) - # Not support empty tensor - splits = list(set(splits)) - return ( - axis, - np.array(np.diff(splits), dtype=np.int32), - [ - tensor.take(range(splits[i], splits[i + 1]), axis=axis) - for i in range(len(splits) - 1) - ], - ) - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestConcatSplitOps(hu.HypothesisTestCase): - @given(tensor_splits=_tensor_splits(), - **mu.gcs) - @settings(deadline=10000) - def test_concat(self, tensor_splits, gc, dc): - axis, _, splits = tensor_splits - - op = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result', 'split_info'], - axis=axis - ) - - self.assertDeviceChecks(dc, op, splits, [0, 1]) - self.assertGradientChecks(gc, op, splits, 0, [0]) - - @given(tensor_splits=_tensor_splits(), - split_as_arg=st.booleans(), - **mu.gcs) - @settings(deadline=10000) - def test_split(self, tensor_splits, split_as_arg, gc, dc): - axis, split_info, splits = tensor_splits - - split_as_arg = True - - if split_as_arg: - input_names = ['input'] - input_tensors = [np.concatenate(splits, axis=axis)] - kwargs = dict(axis=axis, split=split_info) - else: - input_names = ['input', 'split'] - input_tensors = [np.concatenate(splits, axis=axis), split_info] - kwargs = dict(axis=axis) - - op = core.CreateOperator( - "Split", - input_names, - ['X_{}'.format(i) for i in range(len(split_info))], - **kwargs - ) - - def split_ref(input, split=split_info): - s = np.cumsum([0] + list(split)) - return [ - np.array(input.take(np.arange(s[i], s[i + 1]), axis=axis)) - for i in range(len(split)) - ] - outputs_with_grad = range(len(split_info)) - self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) - self.assertGradientChecks(gc, op, input_tensors, 0, outputs_with_grad) - - @given(tensor_splits=_tensor_splits(add_axis=True), **mu.gcs) - @settings(deadline=10000) - def test_concat_add_axis(self, tensor_splits, gc, dc): - axis, _, splits = tensor_splits - op = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result', 'split_info'], - axis=axis, - add_axis=1 - ) - - self.assertDeviceChecks(dc, op, splits, [0, 1]) - - for i in range(len(splits)): - self.assertGradientChecks(gc, op, splits, i, [0]) - - - @given(tensor_splits=_tensor_splits(add_axis=True), **mu.gcs) - def test_concat_with_TensorCPU(self, tensor_splits, gc, dc): - axis, _, splits = tensor_splits - op0 = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result0', 'split_info0'], - axis=axis, - add_axis=1, - device_option=dc[0] - ) - op1 = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result1', 'split_info1'], - axis=axis, - add_axis=1, - device_option=dc[1] - ) - - for i, X in enumerate(splits): - workspace.FeedBlob('X_{}'.format(i), X, dc[0]) - - workspace.RunOperatorOnce(op0) - res0 = workspace.FetchBlob('concat_result0') - inf0 = workspace.FetchBlob('split_info0') - - workspace.RunOperatorOnce(op1) - res1 = workspace.FetchBlob('concat_result1') - inf1 = workspace.FetchBlob('split_info1') - - if not 
np.allclose(res0, res1, atol=0.0, rtol=0.0): - print(res1.flatten()) - print(res0.flatten()) - print(np.max(np.abs(res1 - res0))) - self.assertTrue(False) - - if not np.allclose(inf0, inf1, atol=0.0, rtol=0.0): - print(inf1.flatten()) - print(inf0.flatten()) - print(np.max(np.abs(inf1 - inf0))) - self.assertTrue(False) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/conv_op_test.py b/caffe2/python/ideep/conv_op_test.py deleted file mode 100644 index 7c5a0026c113..000000000000 --- a/caffe2/python/ideep/conv_op_test.py +++ /dev/null @@ -1,165 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.transformations import optimizeForMKLDNN -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ConvTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 10), - input_channels=st.integers(1, 3), - output_channels=st.integers(1, 5), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - training_mode=st.booleans(), - group=st.integers(1, 2), - **mu.gcs) - @settings(max_examples=10, deadline=None) - def test_convolution(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, training_mode, group, gc, dc): - training = 1 if training_mode else 0 - op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - training_mode=training, - ) - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand(output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0]) - - if training_mode: - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0], threshold=0.01) - - @settings(max_examples=10, deadline=None) - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - size=st.integers(8, 10), - input_channels=st.integers(16, 32), - output_channels=st.integers(16, 32), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - training_mode=st.booleans(), - **mu.gcs) - def test_winograd_convolution(self, stride, pad, size, - input_channels, output_channels, - batch_size, use_bias, training_mode, gc, dc): - training = 1 if training_mode else 0 - conv3x3_winograd_algorithm = 1 - kernel = 3 - op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - training_mode=training, - algorithm=conv3x3_winograd_algorithm - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0]) - - if training_mode: - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0], threshold=0.01) - - @given(batch_size=st.integers(1, 3), **mu.gcs) - def 
test_depthwise_convolution(self, batch_size, gc, dc): - op = core.CreateOperator( - "Conv", - ["X", "w", "b"], - ["Y"], - stride=1, - pad=0, - kernel=1, - group=4, - device_option=dc[0] - ) - op1 = core.CreateOperator( - "Conv", - ["X", "w", "b"], - ["Y"], - stride=1, - pad=0, - kernel=1, - group=4, - device_option=dc[1] - ) - X = np.random.rand(batch_size, 544, 14, 14).astype(np.float32) - w = np.random.rand(544, 136, 1, 1).astype(np.float32) - b = np.random.rand(544).astype(np.float32) - - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X', X, dc[0]) - workspace.FeedBlob('w', w, dc[0]) - workspace.FeedBlob('b', b, dc[0]) - workspace.RunOperatorOnce(op) - Y0 = workspace.FetchBlob('Y') - - workspace.ResetWorkspace() - workspace.FeedBlob('X', X, dc[1]) - workspace.FeedBlob('w', w, dc[1]) - workspace.FeedBlob('b', b, dc[1]) - net = core.Net("net") - old_net = caffe2_pb2.NetDef() - old_net.op.extend([op1]) - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - workspace.RunOperatorOnce(net.Proto().op[0]) - Y1 = workspace.FetchBlob('Y') - - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - workspace.ResetWorkspace() - workspace.FeedBlob('X', X, dc[1]) - workspace.FeedBlob('w', w, dc[1]) - workspace.FeedBlob('b', b, dc[1]) - workspace.RunOperatorOnce(op1) - Y2 = workspace.FetchBlob('Y') - - if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01): - print(Y2.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y2 - Y0))) - self.assertTrue(False) - - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/conv_transpose_test.py b/caffe2/python/ideep/conv_transpose_test.py deleted file mode 100644 index eeda2ea43a2d..000000000000 --- a/caffe2/python/ideep/conv_transpose_test.py +++ /dev/null @@ -1,76 +0,0 @@ - - - - -import unittest -import numpy as np -from hypothesis import assume, given, settings -import hypothesis.strategies as st - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ConvTransposeTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 2), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - adj=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - training_mode=st.booleans(), - compute_dX=st.booleans(), - **mu.gcs) - @settings(max_examples=2, timeout=100) - def test_convolution_transpose_gradients(self, stride, pad, kernel, adj, - size, input_channels, - output_channels, batch_size, - use_bias, training_mode, - compute_dX, gc, dc): - training = 1 if training_mode else 0 - assume(adj < stride) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, output_channels, kernel, kernel)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - adj=adj, - training_mode=training, - no_gradient_to_input=not compute_dX, - ) - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0], threshold=0.001) - - if training_mode: - if use_bias and compute_dX: - # w, b, X 
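# (The indices below are positions in `inputs`, i.e. 0 -> X, 1 -> w,
# 2 -> b; the gradient w.r.t. X is only checked when compute_dX is set,
# since no_gradient_to_input=not compute_dX suppresses it otherwise.)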
- outputs_to_check = [1, 2, 0] - elif use_bias: - # w, b - outputs_to_check = [1, 2] - elif compute_dX: - # w, X - outputs_to_check = [1, 0] - else: - # w - outputs_to_check = [1] - for i in outputs_to_check: - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/convfusion_op_test.py b/caffe2/python/ideep/convfusion_op_test.py deleted file mode 100644 index a0a782ab8a03..000000000000 --- a/caffe2/python/ideep/convfusion_op_test.py +++ /dev/null @@ -1,861 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -import math -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.transformations import optimizeForMKLDNN -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ConvFusionTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 1), - **mu.gcs) - def test_convolution_relu_fusion(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, group, gc, dc): - conv = core.CreateOperator( - "Conv", - ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["Y0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - relu = core.CreateOperator( - "Relu", - ["Y0"], - ["Y0"], - device_option=dc[0] - ) - - # Manual fusion for Conv + ReLU - conv_fusion = core.CreateOperator( - "ConvFusion", - ["X1", "w1", "b1"] if use_bias else ["X1", "w1"], - ["Y1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - fusion_type = 1, - device_option=dc[1] - ) - - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('w0', w, dc[0]) - workspace.FeedBlob('b0', b, dc[0]) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(relu) - Y0 = workspace.FetchBlob('Y0') - - workspace.ResetWorkspace() - workspace.FeedBlob('X1', X, dc[1]) - workspace.FeedBlob('w1', w, dc[1]) - workspace.FeedBlob('b1', b, dc[1]) - workspace.RunOperatorOnce(conv_fusion) - Y1 = workspace.FetchBlob('Y1') - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - # Auto fusion for Conv + ReLU - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - relu_old = caffe2_pb2.OperatorDef() - relu_old.CopyFrom(relu) - relu_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([conv_old, relu_old]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 1) - 
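# optimizeForMKLDNN is expected to collapse the Conv + Relu pair into a
# single ConvFusion op, matching the manual fusion above that used
# fusion_type=1.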
self.assertTrue(net.Proto().op[0].type == "ConvFusion") - workspace.RunOperatorOnce(net.Proto().op[0]) - Y2 = workspace.FetchBlob('Y0') - if not np.allclose(Y0, Y2, atol=0.01, rtol=0.01): - print(Y2.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y2 - Y0))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 1), - sum_add=st.sampled_from(["Sum", "Add"]), - **mu.gcs) - def test_convolution_sum_fusion(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, group, sum_add, gc, dc): - pool_S0 = core.CreateOperator( - "MaxPool", - ["SX0"], - ["S0"], - stride=2, - pad=0, - kernel=2, - device_option=dc[0] - ) - conv = core.CreateOperator( - "Conv", - ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["Y0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - sum = core.CreateOperator( - sum_add, - ["S0", "Y0"], - ["S0"], - device_option=dc[0] - ) - - # Manual fusion for Conv + Sum - pool_S1 = core.CreateOperator( - "MaxPool", - ["SX1"], - ["S1"], - stride=2, - pad=0, - kernel=2, - group=group, - device_option=dc[1] - ) - conv_fusion = core.CreateOperator( - "ConvFusion", - ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"], - ["S1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - fusion_type = 2, - device_option=dc[1] - ) - pool_input_size = int(math.ceil(float(size + 2 * pad - kernel + 1) / stride)) * 2; - SX = np.random.rand( - batch_size, output_channels * group, pool_input_size, pool_input_size).astype(np.float32) - 0.5 - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('SX0', SX, dc[0]) - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('w0', w, dc[0]) - workspace.FeedBlob('b0', b, dc[0]) - workspace.RunOperatorOnce(pool_S0) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(sum) - S0 = workspace.FetchBlob('S0') - - workspace.ResetWorkspace() - workspace.FeedBlob('SX1', SX, dc[1]) - workspace.FeedBlob('X1', X, dc[1]) - workspace.FeedBlob('w1', w, dc[1]) - workspace.FeedBlob('b1', b, dc[1]) - workspace.RunOperatorOnce(pool_S1) - workspace.RunOperatorOnce(conv_fusion) - S1 = workspace.FetchBlob('S1') - - if not np.allclose(S0, S1, atol=0.01, rtol=0.01): - print(S1.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S1 - S0))) - self.assertTrue(False) - - # Auto fusion for Conv + Sum - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - pool_S0_old = caffe2_pb2.OperatorDef() - pool_S0_old.CopyFrom(pool_S0) - pool_S0_old.device_option.CopyFrom(dc[1]) - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - sum_old = caffe2_pb2.OperatorDef() - sum_old.CopyFrom(sum) - sum_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([pool_S0_old, conv_old, sum_old]) - - # Conv + Sum should be fused case: [PreNode, Conv, Sum] - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('X0', X, 
dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 2) - self.assertTrue(net.Proto().op[1].type == "ConvFusion") - workspace.RunNetOnce(net.Proto()) - # The output tensor name will be changed by optimization - # sometimes when applying conv sum fusion - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - # Conv + Sum should be fused case: [Conv, PreNode, Sum] - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - old_net.op.extend([conv_old, pool_S0_old, sum_old]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 2) - self.assertTrue(net.Proto().op[1].type == "ConvFusion") - workspace.RunNetOnce(net.Proto()) - # The output tensor name will be changed by optimization - # sometimes when applying conv sum fusion - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum] Conv output is used by midOp - dropout = core.CreateOperator( - "Dropout", - ["Y0"], - ["Y_dropout"], - ratio=0.5, - is_test=True, - device_option=dc[1] - ) - - workspace.ResetWorkspace() - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - old_net = caffe2_pb2.NetDef() - old_net.op.extend([conv_old, dropout, pool_S0_old, sum_old]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 4) - workspace.RunNetOnce(net.Proto()) - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - # Conv + Sum should not be fused case: [Conv, preNode, Sum, midOp] preNode output is used by midOp - sum1 = core.CreateOperator( - sum_add, - ["S0", "Y0"], - ["S3"], - device_option=dc[1] - ) - dropout = core.CreateOperator( - "Dropout", - ["S0"], - ["Y_dropout"], - ratio=0.5, - is_test=True, - device_option=dc[1] - ) - - workspace.ResetWorkspace() - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - old_net = caffe2_pb2.NetDef() - old_net.op.extend([conv_old, pool_S0_old, sum1, dropout]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - print("net={}\n".format(net.Proto())) - self.assertTrue(len(net.Proto().op) == 4) - workspace.RunNetOnce(net.Proto()) - S2 = workspace.FetchBlob(net.Proto().op[-2].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - # Conv + Sum should not be fused case: [Conv, midOp, preNode, Sum] - # midOp output has the same name with that of the Conv input - relu_0 = core.CreateOperator( - "Relu", - ["X0"], - ["X1"], - 
device_option=dc[0] - ) - conv = core.CreateOperator( - "Conv", - ["X1", "w0", "b0"] if use_bias else ["X1", "w0"], - ["Y0"], - stride=1, - pad=0, - kernel=1, - device_option=dc[0] - ) - relu_1 = core.CreateOperator( - "Relu", - ["X1"], - ["X1"], - device_option=dc[0] - ) - pool = core.CreateOperator( - "MaxPool", - ["X1"], - ["S0"], - stride=1, - pad=0, - kernel=1, - device_option=dc[0] - ) - sum = core.CreateOperator( - "Sum", - ["S0", "Y0"], - ["S0"], - device_option=dc[0] - ) - - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, input_channels, 1, 1).astype(np.float32) - 0.5 - b = np.random.rand(input_channels).astype(np.float32) - 0.5 - - workspace.SwitchWorkspace(old_ws_name) - workspace.ResetWorkspace() - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('w0', w, dc[0]) - workspace.FeedBlob('b0', b, dc[0]) - workspace.RunOperatorOnce(relu_0) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(relu_1) - workspace.RunOperatorOnce(pool) - workspace.RunOperatorOnce(sum) - S0 = workspace.FetchBlob('S0') - - workspace.ResetWorkspace() - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - relu_0_old = caffe2_pb2.OperatorDef() - relu_0_old.CopyFrom(relu_0) - relu_0_old.device_option.CopyFrom(dc[1]) - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - relu_1_old = caffe2_pb2.OperatorDef() - relu_1_old.CopyFrom(relu_1) - relu_1_old.device_option.CopyFrom(dc[1]) - pool_old = caffe2_pb2.OperatorDef() - pool_old.CopyFrom(pool) - pool_old.device_option.CopyFrom(dc[1]) - sum_old = caffe2_pb2.OperatorDef() - sum_old.CopyFrom(sum) - sum_old.device_option.CopyFrom(dc[1]) - - old_net = caffe2_pb2.NetDef() - old_net.op.extend([relu_0_old, conv_old, relu_1_old, pool_old, sum_old]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 5) - workspace.RunNetOnce(net.Proto()) - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 1), - sum_add=st.sampled_from(["Sum", "Add"]), - **mu.gcs) - def test_convolution_sum_relu_fusion(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, group, sum_add, gc, dc): - conv_S0 = core.CreateOperator( - "Conv", - ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"], - ["S0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - conv = core.CreateOperator( - "Conv", - ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["Y0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - sum = core.CreateOperator( - sum_add, - ["S0", "Y0"], - ["S0"], - device_option=dc[0] - ) - relu = core.CreateOperator( - "Relu", - ["S0"], - ["S0"], - device_option=dc[0] - ) - - # Manual fusion for Conv + Sum + ReLU - conv_S1 = core.CreateOperator( - "Conv", - ["SX1", "Sw1", "Sb1"] if use_bias else ["SX1", "Sw1"], - ["S1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - 
device_option=dc[1] - ) - conv_fusion = core.CreateOperator( - "ConvFusion", - ["X1", "w1", "b1", "S1"] if use_bias else ["X1", "w1", "S1"], - ["S1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - fusion_type = 3, - device_option=dc[1] - ) - SX = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - Sw = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('SX0', SX, dc[0]) - workspace.FeedBlob('Sw0', Sw, dc[0]) - workspace.FeedBlob('Sb0', Sb, dc[0]) - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('w0', w, dc[0]) - workspace.FeedBlob('b0', b, dc[0]) - workspace.RunOperatorOnce(conv_S0) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(sum) - workspace.RunOperatorOnce(relu) - S0 = workspace.FetchBlob('S0') - - workspace.ResetWorkspace() - workspace.FeedBlob('SX1', SX, dc[1]) - workspace.FeedBlob('Sw1', Sw, dc[1]) - workspace.FeedBlob('Sb1', Sb, dc[1]) - workspace.FeedBlob('X1', X, dc[1]) - workspace.FeedBlob('w1', w, dc[1]) - workspace.FeedBlob('b1', b, dc[1]) - workspace.RunOperatorOnce(conv_S1) - workspace.RunOperatorOnce(conv_fusion) - S1 = workspace.FetchBlob('S1') - - if not np.allclose(S0, S1, atol=0.01, rtol=0.01): - print(S1.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S1 - S0))) - self.assertTrue(False) - - # Auto fusion for Conv + Sum + ReLU - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - conv_S0_old = caffe2_pb2.OperatorDef() - conv_S0_old.CopyFrom(conv_S0) - conv_S0_old.device_option.CopyFrom(dc[1]) - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - sum_old = caffe2_pb2.OperatorDef() - sum_old.CopyFrom(sum) - sum_old.device_option.CopyFrom(dc[1]) - relu_old = caffe2_pb2.OperatorDef() - relu_old.CopyFrom(relu) - relu_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old]) - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('Sw0', Sw, dc[1]) - workspace.FeedBlob('Sb0', Sb, dc[1]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 2) - self.assertTrue(net.Proto().op[1].type == "ConvFusion") - workspace.RunNetOnce(net.Proto()) - # The output tensor name will be changed by optimization - # sometimes when applying conv sum fusion - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(7, 17), - output_channels=st.integers(5, 15), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(2, 5), - **mu.gcs) - def 
test_convolution_grouped_sum_relu_fusion(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, group, gc, dc): - conv_S0 = core.CreateOperator( - "Conv", - ["SX0", "Sw0", "Sb0"] if use_bias else ["SX0", "Sw0"], - ["S0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - conv = core.CreateOperator( - "Conv", - ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["Y0"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[0] - ) - sum = core.CreateOperator( - "Sum", - ["S0", "Y0"], - ["S0"], - device_option=dc[0] - ) - relu = core.CreateOperator( - "Relu", - ["S0"], - ["S0"], - device_option=dc[0] - ) - - SX = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - Sw = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - Sb = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('SX0', SX, dc[0]) - workspace.FeedBlob('Sw0', Sw, dc[0]) - workspace.FeedBlob('Sb0', Sb, dc[0]) - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('w0', w, dc[0]) - workspace.FeedBlob('b0', b, dc[0]) - workspace.RunOperatorOnce(conv_S0) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(sum) - workspace.RunOperatorOnce(relu) - S0 = workspace.FetchBlob('S0') - - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - conv_S0_old = caffe2_pb2.OperatorDef() - conv_S0_old.CopyFrom(conv_S0) - conv_S0_old.device_option.CopyFrom(dc[1]) - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - sum_old = caffe2_pb2.OperatorDef() - sum_old.CopyFrom(sum) - sum_old.device_option.CopyFrom(dc[1]) - relu_old = caffe2_pb2.OperatorDef() - relu_old.CopyFrom(relu) - relu_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([conv_S0_old, conv_old, sum_old, relu_old]) - workspace.FeedBlob('SX0', SX, dc[1]) - workspace.FeedBlob('Sw0', Sw, dc[1]) - workspace.FeedBlob('Sb0', Sb, dc[1]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - workspace.RunNetOnce(net.Proto()) - # The output tensor name will be changed by optimization - # sometimes when applying conv sum fusion - S2 = workspace.FetchBlob(net.Proto().op[-1].output[0]) - if not np.allclose(S0, S2, atol=0.01, rtol=0.01): - print(S2.flatten()) - print(S0.flatten()) - print(np.max(np.abs(S2 - S0))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 1), - inplace=st.sampled_from([True, False]), - **mu.gcs) - def test_convolution_bn_folding( - self, stride, pad, kernel, size, input_channels, - output_channels, batch_size, use_bias, group, - inplace, gc, dc): - conv = core.CreateOperator( - "Conv", - 
["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["X1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[1] - ) - bn = core.CreateOperator( - "SpatialBN", - ["X1", "scale", "bias", "mean", "var"], - ["X1" if inplace else "Y"], - is_test=True, - device_option=dc[1] - ) - - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - scale = np.random.rand(output_channels).astype(np.float32) + 0.5 - bias = np.random.rand(output_channels).astype(np.float32) - 0.5 - mean = np.random.randn(output_channels).astype(np.float32) - var = np.absolute(np.random.rand(output_channels).astype(np.float32)) + 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - workspace.FeedBlob('scale', scale, dc[1]) - workspace.FeedBlob('bias', bias, dc[1]) - workspace.FeedBlob('mean', mean, dc[1]) - workspace.FeedBlob('var', var, dc[1]) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(bn) - Y = workspace.FetchBlob('X1' if inplace else "Y") - - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - bn_old = caffe2_pb2.OperatorDef() - bn_old.CopyFrom(bn) - bn_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([conv_old, bn_old]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - workspace.FeedBlob('scale', scale, dc[1]) - workspace.FeedBlob('bias', bias, dc[1]) - workspace.FeedBlob('mean', mean, dc[1]) - workspace.FeedBlob('var', var, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 1) - self.assertTrue(net.Proto().op[0].type == "Conv") - workspace.RunOperatorOnce(net.Proto().op[0]) - Y1 = workspace.FetchBlob('X1' if inplace else "Y") - if not np.allclose(Y, Y1, atol=0.01, rtol=0.01): - print(Y.flatten()) - print(Y1.flatten()) - print(np.max(np.abs(Y - Y1))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 1), - inplace=st.sampled_from([True, False]), - **mu.gcs) - def test_convolution_affch_folding( - self, stride, pad, kernel, size, input_channels, - output_channels, batch_size, use_bias, group, - inplace, gc, dc): - conv = core.CreateOperator( - "Conv", - ["X0", "w0", "b0"] if use_bias else ["X0", "w0"], - ["X1"], - stride=stride, - pad=pad, - kernel=kernel, - group=group, - device_option=dc[1] - ) - affch = core.CreateOperator( - "AffineChannel", - ["X1", "scale", "bias"], - ["X1" if inplace else "Y"], - device_option=dc[1] - ) - - X = np.random.rand( - batch_size, input_channels * group, size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - scale = 
np.random.rand(output_channels).astype(np.float32) + 0.5 - bias = np.random.rand(output_channels).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - workspace.FeedBlob('scale', scale, dc[1]) - workspace.FeedBlob('bias', bias, dc[1]) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(affch) - Y = workspace.FetchBlob('X1' if inplace else "Y") - - workspace.ResetWorkspace() - old_net = caffe2_pb2.NetDef() - conv_old = caffe2_pb2.OperatorDef() - conv_old.CopyFrom(conv) - conv_old.device_option.CopyFrom(dc[1]) - affch_old = caffe2_pb2.OperatorDef() - affch_old.CopyFrom(affch) - affch_old.device_option.CopyFrom(dc[1]) - old_net.op.extend([conv_old, affch_old]) - workspace.FeedBlob('X0', X, dc[1]) - workspace.FeedBlob('w0', w, dc[1]) - workspace.FeedBlob('b0', b, dc[1]) - workspace.FeedBlob('scale', scale, dc[1]) - workspace.FeedBlob('bias', bias, dc[1]) - net = core.Net("net") - net.Proto().CopyFrom(old_net) - optimizeForMKLDNN(net) - self.assertTrue(len(net.Proto().op) == 1) - self.assertTrue(net.Proto().op[0].type == "Conv") - workspace.RunOperatorOnce(net.Proto().op[0]) - Y1 = workspace.FetchBlob('X1' if inplace else "Y") - if not np.allclose(Y, Y1, atol=0.01, rtol=0.01): - print(Y.flatten()) - print(Y1.flatten()) - print(np.max(np.abs(Y - Y1))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/copy_op_test.py b/caffe2/python/ideep/copy_op_test.py deleted file mode 100644 index 668282f2e159..000000000000 --- a/caffe2/python/ideep/copy_op_test.py +++ /dev/null @@ -1,99 +0,0 @@ - - - - - -import unittest -import numpy as np -from random import randint -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class CopyTest(unittest.TestCase): - def _get_deep_device(self): - return caffe2_pb2.DeviceOption(device_type=caffe2_pb2.IDEEP) - - def test_copy_to_ideep(self): - op = core.CreateOperator( - "CopyCPUToIDEEP", - ["X"], - ["X_ideep"], - ) - op.device_option.CopyFrom(self._get_deep_device()) - n = randint(1, 128) - c = randint(1, 64) - h = randint(1, 128) - w = randint(1, 128) - X = np.random.rand(n, c, h, w).astype(np.float32) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(op) - X_ideep = workspace.FetchBlob("X_ideep") - np.testing.assert_allclose(X, X_ideep) - - def test_copy_to_ideep_zero_dim(self): - op = core.CreateOperator( - "CopyCPUToIDEEP", - ["X"], - ["X_ideep"], - ) - op.device_option.CopyFrom(self._get_deep_device()) - n = 0 - c = randint(1, 128) - X = np.random.rand(n, c).astype(np.float32) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(op) - X_ideep = workspace.FetchBlob("X_ideep") - np.testing.assert_allclose(X, X_ideep) - - def test_copy_from_ideep(self): - op = core.CreateOperator( - "CopyIDEEPToCPU", - ["X_ideep"], - ["X"], - ) - op.device_option.CopyFrom(self._get_deep_device()) - n = randint(1, 128) - c = randint(1, 64) - h = randint(1, 128) - w = randint(1, 128) - X = np.random.rand(n, c, h, w).astype(np.float32) - workspace.FeedBlob("X_ideep", X, self._get_deep_device()) - workspace.RunOperatorOnce(op) - X_ideep = workspace.FetchBlob("X") - np.testing.assert_allclose(X, X_ideep) - - def test_copy_from_ideep_zero_dim(self): - op = core.CreateOperator( - 
"CopyIDEEPToCPU", - ["X_ideep"], - ["X"], - ) - op.device_option.CopyFrom(self._get_deep_device()) - n = 0 - c = randint(1, 64) - X = np.random.rand(n, c).astype(np.float32) - workspace.FeedBlob("X_ideep", X, self._get_deep_device()) - workspace.RunOperatorOnce(op) - X_ideep = workspace.FetchBlob("X") - np.testing.assert_allclose(X, X_ideep) - - def test_copy_from_ideep_fallthrough(self): - op = core.CreateOperator( - "CopyIDEEPToCPU", - ["X_ideep"], - ["X"],) - op.device_option.CopyFrom(self._get_deep_device()) - n = randint(1, 128) - c = randint(1, 64) - h = randint(1, 128) - w = randint(1, 128) - X = np.random.rand(n, c, h, w).astype(np.float32) - workspace.FeedBlob("X_ideep", X) - workspace.RunOperatorOnce(op) - X_ideep = workspace.FetchBlob("X") - np.testing.assert_allclose(X, X_ideep) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/dropout_op_test.py b/caffe2/python/ideep/dropout_op_test.py deleted file mode 100644 index 5e7c37eaf9fe..000000000000 --- a/caffe2/python/ideep/dropout_op_test.py +++ /dev/null @@ -1,59 +0,0 @@ - - - - - -import unittest -from hypothesis import given -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class DropoutTest(hu.HypothesisTestCase): - - @given(X=hu.tensor(), - in_place=st.booleans(), - ratio=st.floats(0, 0.999), - **mu.gcs) - def test_dropout_is_test(self, X, in_place, ratio, gc, dc): - """Test with is_test=True for a deterministic reference impl.""" - op = core.CreateOperator('Dropout', ['X'], - ['X' if in_place else 'Y'], - ratio=ratio, is_test=True) - - self.assertDeviceChecks(dc, op, [X], [0]) - # No sense in checking gradients for test phase - - def reference_dropout_test(x): - return x, np.ones(x.shape, dtype=bool) - self.assertReferenceChecks( - gc, op, [X], reference_dropout_test, - # The 'mask' output may be uninitialized - outputs_to_check=[0]) - - @given(X=hu.tensor(), - in_place=st.booleans(), - output_mask=st.booleans(), - **mu.gcs) - @unittest.skipIf(True, "Skip duo to different rand seed.") - def test_dropout_ratio0(self, X, in_place, output_mask, gc, dc): - """Test with ratio=0 for a deterministic reference impl.""" - is_test = not output_mask - op = core.CreateOperator('Dropout', ['X'], - ['X' if in_place else 'Y'] + - (['mask'] if output_mask else []), - ratio=0.0, is_test=is_test) - - self.assertDeviceChecks(dc, op, [X], [0]) - - def reference_dropout_ratio0(x): - return (x,) if is_test else (x, np.ones(x.shape, dtype=bool)) - self.assertReferenceChecks( - gc, op, [X], reference_dropout_ratio0, outputs_to_check=[0]) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/elementwise_sum_op_test.py b/caffe2/python/ideep/elementwise_sum_op_test.py deleted file mode 100644 index 11a35d6b2b28..000000000000 --- a/caffe2/python/ideep/elementwise_sum_op_test.py +++ /dev/null @@ -1,190 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ElementwiseSumTest(hu.HypothesisTestCase): - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - 
batch_size=st.integers(1, 3), - inputs=st.integers(2, 7), - inplace=st.booleans(), - **mu.gcs) - def test_elementwise_sum(self, - size, - input_channels, - batch_size, - inputs, - inplace, - gc, - dc): - op = core.CreateOperator( - "Sum", - ["X_{}".format(i) for i in range(inputs)], - ["X_0" if inplace else "Y"], - ) - Xs = [np.random.rand(batch_size, input_channels, size, size).astype( - np.float32) for _ in range(inputs)] - self.assertDeviceChecks(dc, op, Xs, [0]) - - - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inputs=st.integers(2, 7), - inplace=st.booleans(), - **mu.gcs_cpu_ideep) - def test_elementwise_sum_fallback(self, - size, - input_channels, - batch_size, - inputs, - inplace, - gc, - dc): - op = core.CreateOperator( - "Sum", - ["X_{}".format(i) for i in range(inputs)], - ["X_0" if inplace else "Y"], - device_option=dc[1] - ) - Xs = [np.random.rand(batch_size, input_channels, size, size).astype( - np.float32) for _ in range(inputs)] - - sum_val = Xs[0] - workspace.FeedBlob("X_0", Xs[0], dc[0]) - for i, x in enumerate(Xs): - if i == 0: continue - sum_val += x - workspace.FeedBlob("X_{}".format(i), x, dc[1]) - - workspace.RunOperatorOnce(op) - Y = workspace.FetchBlob("X_0" if inplace else "Y") - - if not np.allclose(sum_val, Y, atol=0.01, rtol=0.01): - print(Y.flatten()) - print(sum_val.flatten()) - print(np.max(np.abs(Y - sum_val))) - self.assertTrue(False) - - - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inputs=st.integers(2, 7), - inplace=st.booleans(), - **mu.gcs_cpu_ideep) - def test_int8_elementwise_sum(self, - size, - input_channels, - batch_size, - inputs, - inplace, - gc, - dc): - sum_fp32 = core.CreateOperator( - "Sum", - ["X_{}".format(i) for i in range(inputs)], - ["X_0" if inplace else "Y"], - ) - Xs = [np.random.rand(batch_size, input_channels, size, size).astype( - np.float32) for _ in range(inputs)] - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - - Xi_scales = [] - Xi_zero_points = [] - for i, X in enumerate(Xs): - workspace.FeedBlob("X_{}".format(i), X, dc[0]) - if X.min() >= 0: - Xi_scales.append(np.absolute(X).max() / 0xFF) - Xi_zero_points.append(0) - else: - Xi_scales.append(np.absolute(X).max() / 0x7F) - Xi_zero_points.append(128) - - workspace.RunOperatorOnce(sum_fp32) - Y = workspace.FetchBlob("X_0" if inplace else "Y") - - if Y.min() >= 0: - Y_scale = np.absolute(Y).max() / 0xFF - Y_zero_point = 0 - else: - Y_scale = np.absolute(Y).max() / 0x7F - Y_zero_point = 128 - - workspace.ResetWorkspace() - - net = caffe2_pb2.NetDef() - for i, Xi in enumerate(Xs): - workspace.FeedBlob("Xi_{}".format(i), Xi, dc[1]) - sw2nhwc = core.CreateOperator( - "NCHW2NHWC", - ["Xi_{}".format(i)], - ["Xi_{}_nhwc".format(i)], - device_option=dc[1] - ) - quantize = core.CreateOperator( - "Int8Quantize", - ["Xi_{}_nhwc".format(i)], - ["Xi_{}_quantized".format(i)], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=Xi_zero_points[i], - Y_scale=Xi_scales[i], - ) - net.op.extend([sw2nhwc, quantize]) - - sum = core.CreateOperator( - "Int8Sum", - ["Xi_{}_quantized".format(i) for i in range(inputs)], - ["Xi_0_quantized" if inplace else "Y_quantized"], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=Y_zero_point, - Y_scale=Y_scale, - ) - - dequantize = core.CreateOperator( - "Int8Dequantize", - ["Xi_0_quantized" if inplace else "Y_quantized"], - ["Y_nhwc"], - engine="DNNLOWP", - device_option=dc[1], - ) - - 
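# [Editor's note -- illustration, not original test code] The Y_scale and
# Y_zero_point values fed to the Int8 ops above follow one rule throughout
# these tests: a non-negative tensor uses the full uint8 range (absmax / 0xFF,
# zero_point 0), while a tensor with negative values uses the signed range
# shifted into uint8 (absmax / 0x7F, zero_point 128). A minimal numpy sketch
# of that scheme, with its one-quantum round-trip error bound:
import numpy as np  # already imported at module scope; repeated so the sketch stands alone

def _quant_params(t):
    absmax = np.abs(t).max()
    if t.min() >= 0:
        return absmax / 0xFF, 0    # full uint8 range
    return absmax / 0x7F, 128      # signed range, shifted into uint8

def _quantize(t, scale, zero_point):
    return np.clip(np.rint(t / scale) + zero_point, 0, 255).astype(np.uint8)

def _dequantize(q, scale, zero_point):
    return ((q.astype(np.int32) - zero_point) * scale).astype(np.float32)

_t = np.random.rand(2, 3).astype(np.float32) - 0.5
_scale, _zp = _quant_params(_t)
assert np.abs(_t - _dequantize(_quantize(_t, _scale, _zp), _scale, _zp)).max() <= _scale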
sw2nchw = core.CreateOperator( - "NHWC2NCHW", - ["Y_nhwc"], - ["Y_out"], - device_option=dc[1] - ) - - net.op.extend([sum, dequantize, sw2nchw]) - workspace.RunNetOnce(net) - Y_out = workspace.FetchBlob("Y_out") - - MSE = np.square(np.subtract(Y, Y_out)).mean() - if MSE > 0.005: - print(Y.flatten()) - print(Y_out.flatten()) - print(np.max(np.abs(Y_out - Y))) - print("MSE", MSE) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/expanddims_squeeze_op_test.py b/caffe2/python/ideep/expanddims_squeeze_op_test.py deleted file mode 100644 index 3693a217bb4b..000000000000 --- a/caffe2/python/ideep/expanddims_squeeze_op_test.py +++ /dev/null @@ -1,136 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ExpandDimsSqueezeTest(hu.HypothesisTestCase): - @given( - squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), - inplace=st.booleans(), - **mu.gcs - ) - def test_squeeze(self, squeeze_dims, inplace, gc, dc): - shape = [ - 1 if dim in squeeze_dims else np.random.randint(1, 5) - for dim in range(4) - ] - X = np.random.rand(*shape).astype(np.float32) - op = core.CreateOperator( - "Squeeze", "X", "X" if inplace else "Y", dims=squeeze_dims - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given( - squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), - inplace=st.booleans(), - **mu.gcs_cpu_ideep - ) - def test_squeeze_fallback(self, squeeze_dims, inplace, gc, dc): - shape = [ - 1 if dim in squeeze_dims else np.random.randint(1, 5) - for dim in range(4) - ] - X = np.random.rand(*shape).astype(np.float32) - op0 = core.CreateOperator( - "Squeeze", - "X0", - "X0" if inplace else "Y0", - dims=squeeze_dims, - device_option=dc[0] - ) - workspace.FeedBlob('X0', X, dc[0]) - workspace.RunOperatorOnce(op0) - Y0 = workspace.FetchBlob("X0" if inplace else "Y0") - - op1 = core.CreateOperator( - "Squeeze", - "X1", - "X1" if inplace else "Y1", - dims=squeeze_dims, - device_option=dc[1] - ) - workspace.FeedBlob('X1', X, dc[0]) - workspace.RunOperatorOnce(op1) - Y1 = workspace.FetchBlob("X1" if inplace else "Y1") - - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - - @given( - squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), - inplace=st.booleans(), - **mu.gcs - ) - def test_expand_dims(self, squeeze_dims, inplace, gc, dc): - oshape = [ - 1 if dim in squeeze_dims else np.random.randint(2, 5) - for dim in range(4) - ] - nshape = [s for s in oshape if s!=1] - expand_dims = [i for i in range(len(oshape)) if oshape[i]==1] - - X = np.random.rand(*nshape).astype(np.float32) - op = core.CreateOperator( - "ExpandDims", "X", "X" if inplace else "Y", dims=expand_dims - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given( - squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), - inplace=st.booleans(), - **mu.gcs_cpu_ideep - ) - def test_expand_dims_fallback(self, squeeze_dims, inplace, gc, dc): - oshape = [ - 1 if dim in squeeze_dims else np.random.randint(2, 5) - for dim in range(4) - ] - nshape = [s for s in oshape if s!=1] - expand_dims = [i for i in range(len(oshape)) if oshape[i]==1] - - 
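# [Editor's note -- illustration, not original test code] oshape / nshape /
# expand_dims above encode the Squeeze <-> ExpandDims inverse this test relies
# on: record the positions of the size-1 axes, squeeze them away, and
# re-inserting axes at those positions restores the original shape. In plain
# numpy (a tuple `axis` for expand_dims assumes numpy >= 1.18):
import numpy as np

_a = np.zeros((3, 1, 4, 1), dtype=np.float32)
_ones = tuple(i for i, s in enumerate(_a.shape) if s == 1)  # (1, 3)
assert np.expand_dims(_a.squeeze(), axis=_ones).shape == _a.shape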
X = np.random.rand(*nshape).astype(np.float32) - op0 = core.CreateOperator( - "ExpandDims", - "X0", - "X0" if inplace else "Y0", - dims=expand_dims, - device_option=dc[0] - ) - workspace.FeedBlob('X0', X, dc[0]) - workspace.RunOperatorOnce(op0) - Y0 = workspace.FetchBlob("X0" if inplace else "Y0") - - op1 = core.CreateOperator( - "ExpandDims", - "X1", - "X1" if inplace else "Y1", - dims=expand_dims, - device_option=dc[1] - ) - workspace.FeedBlob('X1', X, dc[0]) - workspace.RunOperatorOnce(op1) - Y1 = workspace.FetchBlob("X1" if inplace else "Y1") - - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/fc_op_test.py b/caffe2/python/ideep/fc_op_test.py deleted file mode 100644 index 6549bb6ad6bb..000000000000 --- a/caffe2/python/ideep/fc_op_test.py +++ /dev/null @@ -1,381 +0,0 @@ - - - - - -import unittest -from functools import reduce -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class FcTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 5), m=st.integers(1, 5), - k=st.integers(1, 5), **mu.gcs) - @settings(deadline=1000) - def test_fc_2_dims(self, n, m, k, gc, dc): - X = np.random.rand(m, k).astype(np.float32) - 0.5 - W = np.random.rand(n, k).astype(np.float32) - 0.5 - b = np.random.rand(n).astype(np.float32) - 0.5 - - op = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"] - ) - - self.assertDeviceChecks(dc, op, [X, W, b], [0]) - - for i in range(3): - self.assertGradientChecks(gc, op, [X, W, b], i, [0]) - - @given(n=st.integers(1, 5), - m=st.integers(1, 5), - c=st.integers(1, 5), - h=st.integers(1, 5), - w=st.integers(1, 5), - axis=st.integers(1, 3), - **mu.gcs) - def test_fc_with_axis(self, n, m, c, h, w, axis, gc, dc): - X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 - k = reduce((lambda x, y: x * y), [n, c, h, w][axis - 4:]) - nn = reduce((lambda x, y: x * y), [n, c, h, w][:axis]) - W = np.random.rand(m, k).astype(np.float32) - 0.5 - b = np.random.rand(m).astype(np.float32) - 0.5 - dY = np.random.rand(nn, m).astype(np.float32) - 0.5 - - op0 = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"], - axis=axis, - device_option=dc[0] - ) - - op0_bw = core.CreateOperator( - 'FCGradient', - ['X', 'W', 'dY'], - ["dW", "db"], - axis=axis, - device_option=dc[0] - ) - - workspace.ResetWorkspace() - workspace.FeedBlob('X', X, dc[0]) - workspace.FeedBlob('W', W, dc[0]) - workspace.FeedBlob('b', b, dc[0]) - workspace.RunOperatorOnce(op0) - Y0 = workspace.FetchBlob('Y') - - workspace.FeedBlob('dY', dY, dc[0]) - workspace.RunOperatorOnce(op0_bw) - dW0 = workspace.FetchBlob('dW') - db0 = workspace.FetchBlob('db') - - op1 = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"], - axis=axis, - device_option=dc[1] - ) - - op1_bw = core.CreateOperator( - 'FCGradient', - ['X', 'W', 'dY'], - ["dW", "db"], - axis=axis, - device_option=dc[1] - ) - - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X', X, dc[1]) - workspace.FeedBlob('W', W, dc[1]) - workspace.FeedBlob('b', b, dc[1]) - workspace.RunOperatorOnce(op1) - Y1 = workspace.FetchBlob('Y') - - workspace.FeedBlob('dY', dY, dc[1]) - 
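# [Editor's note -- hedged sketch in plain numpy, not the caffe2 kernel] FC's
# `axis` argument collapses the input to 2-D before the usual X @ W^T + b,
# which is exactly how k and nn are computed at the top of this test: rows are
# prod(shape[:axis]) and columns are prod(shape[axis:]).
import numpy as np
from functools import reduce

def _fc_reference(X2, W2, b2, axis=1):
    rows = reduce(lambda a, b: a * b, X2.shape[:axis], 1)
    cols = reduce(lambda a, b: a * b, X2.shape[axis:], 1)
    assert W2.shape[1] == cols
    return X2.reshape(rows, cols) @ W2.T + b2  # shape (rows, m)

# e.g. axis=2 flattens a (2, 3, 4, 5) input to (6, 20), so W must be (m, 20):
assert _fc_reference(np.ones((2, 3, 4, 5), dtype=np.float32),
                     np.ones((7, 20), dtype=np.float32),
                     np.zeros(7, dtype=np.float32), axis=2).shape == (6, 7)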
workspace.RunOperatorOnce(op1_bw) - dW1 = workspace.FetchBlob('dW') - db1 = workspace.FetchBlob('db') - - Y0 = Y0.flatten() - Y1 = Y1.flatten() - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1) - print(Y0) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - dW0 = dW0.flatten() - dW1 = dW1.flatten() - if not np.allclose(dW0, dW1, atol=0.01, rtol=0.01): - print(dW1) - print(dW0) - print(np.max(np.abs(dW1 - dW0))) - self.assertTrue(False) - - db0 = db0.flatten() - db1 = db1.flatten() - if not np.allclose(db0, db1, atol=0.01, rtol=0.01): - print(db1) - print(db0) - print(np.max(np.abs(db1 - db0))) - self.assertTrue(False) - - @given(n=st.integers(1, 5), - o=st.integers(1, 5), - i=st.integers(1, 5), - h=st.integers(1, 5), - w=st.integers(1, 5), - axis_w=st.integers(1, 3), - **mu.gcs) - @settings(deadline=1000) - def test_fc_with_axis_w(self, n, o, i, h, w, axis_w, gc, dc): - W = np.random.rand(o, i, h, w).astype(np.float32) - 0.5 - k = reduce((lambda x, y: x * y), [o, i, h, w][axis_w - 4:]) - m = reduce((lambda x, y: x * y), [o, i, h, w][:axis_w]) - X = np.random.rand(n, k).astype(np.float32) - 0.5 - b = np.random.rand(m).astype(np.float32) - 0.5 - dY = np.random.rand(n, m).astype(np.float32) - 0.5 - - op0 = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"], - axis_w=axis_w, - device_option=dc[0] - ) - - op0_bw = core.CreateOperator( - 'FCGradient', - ['X', 'W', 'dY'], - ["dW", "db"], - axis_w=axis_w, - device_option=dc[0] - ) - - workspace.ResetWorkspace() - workspace.FeedBlob('X', X, dc[0]) - workspace.FeedBlob('W', W, dc[0]) - workspace.FeedBlob('b', b, dc[0]) - workspace.RunOperatorOnce(op0) - Y0 = workspace.FetchBlob('Y') - - workspace.FeedBlob('dY', dY, dc[0]) - workspace.RunOperatorOnce(op0_bw) - dW0 = workspace.FetchBlob('dW') - db0 = workspace.FetchBlob('db') - - op1 = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"], - axis_w=axis_w, - device_option=dc[1] - ) - - op1_bw = core.CreateOperator( - 'FCGradient', - ['X', 'W', 'dY'], - ["dW", "db"], - axis_w=axis_w, - device_option=dc[1] - ) - - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X', X, dc[1]) - workspace.FeedBlob('W', W, dc[1]) - workspace.FeedBlob('b', b, dc[1]) - workspace.RunOperatorOnce(op1) - Y1 = workspace.FetchBlob('Y') - - workspace.FeedBlob('dY', dY, dc[1]) - workspace.RunOperatorOnce(op1_bw) - dW1 = workspace.FetchBlob('dW') - db1 = workspace.FetchBlob('db') - - Y0 = Y0.flatten() - Y1 = Y1.flatten() - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1) - print(Y0) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - dW0 = dW0.flatten() - dW1 = dW1.flatten() - if not np.allclose(dW0, dW1, atol=0.01, rtol=0.01): - print(dW1) - print(dW0) - print(np.max(np.abs(dW1 - dW0))) - self.assertTrue(False) - - db0 = db0.flatten() - db1 = db1.flatten() - if not np.allclose(db0, db1, atol=0.01, rtol=0.01): - print(db1) - print(db0) - print(np.max(np.abs(db1 - db0))) - self.assertTrue(False) - - @given(n=st.integers(1, 5), m=st.integers(1, 5), - k=st.integers(1, 5), **mu.gcs) - @settings(deadline=10000) - def test_fc_4_dims_src(self, n, m, k, gc, dc): - X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5 - W = np.random.rand(n, k * m * m).astype(np.float32) - 0.5 - b = np.random.rand(n).astype(np.float32) - 0.5 - - op = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"] - ) - - self.assertDeviceChecks(dc, op, [X, W, b], [0]) - - for i in range(3): - self.assertGradientChecks(gc, op, [X, W, b], i, [0]) - - @given(n=st.integers(1, 5), 
m=st.integers(1, 5), - k=st.integers(1, 5), **mu.gcs) - @settings(deadline=10000) - def test_fc_4_dims(self, n, m, k, gc, dc): - X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5 - W = np.random.rand(n, k, m, m).astype(np.float32) - 0.5 - b = np.random.rand(n).astype(np.float32) - 0.5 - - op = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"] - ) - - self.assertDeviceChecks(dc, op, [X, W, b], [0]) - - for i in range(3): - self.assertGradientChecks(gc, op, [X, W, b], i, [0]) - - @given(n=st.integers(2, 5), m=st.integers(2, 5), - k=st.integers(2, 5), **mu.gcs_cpu_ideep) - def test_int8_fc_4_dims(self, n, m, k, gc, dc): - X = np.random.rand(m, k, m, m).astype(np.float32) - 0.5 - w = np.random.rand(n, k, m, m).astype(np.float32) - 0.5 - b = np.random.rand(n).astype(np.float32) - 0.5 - - fc_fp32 = core.CreateOperator( - 'FC', - ['X', 'w', 'b'], - ["Y"] - ) - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - - workspace.FeedBlob('X', X, dc[0]) - workspace.FeedBlob('w', w, dc[0]) - workspace.FeedBlob('b', b, dc[0]) - workspace.RunOperatorOnce(fc_fp32) - Y = workspace.FetchBlob('Y') - - workspace.ResetWorkspace() - - Y_absmax = np.array([np.absolute(Y).max()]).astype(np.float32) - if Y.min() >= 0: - Y_scale = Y_absmax / 0xFF - Y_zero_point = 0 - else: - Y_scale = Y_absmax / 0x7F - Y_zero_point = 128 - - X_absmax = np.array([np.absolute(X).max()]).astype(np.float32) - if X.min() >= 0: - X_scale = X_absmax / 0xFF - X_zero_point = 0 - else: - X_scale = X_absmax / 0x7F - X_zero_point = 128 - - w_absmax = np.array([np.absolute(w[i, ...]).max() for i in range(w.shape[0])]).astype(np.float32) - w_scale = w_absmax / 0x7F - w_zero_point = 128 - w = np.transpose(w, (0, 2, 3, 1)).astype(np.float32) - w_bytes = np.rint([w[i, ...] 
/ w_scale[i] for i in range(w.shape[0])]).astype(np.int8) + w_zero_point - - w_filler = core.CreateOperator( - "Int8GivenTensorFill", - [], ["wi"], - shape=w.shape, - values=w_bytes.astype(np.uint8).tobytes(), - Y_zero_point=w_zero_point, - Y_scales=w_scale, - device_option=dc[1], - ) - - b_scale = w_scale * X_scale - b_zero_point = 0 - b_bytes = np.rint([b[i] / b_scale[i] for i in range(b.shape[0])]).astype(np.int32) - b_filler = core.CreateOperator( - "Int8GivenIntTensorFill", - [], ["bi"], - shape=b.shape, - values=b_bytes, - Y_zero_point=b_zero_point, - Y_scales=b_scale, - device_option=dc[1], - ) - - sw2nhwc = core.CreateOperator( - "NCHW2NHWC", - ["Xi"], - ["Xi_nhwc"], - device_option=dc[1] - ) - - quantize_X = core.CreateOperator( - "Int8Quantize", - ["Xi_nhwc"], - ["Xi_quantized"], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=X_zero_point, - Y_scale=X_scale[0], - ) - - fc = core.CreateOperator( - 'Int8FC', - ['Xi_quantized', 'wi', 'bi'], - ["Y_out"], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=Y_zero_point, - Y_scale=Y_scale[0], - ) - - net = caffe2_pb2.NetDef() - net.op.extend([w_filler, b_filler, sw2nhwc, quantize_X, fc]) - - workspace.FeedBlob("Xi", X, dc[1]) - workspace.RunNetOnce(net) - Y_out = workspace.FetchBlob("Y_out") - - MSE = np.square(np.subtract(Y, Y_out)).mean() - if MSE > 0.005: - print(Y.flatten()) - print(Y_out.flatten()) - print(np.max(np.abs(Y_out - Y))) - print("MSE", MSE) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/leaky_relu_op_test.py b/caffe2/python/ideep/leaky_relu_op_test.py deleted file mode 100644 index 6d84f88f4fe2..000000000000 --- a/caffe2/python/ideep/leaky_relu_op_test.py +++ /dev/null @@ -1,92 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.python import core, workspace, model_helper -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class LeakyReluTest(hu.HypothesisTestCase): - def _get_inputs(self, N, C, H, W, order): - input_data = np.random.rand(N, C, H, W).astype(np.float32) - 0.5 - - # default step size is 0.05 - input_data[np.logical_and( - input_data >= 0, input_data <= 0.051)] = 0.051 - input_data[np.logical_and( - input_data <= 0, input_data >= -0.051)] = -0.051 - - return input_data, - - def _get_op(self, device_option, alpha, order, inplace=False): - outputs = ['output' if not inplace else "input"] - op = core.CreateOperator( - 'LeakyRelu', - ['input'], - outputs, - alpha=alpha, - device_option=device_option) - return op - - def _feed_inputs(self, input_blobs, device_option): - names = ['input', 'scale', 'bias'] - for name, blob in zip(names, input_blobs): - self.ws.create_blob(name).feed(blob, device_option=device_option) - - @given(N=st.integers(2, 3), - C=st.integers(2, 3), - H=st.integers(2, 3), - W=st.integers(2, 3), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000), - **mu.gcs) - @settings(deadline=1000) - def test_leaky_relu_gradients(self, gc, dc, N, C, H, W, alpha, seed): - np.random.seed(seed) - - op = self._get_op( - device_option=gc, - alpha=alpha, - order='NCHW') - input_blobs = self._get_inputs(N, C, H, W, "NCHW") - - self.assertDeviceChecks(dc, op, input_blobs, [0]) - self.assertGradientChecks(gc, op, input_blobs, 0, [0]) - - @given(N=st.integers(2, 10), - C=st.integers(3, 10), - 
H=st.integers(5, 10), - W=st.integers(7, 10), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000)) - def test_leaky_relu_model_helper_helper(self, N, C, H, W, alpha, seed): - np.random.seed(seed) - order = 'NCHW' - arg_scope = {'order': order} - model = model_helper.ModelHelper(name="test_model", arg_scope=arg_scope) - model.LeakyRelu( - 'input', - 'output', - alpha=alpha) - - input_blob = np.random.rand(N, C, H, W).astype(np.float32) - - self.ws.create_blob('input').feed(input_blob) - - self.ws.create_net(model.param_init_net).run() - self.ws.create_net(model.net).run() - - output_blob = self.ws.blobs['output'].fetch() - - assert output_blob.shape == (N, C, H, W) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/moment_sgd_op_test.py b/caffe2/python/ideep/moment_sgd_op_test.py deleted file mode 100644 index 596bab0ad3cc..000000000000 --- a/caffe2/python/ideep/moment_sgd_op_test.py +++ /dev/null @@ -1,61 +0,0 @@ - - - - - -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestMomentumSGDUpdateOps(hu.HypothesisTestCase): - @given(n=st.integers(4, 8), nesterov=st.booleans(), - **mu.gcs) - def test_MomentumSGDUpdate(self, n, nesterov, gc, dc): - param = np.random.rand(n).astype(np.float32) - grad = np.random.rand(n).astype(np.float32) - lr = np.random.rand(1).astype(np.float32) - param_momentum = np.random.rand(n).astype(np.float32) - momentum = 0.9 - op = core.CreateOperator( - "MomentumSGDUpdate", - ["grad", "param_momentum", "lr", "param"], - ["grad", "param_momentum", "param"], - momentum=momentum, - nesterov=int(nesterov), - ) - # Iter lives on the CPU - input_device_options = {'lr': hu.cpu_do} - - self.assertDeviceChecks( - dc, - op, - [grad, param_momentum, lr, param], - [0], - input_device_options=input_device_options, - threshold=0.001) - - op_noparam = core.CreateOperator( - "MomentumSGD", - ["grad", "param_momentum", "lr"], - ["grad", "param_momentum"], - momentum=momentum, - nesterov=int(nesterov), - ) - - self.assertDeviceChecks( - dc, - op_noparam, - [grad, param_momentum, lr], - [0], - input_device_options=input_device_options, - threshold=0.001) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/operator_fallback_op_test.py b/caffe2/python/ideep/operator_fallback_op_test.py deleted file mode 100644 index dc928c264082..000000000000 --- a/caffe2/python/ideep/operator_fallback_op_test.py +++ /dev/null @@ -1,99 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestFallbackOps(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 10), - input_channels=st.integers(1, 3), - output_channels=st.integers(1, 5), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - **mu.gcs) - def test_in_place(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, gc, dc): - # To expose fallback in-place potential issue, the fallback op - 
# following ideep op must be run at least two iterations. - conv = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - device_option=dc[0] - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - w = np.random.rand(output_channels, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X', X, dc[0]) - workspace.FeedBlob('w', w, dc[0]) - workspace.FeedBlob('b', b, dc[0]) - workspace.RunOperatorOnce(conv) - Y = workspace.FetchBlob('Y') - - scale = np.random.randn(Y.shape[1]).astype(np.float32) - bias = np.random.randn(Y.shape[1]).astype(np.float32) - ac = core.CreateOperator( - "AffineChannel", - ["Y", "scale", "bias"], - ["Y"], - is_learnable=False, - device_option=dc[0] - ) - workspace.FeedBlob('scale', scale, dc[0]) - workspace.FeedBlob('bias', bias, dc[0]) - workspace.RunOperatorOnce(ac) - workspace.RunOperatorOnce(conv) - workspace.RunOperatorOnce(ac) - Y0 = workspace.FetchBlob('Y') - - workspace.ResetWorkspace() - dev_net = caffe2_pb2.NetDef() - conv_dev = caffe2_pb2.OperatorDef() - conv_dev.CopyFrom(conv) - conv_dev.device_option.CopyFrom(dc[1]) - ac_dev = caffe2_pb2.OperatorDef() - ac_dev.CopyFrom(ac) - ac_dev.device_option.CopyFrom(dc[1]) - dev_net.op.extend([conv_dev, ac_dev]) - workspace.FeedBlob('X', X, dc[1]) - workspace.FeedBlob('w', w, dc[1]) - workspace.FeedBlob('b', b, dc[1]) - workspace.FeedBlob('scale', scale, dc[1]) - workspace.FeedBlob('bias', bias, dc[1]) - workspace.RunNetOnce(dev_net) - workspace.RunNetOnce(dev_net) - Y1 = workspace.FetchBlob('Y') - - if not np.allclose(Y0, Y1, atol=0.01, rtol=0.01): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/order_switch_op_test.py b/caffe2/python/ideep/order_switch_op_test.py deleted file mode 100644 index 39ede0d214fe..000000000000 --- a/caffe2/python/ideep/order_switch_op_test.py +++ /dev/null @@ -1,81 +0,0 @@ - - - - - -import unittest -import numpy as np -import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - -from hypothesis import given, settings -from caffe2.python import core, workspace - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class OrderSwitchTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 128), - c=st.integers(1, 64), - h=st.integers(1, 128), - w=st.integers(1, 128), - **mu.gcs) - @settings(max_examples=10, deadline=None) - def test_nchw2nhwc(self, n, c, h, w, gc, dc): - op = core.CreateOperator( - "NCHW2NHWC", - ["X"], - ["Y"], - ) - X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 - - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(n=st.integers(1, 128), - c=st.integers(1, 64), - h=st.integers(1, 128), - w=st.integers(1, 128), - **mu.gcs) - @settings(deadline=None, max_examples=50) - def test_nhwc2nchw(self, n, c, h, w, gc, dc): - op0 = core.CreateOperator( - "NCHW2NHWC", - ["X"], - ["Y"], - ) - op1 = core.CreateOperator( - "NHWC2NCHW", - ["Y"], - ["Z"], - ) - - X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", 
True) - workspace.FeedBlob('X', X, dc[0]) - op0.device_option.CopyFrom(dc[0]) - op1.device_option.CopyFrom(dc[0]) - workspace.RunOperatorOnce(op0) - workspace.RunOperatorOnce(op1) - Z0 = workspace.FetchBlob("Z") - - workspace.ResetWorkspace() - workspace.FeedBlob('X', X, dc[1]) - op0.device_option.CopyFrom(dc[1]) - op1.device_option.CopyFrom(dc[1]) - workspace.RunOperatorOnce(op0) - workspace.RunOperatorOnce(op1) - Z1 = workspace.FetchBlob("Z") - - if not np.allclose(Z0, Z1, atol=0.01, rtol=0.01): - print(Z1.flatten()) - print(Z0.flatten()) - print(np.max(np.abs(Z1 - Z0))) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/pool_op_test.py b/caffe2/python/ideep/pool_op_test.py deleted file mode 100644 index 9ab3fcddbadb..000000000000 --- a/caffe2/python/ideep/pool_op_test.py +++ /dev/null @@ -1,151 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import assume, given, settings -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class PoolTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - method=st.sampled_from(["MaxPool", "AveragePool"]), - **mu.gcs) - @settings(deadline=10000) - def test_pooling(self, stride, pad, kernel, size, - input_channels, batch_size, - method, gc, dc): - assume(pad < kernel) - op = core.CreateOperator( - method, - ["X"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - device_option=dc[0], - ) - X = np.random.rand( - batch_size, input_channels, size, size - ).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - - if 'MaxPool' not in method: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - method=st.sampled_from(["MaxPool", "AveragePool"]), - **mu.gcs_cpu_ideep) - def test_int8_pooling(self, stride, pad, kernel, size, - input_channels, batch_size, - method, gc, dc): - assume(pad < kernel) - pool_fp32 = core.CreateOperator( - method, - ["X"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - device_option=dc[0] - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - - if X.min() >=0: - scale = np.absolute(X).max() / 0xFF - zero_point = 0 - else: - scale = np.absolute(X).max() / 0x7F - zero_point = 128 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - - workspace.FeedBlob("X", X, dc[0]) - workspace.RunOperatorOnce(pool_fp32) - Y = workspace.FetchBlob("Y") - - workspace.ResetWorkspace() - - sw2nhwc = core.CreateOperator( - "NCHW2NHWC", - ["Xi"], - ["Xi_nhwc"], - device_option=dc[1] - ) - - quantize = core.CreateOperator( - "Int8Quantize", - ["Xi_nhwc"], - ["Xi_quantized"], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=zero_point, - Y_scale=scale, - ) - - pool = core.CreateOperator( - "Int8{}".format(method), - ["Xi_quantized"], - ["Y_quantized"], - stride=stride, - pad=pad, - kernel=kernel, - engine="DNNLOWP", - device_option=dc[1], - ) - - dequantize = 
core.CreateOperator( - "Int8Dequantize", - ["Y_quantized"], - ["Y_nhwc"], - engine="DNNLOWP", - device_option=dc[1], - ) - - sw2nchw = core.CreateOperator( - "NHWC2NCHW", - ["Y_nhwc"], - ["Y_out"], - device_option=dc[1] - ) - - net = caffe2_pb2.NetDef() - net.op.extend([sw2nhwc, quantize, pool, dequantize, sw2nchw]) - - workspace.FeedBlob("Xi", X, dc[1]) - workspace.RunNetOnce(net) - Y_out = workspace.FetchBlob("Y_out") - - MSE = np.square(np.subtract(Y, Y_out)).mean() - if MSE > 0.005: - print(Y.flatten()) - print(Y_out.flatten()) - print(np.max(np.abs(Y_out - Y))) - print("MSE", MSE) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/pre_convert_test.py b/caffe2/python/ideep/pre_convert_test.py deleted file mode 100644 index 6c0b7ca5d7a7..000000000000 --- a/caffe2/python/ideep/pre_convert_test.py +++ /dev/null @@ -1,97 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import ( - brew, - core, - model_helper, - workspace, -) -from caffe2.python.transformations import optimizeForMKLDNN -import caffe2.python.hypothesis_test_util as hu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class PreConvertTest(hu.HypothesisTestCase): - @given(input_channels=st.integers(15, 16), - batch_size=st.integers(1, 3)) - def test_preConvert(self, input_channels, batch_size): - def AddModel(model, data): - conv1 = brew.conv(model, data, 'conv1', dim_in=input_channels, - dim_out=10, kernel=3, stride=1, pad=1, training_mode=1) - deconv1 = brew.conv_transpose(model, conv1, 'deconv1', dim_in=10, dim_out=10, - kernel=2, stride=2, pad=0, training_mode=1) - fc1 = brew.fc(model, deconv1, 'fc1', dim_in=10 * 56 * 56, dim_out=3) - softmax = brew.softmax(model, fc1, 'softmax') - - return softmax - - def AddTrainingOperators(model, softmax, label): - """Adds training operators to the model.""" - # Compute cross entropy between softmax scores and labels - xent = model.LabelCrossEntropy([softmax, label], 'xent') - # Compute the expected loss - loss = model.AveragedLoss(xent, "loss") - # Use the average loss we just computed to add gradient operators to the model - model.AddGradientOperators([loss]) - - arg_scope = {"order": "NCHW", 'no_bias': False} - # Create the model helper for the train model - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - train_model = model_helper.ModelHelper(name="test_train", arg_scope=arg_scope) - # Add the model definition (fc layers, conv layers, softmax, etc.) 
- softmax = AddModel(train_model, "X") - AddTrainingOperators(train_model, softmax, "label") - - X = np.random.rand( - batch_size, input_channels, 28, 28).astype(np.float32) - 0.5 - label = np.random.randint(3, size=batch_size).astype(np.int32) - blob_dict = {} - output_dict = {} - output_dict_cosim = {} - old_ws_name = workspace.CurrentWorkspace() - workspace.FeedBlob('X', X) - workspace.FeedBlob('label', label) - workspace.RunNetOnce(train_model.param_init_net) - for op in train_model.net.Proto().op: - if op.type == "Softmax": - break - for j in range(1, len(op.input)): - blob_dict[op.input[j]] = workspace.FetchBlob(op.input[j]) - - workspace.CreateNet(train_model.net, overwrite=True) - optimizeForMKLDNN(train_model.net, training_mode=True) - workspace.RunNet(train_model.net) - for op in train_model.net.Proto().op: - for blob in op.output: - output_dict[blob] = workspace.FetchBlob(blob) - - workspace.SwitchWorkspace("_device_check_", True) - workspace.FeedBlob('X', X) - workspace.FeedBlob('label', label) - for blob in blob_dict.keys(): - workspace.FeedBlob(blob, blob_dict[blob]) - workspace.CreateNet(train_model.net, overwrite=True) - workspace.RunNet(train_model.net) - for blob in output_dict.keys(): - output_dict_cosim[blob] = workspace.FetchBlob(blob) - - for blob in output_dict.keys(): - if not np.allclose(output_dict[blob], output_dict_cosim[blob], atol=0.001, rtol=0.0001): - print("blob {} error".format(blob)) - print(np.max(np.abs(output_dict[blob] - output_dict_cosim[blob]))) - self.assertTrue(False) - - workspace.ResetWorkspace() - workspace.SwitchWorkspace(old_ws_name) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/relu_op_test.py b/caffe2/python/ideep/relu_op_test.py deleted file mode 100644 index e2fda68aed2b..000000000000 --- a/caffe2/python/ideep/relu_op_test.py +++ /dev/null @@ -1,129 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ReluTest(hu.HypothesisTestCase): - @given(X=hu.tensor(), - inplace=st.booleans(), - **mu.gcs) - @settings(deadline=1000) - def test_relu(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Relu", - ["X"], - ["Y"] if not inplace else ["X"], - ) - # go away from the origin point to avoid kink problems - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - - self.assertDeviceChecks(dc, op, [X], [0]) - - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inplace=st.booleans(), - **mu.gcs_cpu_ideep) - @settings(max_examples=10, deadline=None) - def test_int8_relu(self, size, input_channels, batch_size, inplace, gc, dc): - relu_fp32 = core.CreateOperator( - "Relu", - ["X"], - ["Y"] if not inplace else ["X"], - device_option=dc[0] - ) - - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - # go away from the origin point to avoid kink problems - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - - if X.min() >=0: - scale = np.absolute(X).max() / 0xFF - zero_point = 0 - else: - scale = np.absolute(X).max() / 0x7F - zero_point = 128 - - old_ws_name = workspace.CurrentWorkspace() - workspace.SwitchWorkspace("_device_check_", True) - - workspace.FeedBlob("X", X, 
dc[0]) - workspace.RunOperatorOnce(relu_fp32) - Y = workspace.FetchBlob("X" if inplace else "Y") - - workspace.ResetWorkspace() - - sw2nhwc = core.CreateOperator( - "NCHW2NHWC", - ["Xi"], - ["Xi_nhwc"], - device_option=dc[1] - ) - - quantize = core.CreateOperator( - "Int8Quantize", - ["Xi_nhwc"], - ["Xi_quantized"], - engine="DNNLOWP", - device_option=dc[1], - Y_zero_point=zero_point, - Y_scale=scale, - ) - - relu = core.CreateOperator( - "Int8Relu", - ["Xi_quantized"], - ["Y_quantized"] if not inplace else ["Xi_quantized"], - engine="DNNLOWP", - device_option=dc[1], - ) - - dequantize = core.CreateOperator( - "Int8Dequantize", - ["Y_quantized"] if not inplace else ["Xi_quantized"], - ["Y_nhwc"], - engine="DNNLOWP", - device_option=dc[1], - ) - - sw2nchw = core.CreateOperator( - "NHWC2NCHW", - ["Y_nhwc"], - ["Y_out"], - device_option=dc[1] - ) - - net = caffe2_pb2.NetDef() - net.op.extend([sw2nhwc, quantize, relu, dequantize, sw2nchw]) - - workspace.FeedBlob("Xi", X, dc[1]) - workspace.RunNetOnce(net) - Y_out = workspace.FetchBlob("Y_out") - - MSE = np.square(np.subtract(Y, Y_out)).mean() - if MSE > 0.005: - print(Y.flatten()) - print(Y_out.flatten()) - print(np.max(np.abs(Y_out - Y))) - print("MSE", MSE) - self.assertTrue(False) - - workspace.SwitchWorkspace(old_ws_name) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/reshape_op_test.py b/caffe2/python/ideep/reshape_op_test.py deleted file mode 100644 index c2bca948a52c..000000000000 --- a/caffe2/python/ideep/reshape_op_test.py +++ /dev/null @@ -1,144 +0,0 @@ - - - - - -from caffe2.python.test_util import TestCase -from caffe2.proto import caffe2_pb2 -import unittest -import numpy as np -from caffe2.python import core, workspace - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestReShapeOps(TestCase): - def test_reshape_ops(self): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - workspace.FeedBlob('res', np.array([[0, 0, 0, 0]], dtype=np.float32)) - workspace.FeedBlob('shape', np.array([1, 4], dtype=np.int32), core.DeviceOption(caffe2_pb2.CPU, 0)) - workspace.FeedBlob('input', np.zeros((2, 2), dtype=np.float32)) - workspace.RunOperatorOnce(core.CreateOperator( - 'Reshape', ['input', 'shape'], ['output', 'old_shape'])) - assert ((workspace.FetchBlob('output') == - workspace.FetchBlob('res')).all()) - - def test_basic_reshape(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(2, 4)) - _test_reshape(old_shape=(4, 2, 1), new_shape=(2, 4), arg_shape=False) - - def test_int64_reshape_input(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(2, 4), arg_shape=False, shape_dtype=np.int64) - - def test_missing_dim(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(-1, 8)) - _test_reshape(old_shape=(4, 2, 1), new_shape=(-1, 8), arg_shape=False) - - def test_in_place(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(-1, 8), in_place=True) - _test_reshape(old_shape=(4, 2, 1), new_shape=(-1, 8), - in_place=True, arg_shape=False) - - def test_zero_dim(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 0, 0), - expected_shape=(4, 2, 1)) - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 0, 0), - expected_shape=(4, 2, 1), arg_shape=False) - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 2, 1), - expected_shape=(4, 2, 1)) - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, 2, 1), - expected_shape=(4, 2, 1), arg_shape=False) - - def test_zero_dim_and_missing_dim(self): - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, -1, 0), 
- expected_shape=(4, 2, 1)) - _test_reshape(old_shape=(4, 2, 1), new_shape=(0, -1, 0), - expected_shape=(4, 2, 1), arg_shape=False) - _test_reshape(old_shape=(4, 3, 2), new_shape=(-1, 0), - expected_shape=(8, 3)) - _test_reshape(old_shape=(4, 3, 2), new_shape=(-1, 0), - expected_shape=(8, 3), arg_shape=False) - - def test_backprop(self): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - old_shape = (4, 2, 1) - new_shape = (1, 8) - X = np.random.rand(*old_shape).astype(np.float32) - Y = np.random.rand(*new_shape).astype(np.float32) - - net = core.Net('net') - - net.GivenTensorFill([], 'X', shape=old_shape, values=X.flatten()) - net.GivenTensorFill([], 'Y', shape=new_shape, values=Y.flatten()) - - net.Reshape(['X'], ['X_out', 'old_shape'], shape=new_shape) - net.Mul(['X_out', 'Y'], 'Z') - net.AddGradientOperators(['Z']) - - workspace.RunNetOnce(net) - - Z = workspace.FetchBlob('Z') - X_grad = workspace.FetchBlob('X_grad') - - # Check forward computation - np.testing.assert_allclose( - Z.squeeze(), (X.reshape(new_shape) * Y).squeeze(), rtol=1e-5) - - # Check the shape of the gradient - np.testing.assert_array_equal(X_grad.shape, X.shape) - - # Check the gradient - np.testing.assert_allclose(X_grad, Y.reshape(old_shape), rtol=1e-5) - - def test_input_shape_changes(self): - device_opt = core.DeviceOption(caffe2_pb2.IDEEP, 0) - with core.DeviceScope(device_opt): - workspace.FeedBlob( - 'input_blob', - np.array(np.random.rand(10, 20, 10), dtype=np.float32)) - net = core.Net('mynet') - z, _ = net.Reshape('input_blob', - ['z_reshape', 'dummy_size'], - shape=(-1, 10)) - workspace.CreateNet(net) - workspace.RunNet(net) - workspace.FeedBlob( - 'input_blob', - np.array(np.random.rand(10, 40, 10), dtype=np.float32)) - workspace.RunNet(net) - - -def _test_reshape(old_shape, new_shape, expected_shape=None, arg_shape=True, - in_place=False, shape_dtype=np.int32): - devices = [core.DeviceOption(caffe2_pb2.IDEEP, 0)] - - for device_opt in devices: - with core.DeviceScope(device_opt): - if expected_shape is None: - expected_shape = new_shape - X = np.random.rand(*old_shape).astype(np.float32) - - blob_in = 'X' - blob_out = blob_in if in_place else blob_in + '_out' - - if arg_shape: - op = core.CreateOperator('Reshape', - [blob_in], - [blob_out, 'old_shape'], - shape=new_shape) - else: - op = core.CreateOperator('Reshape', - [blob_in, 'new_shape'], - [blob_out, 'old_shape']) - workspace.FeedBlob('new_shape', np.asarray(new_shape, dtype=shape_dtype), - core.DeviceOption(caffe2_pb2.CPU, 0)) - - workspace.FeedBlob(blob_in, X) - workspace.RunOperatorOnce(op) - - Y = workspace.FetchBlob(blob_out) - np.testing.assert_allclose(Y, X.reshape(expected_shape)) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/shape_op_test.py b/caffe2/python/ideep/shape_op_test.py deleted file mode 100644 index 1beb24bc8803..000000000000 --- a/caffe2/python/ideep/shape_op_test.py +++ /dev/null @@ -1,89 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class ShapeTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 128), - c=st.integers(1, 128), - h=st.integers(1, 128), - w=st.integers(1, 128), - **mu.gcs) - @settings(max_examples=10, deadline=None) - def test_shape(self, n, c, 
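# A minimal pure-Python sketch of the Reshape shape-inference rule that the
# zero-dim and missing-dim tests above exercise: a 0 in new_shape copies the
# corresponding input dimension, and a single -1 is inferred so the element
# count is preserved. infer_reshape is a hypothetical illustration helper,
# not a caffe2 API.
import math

def infer_reshape(old_shape, new_shape):
    # 0 means "keep the input dimension at this position"
    out = [old_shape[i] if d == 0 else d for i, d in enumerate(new_shape)]
    if -1 in out:
        # the single -1 entry absorbs whatever is left of the element count
        known = math.prod(d for d in out if d != -1)
        out[out.index(-1)] = math.prod(old_shape) // known
    return tuple(out)

# Mirrors test_zero_dim_and_missing_dim above:
assert infer_reshape((4, 2, 1), (0, -1, 0)) == (4, 2, 1)
assert infer_reshape((4, 3, 2), (-1, 0)) == (8, 3)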
h, w, gc, dc): - op0 = core.CreateOperator( - "Shape", - ["X0"], - ["Y0"], - device_option=dc[0] - ) - op1 = core.CreateOperator( - "Shape", - ["X1"], - ["Y1"], - device_option=dc[1] - ) - X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('X1', X, dc[1]) - workspace.RunOperatorOnce(op0) - workspace.RunOperatorOnce(op1) - Y0 = workspace.FetchBlob('Y0') - Y1 = workspace.FetchBlob('Y1') - - if not np.allclose(Y0, Y1, atol=0, rtol=0): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - @given(n=st.integers(1, 128), - c=st.integers(1, 128), - h=st.integers(1, 128), - w=st.integers(1, 128), - axes=st.lists(st.integers(0, 3), min_size=1, max_size=3), - **mu.gcs) - @settings(max_examples=10, deadline=None) - def test_shape_with_axes(self, n, c, h, w, axes, gc, dc): - # list.sort() sorts in place and returns None; sorted() returns a new list. - axes = sorted(set(axes)) - op0 = core.CreateOperator( - "Shape", - ["X0"], - ["Y0"], - axes=axes, - device_option=dc[0] - ) - op1 = core.CreateOperator( - "Shape", - ["X1"], - ["Y1"], - axes=axes, - device_option=dc[1] - ) - X = np.random.rand(n, c, h, w).astype(np.float32) - 0.5 - workspace.FeedBlob('X0', X, dc[0]) - workspace.FeedBlob('X1', X, dc[1]) - workspace.RunOperatorOnce(op0) - workspace.RunOperatorOnce(op1) - Y0 = workspace.FetchBlob('Y0') - Y1 = workspace.FetchBlob('Y1') - - if not np.allclose(Y0, Y1, atol=0, rtol=0): - print(Y1.flatten()) - print(Y0.flatten()) - print(np.max(np.abs(Y1 - Y0))) - self.assertTrue(False) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/sigmoid_op_test.py b/caffe2/python/ideep/sigmoid_op_test.py deleted file mode 100644 index 2b5eb0e3a2b5..000000000000 --- a/caffe2/python/ideep/sigmoid_op_test.py +++ /dev/null @@ -1,32 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class SigmoidTest(hu.HypothesisTestCase): - @given(X=hu.tensor(dtype=np.float32), - inplace=st.booleans(), - **hu.gcs) - @settings(deadline=1000) - def test_sigmoid(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Sigmoid", - ["X"], - ["Y"] if not inplace else ["X"], - ) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/softmax_op_test.py b/caffe2/python/ideep/softmax_op_test.py deleted file mode 100644 index b76d6509609b..000000000000 --- a/caffe2/python/ideep/softmax_op_test.py +++ /dev/null @@ -1,34 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class SoftmaxTest(hu.HypothesisTestCase): - @given(size=st.integers(8, 20), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inplace=st.booleans(), - **mu.gcs) - def test_softmax(self, size, input_channels, batch_size, inplace, gc, dc): - op = core.CreateOperator( - "Softmax", - ["X"], - ["Y"], - axis=1, - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__
== "__main__": - unittest.main() diff --git a/caffe2/python/ideep/spatial_bn_op_test.py b/caffe2/python/ideep/spatial_bn_op_test.py deleted file mode 100644 index 97efafa72057..000000000000 --- a/caffe2/python/ideep/spatial_bn_op_test.py +++ /dev/null @@ -1,133 +0,0 @@ - - - - - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestSpatialBN(hu.HypothesisTestCase): - @given(size=st.integers(7, 10), - input_channels=st.integers(7, 10), - batch_size=st.integers(1, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - inplace=st.sampled_from([True, False]), - **mu.gcs) - @settings(deadline=1000) - def test_spatialbn_test_mode( - self, size, input_channels, batch_size, seed, order, epsilon, - inplace, gc, dc): - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["X" if inplace else "Y"], - order=order, - is_test=True, - epsilon=epsilon - ) - - def reference_spatialbn_test(X, scale, bias, mean, var): - if order == "NCHW": - scale = scale[np.newaxis, :, np.newaxis, np.newaxis] - bias = bias[np.newaxis, :, np.newaxis, np.newaxis] - mean = mean[np.newaxis, :, np.newaxis, np.newaxis] - var = var[np.newaxis, :, np.newaxis, np.newaxis] - return ((X - mean) / np.sqrt(var + epsilon) * scale + bias,) - - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(7, 10), - batch_size=st.integers(1, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW"]), - epsilon=st.floats(1e-5, 1e-2), - inplace=st.sampled_from([True, False]), - **mu.gcs) - def test_spatialbn_train_mode( - self, size, input_channels, batch_size, seed, order, epsilon, - inplace, gc, dc): - print("dc0: {}, dc1: {}".format(dc[0], dc[1])) - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "running_mean", "running_var"], - ["X" if inplace else "Y", - "running_mean", "running_var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - ) - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - running_mean = np.random.randn(input_channels).astype(np.float32) - running_var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - # TODO: It looks like IDEEP spatial_bn op outputs save_var (output[4]) - # as the reciprocal of CPU op's output. 
Need to check back and add - # output[4] for comparison - self.assertDeviceChecks(dc, op, [X, scale, bias, running_mean, running_var], - [0, 1, 2, 3]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(1, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - **mu.gcs) - @settings(deadline=None, max_examples=50) - def test_spatialbn_train_mode_gradient_check( - self, size, input_channels, batch_size, seed, order, epsilon, - gc, dc): - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["Y", "mean", "var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - ) - np.random.seed(seed) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - for input_to_check in [0, 1, 2]: # dX, dScale, dBias - self.assertGradientChecks(gc, op, [X, scale, bias, mean, var], - input_to_check, [0]) - - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/test_ideep_net.py b/caffe2/python/ideep/test_ideep_net.py deleted file mode 100644 index 42feeed00122..000000000000 --- a/caffe2/python/ideep/test_ideep_net.py +++ /dev/null @@ -1,130 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.models.download import ModelDownloader -import numpy as np -import argparse -import time - - -def GetArgumentParser(): - parser = argparse.ArgumentParser(description="Caffe2 benchmark.") - parser.add_argument( - "--batch_size", - type=int, - default=128, - help="The batch size." - ) - parser.add_argument("--model", type=str, help="The model to benchmark.") - parser.add_argument( - "--order", - type=str, - default="NCHW", - help="The order to evaluate." - ) - parser.add_argument( - "--device", - type=str, - default="CPU", - help="device to evaluate on." - ) - parser.add_argument( - "--cudnn_ws", - type=int, - help="The cudnn workspace size." - ) - parser.add_argument( - "--iterations", - type=int, - default=10, - help="Number of iterations to run the network." - ) - parser.add_argument( - "--warmup_iterations", - type=int, - default=10, - help="Number of warm-up iterations before benchmarking." - ) - parser.add_argument( - "--forward_only", - action='store_true', - help="If set, only run the forward pass." - ) - parser.add_argument( - "--layer_wise_benchmark", - action='store_true', - help="If True, run the layer-wise benchmark as well." - ) - parser.add_argument( - "--engine", - type=str, - default="", - help="If set, blindly prefer the given engine(s) for every op.") - parser.add_argument( - "--dump_model", - action='store_true', - help="If True, dump the model prototxts to disk." 
- ) - parser.add_argument("--net_type", type=str, default="simple") - parser.add_argument("--num_workers", type=int, default=2) - parser.add_argument("--use-nvtx", default=False, action='store_true') - parser.add_argument("--htrace_span_log_path", type=str) - return parser - - -def benchmark(args): - print('Batch size: {}'.format(args.batch_size)) - mf = ModelDownloader() - init_net, pred_net, value_info = mf.get_c2_model(args.model) - input_shapes = {k : [args.batch_size] + v[-1][1:] for (k, v) in value_info.items()} - print("input info: {}".format(input_shapes)) - external_inputs = {} - for k, v in input_shapes.items(): - external_inputs[k] = np.random.randn(*v).astype(np.float32) - - if args.device == 'CPU': - device_option = core.DeviceOption(caffe2_pb2.CPU) - elif args.device == 'MKL': - device_option = core.DeviceOption(caffe2_pb2.MKLDNN) - elif args.device == 'IDEEP': - device_option = core.DeviceOption(caffe2_pb2.IDEEP) - else: - raise Exception("Unknown device: {}".format(args.device)) - print("Device option: {}, {}".format(args.device, device_option)) - pred_net.device_option.CopyFrom(device_option) - for op in pred_net.op: - op.device_option.CopyFrom(device_option) - - # Hack to initialize weights into the MKL/IDEEP context - workspace.RunNetOnce(init_net) - bb = workspace.Blobs() - weights = {} - for b in bb: - weights[b] = workspace.FetchBlob(b) - for k, v in external_inputs.items(): - weights[k] = v - workspace.ResetWorkspace() - - with core.DeviceScope(device_option): - for name, blob in weights.items(): - #print("{}".format(name)) - workspace.FeedBlob(name, blob, device_option) - workspace.CreateNet(pred_net) - start = time.time() - res = workspace.BenchmarkNet(pred_net.name, - args.warmup_iterations, - args.iterations, - args.layer_wise_benchmark) - print("FPS: {:.2f}".format(1/res[0]*1000*args.batch_size)) - -if __name__ == '__main__': - args, extra_args = GetArgumentParser().parse_known_args() - if ( - not args.batch_size or not args.model or not args.order - ): - # Required arguments are missing: print usage instead of crashing later. - GetArgumentParser().print_help() - else: - benchmark(args) diff --git a/caffe2/python/ideep/transform_ideep_net.py b/caffe2/python/ideep/transform_ideep_net.py deleted file mode 100644 index 2d0f35a7406f..000000000000 --- a/caffe2/python/ideep/transform_ideep_net.py +++ /dev/null @@ -1,340 +0,0 @@ - - - - - -import argparse -import copy -import json -import logging - -import numpy as np - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, utils -import caffe2.python._import_c_extension as C - -log = logging.getLogger(__name__) - - -def pairwise(iterable): - from itertools import tee - a, b = tee(iterable) - next(b, None) - return zip(a, b) - - -def last_producer(ops, blob): - for (i, op) in reversed(list(enumerate(ops))): - if blob in op.output: - return i - raise ValueError("Failed to find last producer of blob %s" % blob) - - -def blob_uses(net, blob): - u = [] - for i, op in enumerate(net.op): - if blob in op.input or blob in op.control_input: - u.append(i) - return u - - -def GetArgumentParser(): - parser = argparse.ArgumentParser(description="Caffe2 optimization") - parser.add_argument("--init_net", - type=argparse.FileType('rb'), - help="init net") - parser.add_argument("--pred_net", - type=argparse.FileType('rb'), - help="predict net") - parser.add_argument("--verify_input", - type=argparse.FileType('r'), - help="input dims for verification") - parser.add_argument("--fuse_bn", default=False, action='store_true') - parser.add_argument("--fuse_mul_add", default=False, action='store_true') - parser.add_argument("--fuse_conv_relu", default=False, action='store_true') - return parser
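# A small sketch of the FPS arithmetic printed by benchmark() in
# test_ideep_net.py above, assuming workspace.BenchmarkNet reports the mean
# per-iteration wall time in milliseconds in res[0] (which is what the
# expression 1 / res[0] * 1000 * batch_size implies):
def fps_from_ms_per_iter(ms_per_iter, batch_size):
    # images per second = images per iteration / seconds per iteration
    return batch_size / (ms_per_iter / 1000.0)

assert fps_from_ms_per_iter(8.0, 128) == 16000.0  # 128 images every 8 ms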
- -def fuse_first_bn(net, params, removed_tensors): - net = copy.deepcopy(net) - params = copy.deepcopy(params) - - for ((i, current), (j, next_)) in pairwise(enumerate(net.op)): - if next_.input[0] != current.output[0]: - continue - - if current.type not in ("Conv", "ConvTranspose") \ - or next_.type != "SpatialBN": - continue - if len(blob_uses(net, current.output[0])) != 1: - # Can't fuse if more than one user - continue - - # else, can fuse - conv = current - bn = next_ - fused_conv = copy.deepcopy(conv) - fused_conv.output[0] = bn.output[0] - - # Fix fused_conv to ensure we have a bias passed. - if len(fused_conv.input) != 3: - bias_name = "{}_bias".format(conv.input[1]) - net.external_input.extend([bias_name]) - fused_conv.input.extend([bias_name]) - for arg in fused_conv.arg: - if arg.name == "no_bias": - arg.i = 0 - - conv_weight = params[conv.input[1]] - conv_bias = params[conv.input[2]] if len(conv.input) == 3 \ - else np.zeros(shape=(conv_weight.shape[0])).astype(np.float32) - - bn_scale = params[bn.input[1]] - bn_bias = params[bn.input[2]] - bn_running_mean = params[bn.input[3]] - bn_running_var = params[bn.input[4]] - - # First, the BN computation can be phrased as follows: - # (X - running_mean) * (1.0 / sqrt(running_var + eps)) * - # bn_scale + bias - # Thus, we can rewrite this as: - # X * bn_scale * 1.0 / (sqrt(running_var + eps)) + (bias - - # running_mean * (1.0 / sqrt(running_var + eps)) * bn_scale) - # Thus, we can just apply the affine transform - # X * A + B - # where - # A = bn_scale * 1.0 / (sqrt(running_var + eps)) - # B = (bias - running_mean * (1.0 / sqrt(running_var + eps)) - # * bn_scale) - eps = 1.0e-5 - for arg in bn.arg: - if arg.name == "epsilon": - eps = arg.f - A = bn_scale * 1.0 / (np.sqrt(bn_running_var + eps)) - B = bn_bias - bn_running_mean * A - - # This identity should hold if we have fused correctly: - # np.testing.assert_array_equal( - # params[conv.output[0]] * A + B, - # params[bn.output[0]]) - - # Now, the computation being made is the following: - # ((X `conv` W) + b) * A + B - # Then, we can simply fuse this as follows: - # (X `conv` (W * A)) + b * A + B - # which is simply - # (X `conv` Q) + C - # where - - # Q = W * A - # C = b * A + B - - # For ConvTranspose, from the view of convolutions as a - # Toeplitz multiplication, we have W_ = W^T, so the weights - # are laid out as (R, S, K, K) (vs (S, R, K, K) for a Conv), - # so the weights broadcast slightly differently. Remember, our - # BN scale 'B' is of size (S,) - - A_ = A.reshape(-1, 1, 1, 1) if conv.type == "Conv" else \ - A.reshape(1, -1, 1, 1) - - C = conv_bias * A + B - Q = conv_weight * A_ - - params[fused_conv.input[1]] = Q - params[fused_conv.input[2]] = C - new_ops = net.op[:i] + [fused_conv] + net.op[j + 1:] - del net.op[:] - removed_tensors.append(bn.input[1]) - removed_tensors.append(bn.input[2]) - removed_tensors.append(bn.input[3]) - removed_tensors.append(bn.input[4]) - del params[bn.input[1]] - del params[bn.input[2]] - del params[bn.input[3]] - del params[bn.input[4]] - net.op.extend(new_ops) - break - return net, params, removed_tensors - - -def fuse_bn(net, params, ignore_failure): - # Run until we hit a fixed point - removed_tensors = [] - while True: - (next_net, next_params, removed_tensors) = \ - fuse_first_bn(net, params, removed_tensors) - if len(next_net.op) == len(net.op): - if ( - any(op.type == "SpatialBN" for op in next_net.op) and - not ignore_failure - ): - raise Exception( - "Model contains SpatialBN op after fusion: %s" % next_net) - return (next_net, next_params, removed_tensors) - net, params, removed_tensors = (next_net, next_params, removed_tensors) - - -def fuse_first_mul_add(net, params, removed_tensors): - net = copy.deepcopy(net) - params = copy.deepcopy(params) - - for ((i, current), (j, next_)) in pairwise(enumerate(net.op)): - if current.type != "Mul" or next_.type != "Add": - continue - - if next_.input[0] != current.output[0]: - raise Exception("Failure to fuse") - - if len(blob_uses(net, current.output[0])) != 1: - raise Exception("Failure to fuse") - - log.info("Fusing at index %s", i) - mul_ = current - add_ = next_ - batch_norm = copy.deepcopy(mul_) - batch_norm.type = "SpatialBN" - batch_norm.arg.extend([utils.MakeArgument("is_test", 1)]) - batch_norm.arg.extend([utils.MakeArgument("epsilon", float(1e-9))]) - - def s(x): - return "{}{}".format(add_.output[0], x) - fake_mean = s("_mean") - fake_var = s("_var") - - del batch_norm.input[:] - batch_norm.input.extend([mul_.input[0], - mul_.input[1], - add_.input[1], - fake_mean, - fake_var]) - params[fake_mean] = np.zeros_like(params[mul_.input[1]]) - params[fake_var] = np.ones_like(params[mul_.input[1]]) - net.external_input.extend([fake_mean, fake_var]) - - batch_norm.output[0] = add_.output[0] - new_ops = net.op[:i] + [batch_norm] + net.op[j + 1:] - del net.op[:] - net.op.extend(new_ops) - break - return net, params, removed_tensors - - -def fuse_mul_add(net, params): - # Run until we hit a fixed point - removed_tensors = [] - while True: - (next_net, next_params, removed_tensors) = \ - fuse_first_mul_add(net, params, removed_tensors) - if len(next_net.op) == len(net.op): - return (next_net, next_params, removed_tensors) - net, params, removed_tensors = (next_net, next_params, removed_tensors) - - -def add_tensor(net, name, blob): - ''' Create an operator to store the tensor 'blob', - run the operator to put the blob to workspace. - uint8 is stored as a one-element array of strings.
- ''' - kTypeNameMapper = { - np.dtype('float32'): "GivenTensorFill", - np.dtype('int32'): "GivenTensorIntFill", - np.dtype('int64'): "GivenTensorInt64Fill", - np.dtype('uint8'): "GivenTensorStringFill", - } - - shape = blob.shape - values = blob - # pass array of uint8 as a string to save storage - # storing uint8_t has a large overhead for now - if blob.dtype == np.dtype('uint8'): - shape = [1] - values = [str(blob.data)] - - op = core.CreateOperator( - kTypeNameMapper[blob.dtype], - [], [name], - arg=[ - utils.MakeArgument("shape", shape), - utils.MakeArgument("values", values), - ] - ) - net.op.extend([op]) - - -def gen_init_net_from_blobs(blobs): - ''' Generate an initialization net based on a blob dict ''' - ret = caffe2_pb2.NetDef() - for name, blob in blobs.items(): - add_tensor(ret, name, blob) - return ret - - -def fuse_conv_relu(net): - net = copy.deepcopy(net) - device_option = core.DeviceOption(caffe2_pb2.IDEEP) - for op in net.op: - op.device_option.CopyFrom(device_option) - - new_net = caffe2_pb2.NetDef() - new_net.ParseFromString(C.transform_optimizeForMKLDNN(net.SerializeToString())) - return new_net - - -def Optimize(args): - init_net = caffe2_pb2.NetDef() - predict_net = caffe2_pb2.NetDef() - init_net.ParseFromString(args.init_net.read()) - predict_net.ParseFromString(args.pred_net.read()) - - workspace.ResetWorkspace() - workspace.RunNetOnce(init_net) - param_dict = {p: workspace.FetchBlob(p) for p in workspace.Blobs()} - - external_inputs = {} - external_outputs = {} - if args.verify_input: - value_info = json.load(args.verify_input) - input_shapes = {k : v[-1] for (k, v) in value_info.items()} - print("input info: {}".format(input_shapes)) - for k, v in input_shapes.items(): - external_inputs[k] = np.random.randn(*v).astype(np.float32) - workspace.FeedBlob(k, external_inputs[k]) - workspace.RunNetOnce(predict_net) - for o in predict_net.external_output: - external_outputs[o] = workspace.FetchBlob(o) - - if args.fuse_mul_add: - predict_net, param_dict, _ = fuse_mul_add(predict_net, param_dict) - if args.fuse_bn: - predict_net, param_dict, _ = fuse_bn(predict_net, param_dict, False) - if args.fuse_conv_relu: - predict_net = fuse_conv_relu(predict_net) - - external_outputs_opt = {} - if args.verify_input: - workspace.ResetWorkspace() - device_option = core.DeviceOption(caffe2_pb2.IDEEP) if args.fuse_conv_relu else core.DeviceOption(caffe2_pb2.CPU) - with core.DeviceScope(device_option): - for k, v in param_dict.items(): - workspace.FeedBlob(k, v, device_option) - for k, v in external_inputs.items(): - workspace.FeedBlob(k, v, device_option) - workspace.RunNetOnce(predict_net) - for o in predict_net.external_output: - external_outputs_opt[o] = workspace.FetchBlob(o) - assert np.allclose(external_outputs[o], - external_outputs_opt[o], - atol=1e-3, - rtol=1e-3) - - for i, o in enumerate(predict_net.op): - print("op[{}]: {}".format(i, o.type)) - init_net = gen_init_net_from_blobs(param_dict) - with open('init_net.pb', 'wb') as f: - f.write(init_net.SerializeToString()) - with open('predict_net.pb', 'wb') as f: - f.write(predict_net.SerializeToString()) - -if __name__ == '__main__': - args = GetArgumentParser().parse_args() - Optimize(args) diff --git a/caffe2/python/ideep/transpose_op_test.py b/caffe2/python/ideep/transpose_op_test.py deleted file mode 100644 index f8b784822a07..000000000000 --- a/caffe2/python/ideep/transpose_op_test.py +++ /dev/null @@ -1,43 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings 
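# A numpy check of the Conv+BN folding identity derived in fuse_first_bn
# above, using a 1x1 convolution written as an einsum. This is an
# illustration of the algebra only, not caffe2 code: folding W -> W * A and
# b -> b * A + B reproduces BN(conv(X, W) + b).
import numpy as np

rng = np.random.default_rng(0)
n, cin, cout, h, w = 2, 3, 4, 5, 5
X = rng.standard_normal((n, cin, h, w))
W = rng.standard_normal((cout, cin))      # 1x1 conv weights
b = rng.standard_normal(cout)
scale, bias = rng.standard_normal(cout), rng.standard_normal(cout)
mean, var, eps = rng.standard_normal(cout), rng.random(cout) + 0.5, 1e-5

def conv1x1(x, weight, bias_vec):
    return np.einsum('nihw,oi->nohw', x, weight) + bias_vec[None, :, None, None]

A = scale / np.sqrt(var + eps)  # per-channel scale, as in fuse_first_bn
B = bias - mean * A             # per-channel shift
y_bn = conv1x1(X, W, b) * A[None, :, None, None] + B[None, :, None, None]
y_fused = conv1x1(X, W * A[:, None], b * A + B)
np.testing.assert_allclose(y_bn, y_fused, rtol=1e-10, atol=1e-12)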
-import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.ideep_test_util as mu - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TransposeTest(hu.HypothesisTestCase): - @given( - X=hu.tensor(min_dim=1, max_dim=5, dtype=np.float32), use_axes=st.booleans(), **mu.gcs) - @settings(deadline=None, max_examples=50) - def test_transpose(self, X, use_axes, gc, dc): - ndim = len(X.shape) - axes = np.arange(ndim) - np.random.shuffle(axes) - - if use_axes: - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], axes=axes, device_option=gc) - else: - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], device_option=gc) - - def transpose_ref(X): - if use_axes: - return [np.transpose(X, axes=axes)] - else: - return [np.transpose(X)] - - self.assertReferenceChecks(gc, op, [X], transpose_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep/weightedsum_op_test.py b/caffe2/python/ideep/weightedsum_op_test.py deleted file mode 100644 index b1e46fca4851..000000000000 --- a/caffe2/python/ideep/weightedsum_op_test.py +++ /dev/null @@ -1,57 +0,0 @@ - - - - - -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.ideep_test_util as mu - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestWeightedSumOp(hu.HypothesisTestCase): - @given(n=st.integers(5, 8), m=st.integers(1, 1), - d=st.integers(2, 4), grad_on_w=st.booleans(), - **mu.gcs_ideep_only) - def test_weighted_sum(self, n, m, d, grad_on_w, gc, dc): - input_names = [] - input_vars = [] - for i in range(m): - X_name = 'X' + str(i) - w_name = 'w' + str(i) - input_names.extend([X_name, w_name]) - var = np.random.rand(n, d).astype(np.float32) - vars()[X_name] = var - input_vars.append(var) - var = np.random.rand(1).astype(np.float32) - vars()[w_name] = var - input_vars.append(var) - - def weighted_sum_op_ref(*args): - res = np.zeros((n, d)) - for i in range(m): - res = res + args[2 * i + 1] * args[2 * i] - - return (res, ) - - op = core.CreateOperator( - "WeightedSum", - input_names, - ['Y'], - grad_on_w=grad_on_w, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=input_vars, - reference=weighted_sum_op_ref, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/ideep_test_util.py b/caffe2/python/ideep_test_util.py deleted file mode 100644 index 0cc643317c93..000000000000 --- a/caffe2/python/ideep_test_util.py +++ /dev/null @@ -1,39 +0,0 @@ -## @package ideep_test_util -# Module caffe2.python.ideep_test_util -""" -The IDEEP test utils is a small addition on top of the hypothesis test utils -under caffe2/python, which allows one to more easily test IDEEP related -operators. 
-""" - - - - - - -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -from caffe2.python import hypothesis_test_util as hu - -cpu_do = hu.cpu_do -ideep_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.IDEEP) -device_options = hu.device_options + ([ideep_do]) - - -def device_checker_device_options(): - return st.just(device_options) - - -def gradient_checker_device_option(): - return st.sampled_from(device_options) - - -gcs = dict( - gc=gradient_checker_device_option(), - dc=device_checker_device_options() -) - -gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_ideep_only = dict(gc=st.sampled_from([ideep_do]), dc=st.just([ideep_do])) -gcs_cpu_ideep = dict(gc=st.sampled_from([cpu_do, ideep_do]), dc=st.just([cpu_do, ideep_do])) diff --git a/caffe2/python/layer_model_helper.py b/caffe2/python/layer_model_helper.py deleted file mode 100644 index 017e2ed2f043..000000000000 --- a/caffe2/python/layer_model_helper.py +++ /dev/null @@ -1,752 +0,0 @@ -# @package layer_model_helper -# Module caffe2.python.layer_model_helper - - - - - -from caffe2.python import core, model_helper, schema, scope, utils, muji -from caffe2.python.modeling.parameter_info import ( - ParameterInfo, -) -from caffe2.python.modeling.parameter_sharing import ( - parameter_sharing_context, -) -from caffe2.python.modeling.net_modifier import NetModifier - -from caffe2.python.optimizer import get_param_device, Optimizer -from caffe2.python.regularizer import Regularizer, RegularizationBy -from caffe2.python.layers import layers - -import logging -import numpy as np -import copy -logger = logging.getLogger(__name__) - - -class LayerModelHelper(model_helper.ModelHelper): - """ - Model helper for building models on top of layers abstractions. - - Each layer is the abstraction that is higher level than Operator. Layer - is responsible for ownership of it's own parameters and can easily be - instantiated in multiple nets possible with different sets of ops. - As an example: one can easily instantiate predict and train nets from - the same set of layers, where predict net will have subset of the - operators from train net. - """ - - def __init__(self, name, input_feature_schema, trainer_extra_schema, - keep_blobs=False, - use_attribution=True): - ''' TODO(amalevich): more documnetation on input args - - use_attribution: - if True, will generate the atrribution net for feature importance - calculation; Need to turn it to false when FC is quantized as FP16 - This attribute access will be consistent with MTML model. - ''' - - super().__init__(name=name) - self._layer_names = set() - self._layers = [] - self._param_to_shape = {} - - # seed default - self._seed = None - self._sequence_seed = True - - # optimizer bookkeeping - self.param_to_optim = {} - self.param_to_reg = {} - - self._default_optimizer = None - self._loss = None - self._prediction = [] - self._output_schema = None - - self._post_grad_net_modifiers = [] - self._final_net_modifiers = [] - - # breakdown map; breakdown features are categorical (like dense) but not - # necessarily used to represent data for training - self._breakdown_map = None - - # Connect Schema to self.net. That particular instance of schmea will be - # use for generation of the Layers across the network and would be used - # for connection with Readers. 
- self._input_feature_schema = schema.NewRecord( - self.net, - input_feature_schema - ) if not keep_blobs else input_feature_schema.clone() - self._trainer_extra_schema = schema.NewRecord( - self.net, - trainer_extra_schema - ) if not keep_blobs else trainer_extra_schema.clone() - self._metrics_schema = schema.Struct() - - self._preproc_output_schema = None - - self._init_global_constants() - self.param_init_net = self.create_init_net('param_init_net') - self._initialize_params = True - - self._transfer_learning_blob_name_mappings = None - - # additional (hard-coded) diagnose_options to report based on the model - # TODO(xlwang): it's hack! - self.ad_hoc_diagnose_blobs_and_operations = [] - self.ad_hoc_plot_blobs = [] - self.use_attribution = use_attribution - - def clear_output_schema(self): - self._output_schema = None - - def set_initialize_params(self, initialize_params): - self._initialize_params = initialize_params - - def add_metric_field(self, name, value): - assert name not in self._metrics_schema.fields, ( - "Try to add metric field twice: {}".format(name)) - self._metrics_schema = self._metrics_schema + schema.Struct( - (name, value) - ) - - # an empty white_set will skip everything - def filter_metrics_schema(self, white_set): - logger.info("Filter metric schema with white_set {}".format(white_set)) - field_names = self._metrics_schema.field_names() - for name in field_names: - if name not in white_set: - self._metrics_schema = self._metrics_schema - schema.Struct((name, schema.Scalar())) - - def add_ad_hoc_plot_blob(self, blob, dtype=None): - assert isinstance( - blob, (str, core.BlobReference) - ), "expect type str or BlobReference, but got {}".format(type(blob)) - dtype = dtype or (np.float64, (1, )) - self.add_metric_field(str(blob), schema.Scalar(dtype, blob)) - self.ad_hoc_plot_blobs.append(blob) - - @staticmethod - def _get_global_constant_initializer_op( - blob_name, array=None, dtype=None, initializer=None - ): - # to add a global constant to model, one first need to get the - # initializer - if array is not None: - assert initializer is None,\ - "Only one from array and initializer should be specified" - if dtype is None: - array = np.array(array) - else: - array = np.array(array, dtype=dtype) - - # TODO: make GivenTensor generic - op_name = None - if array.dtype == np.int32: - op_name = 'GivenTensorIntFill' - elif array.dtype == np.int64: - op_name = 'GivenTensorInt64Fill' - elif array.dtype == str: - op_name = 'GivenTensorStringFill' - elif array.dtype == bool: - op_name = 'GivenTensorBoolFill' - else: - op_name = 'GivenTensorFill' - - def initializer(blob_name): - return core.CreateOperator( - op_name, [], - blob_name, - shape=array.shape, - values=array.flatten().tolist() - ) - else: - assert initializer is not None - initializer_op = initializer(blob_name) - return initializer_op - - def add_global_constant( - self, name, array=None, dtype=None, initializer=None - ): - assert isinstance(name, str), ( - 'name should be a string as we are using it as map key') - # This is global namescope for constants. They will be created in all - # init_nets and there should be very few of them. 
- assert name not in self.global_constants, \ - "%s already added in global_constants" % name - blob_name = self.net.NextBlob(name) - self.global_constants[name] = blob_name - initializer_op = LayerModelHelper._get_global_constant_initializer_op( - blob_name, array, dtype, initializer - ) - assert blob_name not in self.global_constant_initializers, \ - "there is already a initializer op associated with blob %s" % \ - blob_name - self.global_constant_initializers[blob_name] = initializer_op - return blob_name - - def maybe_add_global_constant(self, name, *args, **kwargs): - # To ad hoc add new global constants without duplication - # if the name was already registered in global_constants, it will not be - # added even if the intended value is different from its original value - - if name in self.global_constants: - blob_name = self.global_constants[name] - initializer_op = \ - LayerModelHelper._get_global_constant_initializer_op( - blob_name, *args, **kwargs - ) - # check if the original initializer is the same as the one intended - # now - assert utils.OpAlmostEqual( - initializer_op, - self.global_constant_initializers[blob_name], - 'debug_info' - ), \ - "conflict initializers for global constant %s, " \ - "previous %s, now %s" % ( - blob_name, str(initializer_op), - str(self.global_constant_initializers[blob_name])) - return blob_name - return self.add_global_constant(name, *args, **kwargs) - - def _init_global_constants(self): - self.global_constants = {} - self.global_constant_initializers = {} - self.add_global_constant('ONE', 1.0) - self.add_global_constant('NAN', float("NaN")) - self.add_global_constant('ZERO', 0.0) - self.add_global_constant('ZERO_RANGE', [0, 0], dtype='int32') - - def _add_global_constants(self, init_net): - for initializer_op in self.global_constant_initializers.values(): - init_net._net.op.extend([initializer_op]) - - def create_init_net(self, name): - init_net = core.Net(name) - self._add_global_constants(init_net) - return init_net - - def _validate_param_shape(self, param_name, shape): - if param_name not in self._param_to_shape: - return - - ref_shape = self._param_to_shape[param_name] - - if shape != ref_shape: - raise ValueError( - "Got inconsistent shapes between shared parameters " - "when trying to map a blob in scope {0} to {1}. ref_shape : " - " {2}, shape : {3}".format( - scope.CurrentNameScope(), param_name, ref_shape, shape) - ) - - def _validate_param_optim(self, param_name, optim): - # there are three possible values for optim: - # 1) None (which will use self._default_optimizer after this layer is instantiated) - # 2) self.NoOptim - # 3) an instance of Optimizer class such as AdagradOptimizer - - # this implies this parameter is not shared with any other parameter so far - if param_name not in self.param_to_optim: - return - - logger.info("{} shares the same parameter with another parameter. " - "Validating if the same optimizer has been specified for them.".format( - param_name, - )) - - ref_optim = self.param_to_optim[param_name] - - if optim is None: - assert ref_optim == self._default_optimizer, ( - "Optim for {} is None which will fall back to use default_optimizer. " - "However, the optimizer that has been specified for this shared parameter " - "is {} which is different from default_optimizer {}. 
" - "Please check the optimizers specified for parameters shared " - "with {} and the default_optimizer to ensure the consistency.".format( - param_name, ref_optim, self._default_optimizer, param_name - ) - ) - elif optim == self.NoOptim: - assert ref_optim == self.NoOptim, ( - "Optim for {} is NoOptim. However, the optimizer for the parameters " - "shared with {} is {} which is different from NoOptim. " - "Please check the optimizer specified for other parameters in the " - "shared group to ensure consistency.".format( - param_name, param_name, ref_optim - ) - ) - elif isinstance(optim, Optimizer): - assert isinstance(ref_optim, Optimizer), ( - "Optim for {} is an instance of Optimizer. However, the optimizer " - "for the parameters shared with {} is {} which is not an instance " - "of Optimizer. Please check the optimizer specified for other " - " parameters in the shared group to ensure consistency.".format( - param_name, param_name, ref_optim, optim - ) - ) - - assert type(optim) is type(ref_optim) and optim.attributes == ref_optim.attributes, ( - "Optim for {} is an instance of Optimizer. However, the optimizer " - "for the parameters shared with {} is {}. " - "This optimizer either doesn't have the same type as the current optimizer: " - "{} vs {}, or its attributes such as learning rate are different from " - "that of current optimizer which is {} vs {}. " - "Please check the optimizer specified for other parameters in the " - "shared group to ensure consistency.".format( - param_name, param_name, ref_optim, type(optim), type(ref_optim), optim.attributes, ref_optim.attributes - ) - ) - else: - raise ValueError("optim should be either None, NoOptim, or an instance of Optimizer, Got {} ".format(optim)) - - def create_param(self, param_name, shape, initializer, optimizer=None, - ps_param=None, regularizer=None): - if isinstance(param_name, core.BlobReference): - param_name = str(param_name) - elif isinstance(param_name, str): - # Parameter name will be equal to current Namescope that got - # resolved with the respect of parameter sharing of the scopes. 
- param_name = parameter_sharing_context.get_parameter_name( - param_name) - else: - raise ValueError("Unsupported type for param_name") - - param_blob = core.BlobReference(param_name) - - if len(initializer) == 1: - init_op_args = {} - else: - assert len(initializer) == 2 - init_op_args = copy.deepcopy(initializer[1]) - if shape is not None: - assert 'shape' not in init_op_args - init_op_args.update({'shape': shape}) - - initializer_op = None - if self._initialize_params: - initializer_op = core.CreateOperator( - initializer[0], - [], - param_blob, - **init_op_args - ) - - param = layers.LayerParameter( - parameter=param_blob, - initializer=initializer_op, - optimizer=optimizer, - ps_param=ps_param, - regularizer=regularizer - ) - - self._validate_param_shape(param_name, shape) - - self._validate_param_optim(param_name, optimizer) - - self._param_to_shape[param_name] = shape - - return param - - def next_layer_name(self, prefix): - base_name = core.ScopedName(prefix) - name = base_name - index = 0 - while name in self._layer_names: - name = base_name + '_auto_' + str(index) - index += 1 - - self._layer_names.add(name) - return name - - def add_layer(self, layer): - self._layers.append(layer) - for param in layer.get_parameters(): - assert isinstance(param.parameter, core.BlobReference) - - self.param_to_optim[str(param.parameter)] = \ - param.optimizer or self.default_optimizer - - self.params.append(param.parameter) - if isinstance(param, layers.LayerParameter): - logger.info("Add parameter regularizer {0}".format(param.parameter)) - self.param_to_reg[param.parameter] = param.regularizer - elif isinstance(param, ParameterInfo): - # TODO: - # Currently, LSTM and RNNcells, which use ModelHelper instead of - # LayerModelHelper as super class, are called in pooling_methods - # In ModelHelper, regularization is not supported in create_param - # We will unify the way of create_param of ModelHelper and - # LayerModelHelper in the future. - logger.info('regularization is unsupported for ParameterInfo object') - else: - raise ValueError( - 'unknown object type besides ParameterInfo and LayerParameter: {}' - .format(param) - ) - - # The primary value of adding everything to self.net - generation of the - # operators right away, i.e. if error happens it'll be detected - # immediately. Other than this - create_x_net should be called. - layer.add_operators(self.net, self.param_init_net) - return layer.output_schema - - def get_parameter_blobs(self): - param_blobs = [] - for layer in self._layers: - for param in layer.get_parameters(): - param_blobs.append(param.parameter) - - return param_blobs - - def add_post_grad_net_modifiers(self, modifier): - assert modifier not in self._post_grad_net_modifiers,\ - "{0} is already in {1}".format(modifier, self._post_grad_net_modifiers) - assert isinstance(modifier, NetModifier),\ - "{} has to be a NetModifier instance".format(modifier) - self._post_grad_net_modifiers.append(modifier) - - def add_final_net_modifiers(self, modifier): - assert modifier not in self._final_net_modifiers,\ - "{0} is already in {1}".format(modifier, self._final_net_modifiers) - assert isinstance(modifier, NetModifier),\ - "{} has to be a NetModifier instance".format(modifier) - self._final_net_modifiers.append(modifier) - - @property - def seed(self): - return self._seed - - @property - def sequence_seed(self): - return self._sequence_seed - - def store_seed(self, seed, sequence_seed=True): - # Store seed config that will be applied to each op in the net. 
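# A pure-Python sketch of the "_auto_<n>" name deduplication implemented by
# next_layer_name above (add_loss uses the same scheme for loss field names);
# next_unique is a hypothetical helper for illustration.
def next_unique(base, taken):
    name, index = base, 0
    while name in taken:
        name = '{}_auto_{}'.format(base, index)
        index += 1
    taken.add(name)
    return name

names = set()
assert [next_unique('fc', names) for _ in range(3)] == \
    ['fc', 'fc_auto_0', 'fc_auto_1']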
- self._seed = seed - # If sequence_seed is True, the i-th op has rand_seed=`seed + i` - self._sequence_seed = sequence_seed - - def apply_seed(self, net): - if self._seed: - net.set_rand_seed(self._seed, self._sequence_seed) - - @property - def default_optimizer(self): - return self._default_optimizer - - @default_optimizer.setter - def default_optimizer(self, optimizer): - self._default_optimizer = optimizer - - @property - def input_feature_schema(self): - return self._input_feature_schema - - @property - def trainer_extra_schema(self): - return self._trainer_extra_schema - - @property - def metrics_schema(self): - """ - Returns the schema that represents model output that should be used for - metric reporting. - - During the training/evaluation this schema will be appended to the - schema that represents model output. - """ - return self._metrics_schema - - @property - def output_schema(self): - assert self._output_schema is not None - return self._output_schema - - @output_schema.setter - def output_schema(self, schema): - assert self._output_schema is None - self._output_schema = schema - - @property - def preproc_output_schema(self): - assert self._preproc_output_schema is not None - return self._preproc_output_schema - - @preproc_output_schema.setter - def preproc_output_schema(self, schema): - assert self._preproc_output_schema is None - self._preproc_output_schema = schema - - @property - def prediction(self): - assert self._prediction, "model prediction is empty" - return self._prediction - - def add_prediction(self, prediction, weight=1.0): - assert prediction is not None, "Added prediction should not be None" - self._prediction.append((prediction, weight)) - - @property - def transfer_learning_blob_name_mappings(self): - return self._transfer_learning_blob_name_mappings - - @transfer_learning_blob_name_mappings.setter - def transfer_learning_blob_name_mappings(self, blob_name_mappings): - assert blob_name_mappings is not None, "Transfer learning blob name mappings should not be None" - self._transfer_learning_blob_name_mappings = blob_name_mappings - - @property - def loss(self): - assert self._loss is not None - return self._loss - - @loss.setter - def loss(self, loss): - assert self._loss is None - self._loss = loss - - def has_loss(self): - return self._loss is not None - - def add_loss(self, loss, name='unnamed'): - assert loss is not None, "Added loss should not be None" - assert isinstance(loss, schema.Scalar) or isinstance( - loss, schema.Struct - ), "Added loss should be a scalar or a struct" - if self._loss is None: - self._loss = schema.Struct((name, loss)) - else: - # loss could've been set through model.loss directly which could be - # a scalar - if isinstance(self._loss, schema.Scalar): - self._loss = schema.Struct(('unnamed', self._loss)) - - prefix_base = name + '_auto_' - index = 0 - prefix = name - while prefix in self._loss: - prefix = prefix_base + str(index) - index += 1 - loss_struct = schema.Struct((prefix, loss)) - self._loss = self._loss + loss_struct - - def add_output_schema(self, name, value): - assert value is not None, \ - 'Added output schema {} should not be None'.format(name) - assert isinstance(value, schema.Scalar) or \ - isinstance(value, schema.Struct), \ - 'Added output schema {} should be a scalar or a struct.\n\ - Now it is {}.'.format(name, type(value)) - if self._output_schema is None: # be the first field - self._output_schema = schema.Struct((name, value)) - else: # merge with other fields - assert name not in 
self._output_schema.fields, \ - 'Output Schema Field {} already exists'.format(name) - self._output_schema = \ - self._output_schema + schema.Struct((name, value)) - - def add_trainer_extra_schema(self, trainer_extra_schema): - trainer_extra_record = schema.NewRecord(self.net, trainer_extra_schema) - self._trainer_extra_schema += trainer_extra_record - - def __getattr__(self, layer): - def is_functional_layer(layer): - if core.IsOperator(layer): - return True - elif layer.startswith('FunctionalLayer'): - return True - else: - return False - - def resolve_functional_layer(layer): - if core.IsOperator(layer): - return layer - elif layer.startswith('FunctionalLayer'): - return layer[len('FunctionalLayer'):] - else: - raise ValueError( - '%s cannot be resolved as functional layer' % layer - ) - - if layer.startswith('__'): - raise AttributeError(layer) - - # TODO(amalevich): Add support for ifbpy inline documentation - if layers.layer_exists(layer): - def wrapper(*args, **kwargs): - new_layer = layers.create_layer(layer, self, *args, **kwargs) - if kwargs.get("output_to_metrics", False): - new_layer.export_output_for_metrics() - if kwargs.get("params_to_metrics", False): - new_layer.export_params_for_metrics() - return self.add_layer(new_layer) - return wrapper - elif is_functional_layer(layer): - # TODO(xlwang): Designated layer shadows the usage of an op as a - # single layer. To enforce using an op (e.g. Split) as a functional - # layer, one can call 'model.FunctionalLayerSplit' - layer = resolve_functional_layer(layer) - - def wrapper(*args, **kwargs): - def apply_operator(net, in_record, out_record, **kwargs): - # TODO(amalevich): Switch to net.operator as soon as it gets - # landed - net.__getattr__(layer)(in_record.field_blobs(), - out_record.field_blobs(), - **kwargs) - - if 'name' not in kwargs: - kwargs['name'] = layer - - new_layer = layers.create_layer( - 'Functional', - self, *args, function=apply_operator, - **kwargs - ) - - if kwargs.get("output_to_metrics", False): - new_layer.export_output_for_metrics() - if kwargs.get("params_to_metrics", False): - new_layer.export_params_for_metrics() - - return self.add_layer(new_layer) - return wrapper - else: - # this needs to be an AttributeError to fit hasattr semantics - raise AttributeError( - "Trying to create non-registered layer: {}".format(layer)) - - @property - def layers(self): - return self._layers - - def apply_regularizers_on_loss( - self, - train_net, - train_init_net, - blob_to_device=None, - ): - logger.info("apply regularizer on loss") - for param, regularizer in self.param_to_reg.items(): - if regularizer is None: - continue - logger.info("add regularizer {0} for param {1} to loss".format(regularizer, param)) - assert isinstance(regularizer, Regularizer) - added_loss_blob = regularizer(train_net, train_init_net, param, grad=None, - by=RegularizationBy.ON_LOSS) - logger.info(added_loss_blob) - if added_loss_blob is not None: - self.add_loss( - schema.Scalar(blob=added_loss_blob), - str(added_loss_blob) - ) - - def apply_regularizers_after_optimizer( - self, - train_net, - train_init_net, - grad_map, - blob_to_device=None, - ): - logger.info("apply regularizer after optimizer") - CPU = muji.OnCPU() - # if given, blob_to_device is a map from blob to device_option - blob_to_device = blob_to_device or {} - for param, regularizer in self.param_to_reg.items(): - if regularizer is None: - continue - assert isinstance(regularizer, Regularizer) - logger.info("add regularizer {0} for param {1} to optimizer".format(regularizer,
param)) - device = get_param_device( - param, - grad_map.get(str(param)), - param_to_device=blob_to_device, - default_device=CPU, - ) - with core.DeviceScope(device): - regularizer( - train_net, train_init_net, param, grad=grad_map.get(str(param)), - by=RegularizationBy.AFTER_OPTIMIZER - ) - - def apply_post_grad_net_modifiers( - self, - trainer_net, - trainer_init_net, - grad_map, - blob_to_device=None, - modify_output_record=False, - ): - param_grad_map = {param: grad_map[param] - for param in self.param_to_optim.keys() if param in grad_map} - - for modifier in self._post_grad_net_modifiers: - modifier(trainer_net, trainer_init_net, param_grad_map, - blob_to_device=blob_to_device, - modify_output_record=modify_output_record) - - def apply_final_net_modifiers( - self, - trainer_net, - trainer_init_net, - grad_map, - blob_to_device=None, - modify_output_record=False, - ): - for modifier in self._final_net_modifiers: - modifier(trainer_net, trainer_init_net, grad_map, - blob_to_device=blob_to_device, - modify_output_record=modify_output_record) - - def apply_optimizers( - self, - train_net, - train_init_net, - grad_map, - blob_to_device=None, - ): - CPU = muji.OnCPU() - # if given, blob_to_device is a map from blob to device_option - blob_to_device = blob_to_device or {} - for param, optimizer in self.param_to_optim.items(): - assert optimizer is not None, \ - "default optimizer must have been set in add_layer" - # note that not all params has gradient and thus we sent None if - # gradient does not exists - device = get_param_device( - param, - grad_map.get(str(param)), - param_to_device=blob_to_device, - default_device=CPU, - ) - if device is not None: - # extra info is not applicable for optimizers - del device.extra_info[:] - - with core.DeviceScope(device): - optimizer( - train_net, train_init_net, param, grad_map.get(str(param))) - - def _GetOne(self): - return self.global_constants['ONE'] - - # An optimizer which allows us to do NO optimization - def NoOptim(self, *args, **kwargs): - pass - - @property - def breakdown_map(self): - return self._breakdown_map - - @breakdown_map.setter - def breakdown_map(self, breakdown_map): - # TODO(xlwang): provide more rich feature information in breakdown_map; - # and change the assertion accordingly - assert isinstance(breakdown_map, dict) - assert all(isinstance(k, str) for k in breakdown_map) - assert sorted(breakdown_map.values()) == list(range(len(breakdown_map))) - self._breakdown_map = breakdown_map diff --git a/caffe2/python/layer_model_instantiator.py b/caffe2/python/layer_model_instantiator.py deleted file mode 100644 index 9284b9b9e687..000000000000 --- a/caffe2/python/layer_model_instantiator.py +++ /dev/null @@ -1,113 +0,0 @@ -## @package layer_model_instantiator -# Module caffe2.python.layer_model_instantiator - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import InstantiationContext -from caffe2.python.layers.tags import Tags - - -def _filter_layers(layers, include_tags): - if include_tags is None: - return layers - include_tags = set(include_tags) - return [l for l in layers if not include_tags.isdisjoint(l.tags)] - - -def shrink_output_schema(net, out_schema): - if len(out_schema.field_names()) <= 1: - return out_schema - exists = [net.BlobIsDefined(blob) for blob in out_schema.field_blobs()] - return schema.from_column_list( - [ - col_name for ok, col_name in - zip(exists, out_schema.field_names()) if ok - ], - [ - col_type for ok, col_type in - zip(exists, out_schema.field_types()) if ok - ], 
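# The layer selection rule in _filter_layers above keeps a layer when its tag
# set intersects include_tags (i.e. isdisjoint is False), and keeps everything
# when include_tags is None. A tiny self-contained illustration with
# hypothetical tagged layers:
from collections import namedtuple

TaggedLayer = namedtuple('TaggedLayer', 'name tags')

def filter_layers(layers, include_tags):
    if include_tags is None:
        return layers
    include_tags = set(include_tags)
    return [l for l in layers if not include_tags.isdisjoint(l.tags)]

demo = [TaggedLayer('fc', {'TRAIN_ONLY'}), TaggedLayer('emb', {'PREPROCESSING'})]
assert [l.name for l in filter_layers(demo, {'TRAIN_ONLY'})] == ['fc']
assert filter_layers(demo, None) == demo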
- [ - col_blob for ok, col_blob in - zip(exists, out_schema.field_blobs()) if ok - ], - [ - col_meta for ok, col_meta in - zip(exists, out_schema.field_metadata()) if ok - ] - ) - - -def generate_predict_net(model, include_tags=None): - predict_net = core.Net('predict_net') - - for layer in _filter_layers(model.layers, include_tags): - if Tags.EXCLUDE_FROM_PREDICTION not in layer.tags: - layer.add_operators( - predict_net, context=InstantiationContext.PREDICTION) - - predict_net.set_input_record(model.input_feature_schema.clone()) - output_schema = shrink_output_schema( - predict_net, model.output_schema.clone() - ) - predict_net.set_output_record(output_schema) - return predict_net - - -def generate_eval_net(model, include_tags=None): - eval_net = core.Net('eval_net') - - for layer in _filter_layers(model.layers, include_tags): - if Tags.EXCLUDE_FROM_EVAL not in layer.tags: - layer.add_operators(eval_net, context=InstantiationContext.EVAL) - - input_schema = model.input_feature_schema + model.trainer_extra_schema - eval_net.set_input_record(input_schema) - output_schema = shrink_output_schema( - eval_net, model.output_schema + model.metrics_schema - ) - eval_net.set_output_record(output_schema) - return eval_net - - -def _generate_training_net_only(model, include_tags=None): - train_net = core.Net('train_net') - train_init_net = model.create_init_net('train_init_net') - - for layer in _filter_layers(model.layers, include_tags): - if Tags.EXCLUDE_FROM_TRAIN not in layer.tags: - layer.add_operators(train_net, train_init_net) - - input_schema = model.input_feature_schema + model.trainer_extra_schema - train_net.set_input_record(input_schema) - output_schema = shrink_output_schema( - train_net, model.output_schema + model.metrics_schema - ) - train_net.set_output_record(output_schema) - return train_init_net, train_net - - -def generate_training_nets_forward_only(model, include_tags=None): - train_init_net, train_net = _generate_training_net_only(model, include_tags) - return train_init_net, train_net - - -def generate_training_nets(model, include_tags=None): - train_init_net, train_net = _generate_training_net_only(model, include_tags) - - model.apply_regularizers_on_loss(train_net, train_init_net) - if not model.has_loss(): - return train_init_net, train_net - loss = model.loss - grad_map = train_net.AddGradientOperators(loss.field_blobs()) - model.apply_post_grad_net_modifiers(train_net, train_init_net, grad_map, - modify_output_record=True) - model.apply_optimizers(train_net, train_init_net, grad_map) - model.apply_regularizers_after_optimizer(train_net, train_init_net, grad_map) - model.apply_final_net_modifiers(train_net, train_init_net, grad_map, - modify_output_record=True) - - return train_init_net, train_net diff --git a/caffe2/python/layer_parameter_sharing_test.py b/caffe2/python/layer_parameter_sharing_test.py deleted file mode 100644 index 84b2ed1deddf..000000000000 --- a/caffe2/python/layer_parameter_sharing_test.py +++ /dev/null @@ -1,232 +0,0 @@ - - - - - -from caffe2.python import core, scope -from caffe2.python.modeling.parameter_sharing import ( - ParameterSharing, -) -from caffe2.python.optimizer import AdagradOptimizer, AdamOptimizer -from caffe2.python.layer_test_util import LayersTestCase - - -class ParameterSharingTest(LayersTestCase): - - def test_layer_parameter_name(self): - output_dims = 2 - with scope.NameScope('global_scope'): - fc1_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - 
self.assertEqual(self.model.layers[-1].w, 'global_scope/fc/w') - self.assertEqual(fc1_output(), 'global_scope/fc/output') - - with scope.NameScope('nested_scope'): - fc2_output = self.model.FC( - fc1_output, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/nested_scope/fc/w') - self.assertEqual(fc2_output(), - 'global_scope/nested_scope/fc/output') - - fc3_output = self.model.FC( - fc1_output, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/nested_scope/fc_auto_0/w') - self.assertEqual(fc3_output(), - 'global_scope/nested_scope/fc_auto_0/output') - - def test_layer_shared_parameter_name_different_namescopes(self): - output_dims = 2 - with scope.NameScope('global_scope'): - with ParameterSharing({'scope_1': 'scope_0'}): - with scope.NameScope('scope_0'): - fc1_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/scope_0/fc/w') - self.assertEqual(fc1_output(), - 'global_scope/scope_0/fc/output') - - with scope.NameScope('scope_1'): - fc2_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/scope_0/fc/w') - self.assertEqual(fc2_output(), - 'global_scope/scope_1/fc/output') - - def test_layer_shared_parameter_name_within_same_namescope(self): - output_dims = 2 - with scope.NameScope('global_scope'): - with ParameterSharing({'fc_auto_0': 'fc'}): - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/fc/w') - - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/fc/w') - - def test_layer_shared_parameter_name_within_same_namescope_customized_name(self): - output_dims = 2 - with scope.NameScope('global_scope'): - with ParameterSharing({'new_fc': 'shared_fc'}): - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - name='shared_fc' - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/shared_fc/w') - - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - name='new_fc' - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/shared_fc/w') - - def test_layer_shared_parameter_name_different_shapes(self): - output_dims = 2 - with scope.NameScope('global_scope'): - with ParameterSharing({'fc_auto_0': 'fc'}): - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - self.assertEqual(self.model.layers[-1].w, - 'global_scope/fc/w') - - with self.assertRaisesRegex(ValueError, 'Got inconsistent shapes .*'): - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims + 1 - ) - - def test_layer_duplicated_parameter_init(self): - output_dims = 2 - with scope.NameScope('global_scope'): - with ParameterSharing({'new_fc': 'shared_fc'}): - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - name='shared_fc' - ) - self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - name='new_fc' - ) - - train_init_net = core.Net('train_init_net') - train_net = core.Net('train_net') - for layer in self.model.layers: - layer.add_operators(train_net, train_init_net) - op_outputs = [] - for op in train_init_net._net.op: - op_outputs.extend(op.output) - - # only fill these parameter blobs once - self.assertEqual( - 
sorted(op_outputs), - ['global_scope/shared_fc/b', 'global_scope/shared_fc/w'] - ) - - def test_layer_shared_parameter_optim_validator(self): - """ - This test is to cover the _validate_param_optim function in - layer_model_helper class. - """ - - output_dims = 2 - - adagrad_optim = AdagradOptimizer( - alpha=0.004, - epsilon=0.02, - ) - - self.model.default_optimizer = adagrad_optim - - # the following covers the branch -- optim is None - with scope.NameScope('global_scope_0'): - with ParameterSharing({'scope_1': 'scope_0'}): - with scope.NameScope('scope_0'): - fc1_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=self.model.NoOptim, - ) - - with scope.NameScope('scope_1'), self.assertRaises(Exception): - fc2_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims - ) - - # the following covers the branch -- optim is NoOptim - with scope.NameScope('global_scope_1'): - with ParameterSharing({'scope_1': 'scope_0'}): - with scope.NameScope('scope_0'): - fc1_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=None, - ) - - with scope.NameScope('scope_1'), self.assertRaises(Exception): - fc2_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=self.model.NoOptim, - ) - - # the following covers the branch -- optim is an instance of Optimizer - adagrad_optim_2 = AdagradOptimizer( - alpha=0.005, - epsilon=0.02, - ) - - adam_optim = AdamOptimizer() - - self.model.default_optimizer = adagrad_optim_2 - - with scope.NameScope('global_scope_2'): - with ParameterSharing({'scope_1': 'scope_0', 'scope_2': 'scope_0'}): - with scope.NameScope('scope_0'): - fc1_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=None, # it will use adagrad_optim_2 - ) - - with scope.NameScope('scope_1'), self.assertRaises(Exception): - fc2_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=adagrad_optim, - ) - - with scope.NameScope('scope_2'), self.assertRaises(Exception): - fc2_output = self.model.FC( - self.model.input_feature_schema.float_features, - output_dims, - weight_optim=adam_optim, - ) diff --git a/caffe2/python/layer_test_util.py b/caffe2/python/layer_test_util.py deleted file mode 100644 index 32bf58edeb0d..000000000000 --- a/caffe2/python/layer_test_util.py +++ /dev/null @@ -1,139 +0,0 @@ -## @package layer_test_util -# Module caffe2.python.layer_test_util - - - - - -from collections import namedtuple - -from caffe2.python import ( - core, - layer_model_instantiator, - layer_model_helper, - schema, - test_util, - workspace, - utils, -) -from caffe2.proto import caffe2_pb2 -import numpy as np - - -# pyre-fixme[13]: Pyre can't detect attribute initialization through the -# super().__new__ call -class OpSpec(namedtuple("OpSpec", "type input output arg")): - - def __new__(cls, op_type, op_input, op_output, op_arg=None): - return super(OpSpec, cls).__new__(cls, op_type, op_input, - op_output, op_arg) - - -class LayersTestCase(test_util.TestCase): - - def setUp(self): - super().setUp() - self.setup_example() - - def setup_example(self): - """ - This is undocumented feature in hypothesis, - https://github.com/HypothesisWorks/hypothesis-python/issues/59 - """ - workspace.ResetWorkspace() - self.reset_model() - - def reset_model(self, input_feature_schema=None, trainer_extra_schema=None): - input_feature_schema = 
input_feature_schema or schema.Struct( - ('float_features', schema.Scalar((np.float32, (32,)))), - ) - trainer_extra_schema = trainer_extra_schema or schema.Struct() - self.model = layer_model_helper.LayerModelHelper( - 'test_model', - input_feature_schema=input_feature_schema, - trainer_extra_schema=trainer_extra_schema) - - def new_record(self, schema_obj): - return schema.NewRecord(self.model.net, schema_obj) - - def get_training_nets(self, add_constants=False): - """ - We don't use - layer_model_instantiator.generate_training_nets_forward_only() - here because it includes initialization of global constants, which make - testing tricky - """ - train_net = core.Net('train_net') - if add_constants: - train_init_net = self.model.create_init_net('train_init_net') - else: - train_init_net = core.Net('train_init_net') - for layer in self.model.layers: - layer.add_operators(train_net, train_init_net) - return train_init_net, train_net - - def get_eval_net(self): - return layer_model_instantiator.generate_eval_net(self.model) - - def get_predict_net(self): - return layer_model_instantiator.generate_predict_net(self.model) - - def run_train_net(self): - self.model.output_schema = schema.Struct() - train_init_net, train_net = \ - layer_model_instantiator.generate_training_nets(self.model) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - - def run_train_net_forward_only(self, num_iter=1): - self.model.output_schema = schema.Struct() - train_init_net, train_net = \ - layer_model_instantiator.generate_training_nets_forward_only( - self.model) - workspace.RunNetOnce(train_init_net) - assert num_iter > 0, 'num_iter must be larger than 0' - workspace.CreateNet(train_net) - workspace.RunNet(train_net.Proto().name, num_iter=num_iter) - - def assertBlobsEqual(self, spec_blobs, op_blobs): - """ - spec_blobs can either be None or a list of blob names. If it's None, - then no assertion is performed. The elements of the list can be None, - in that case, it means that position will not be checked. - """ - if spec_blobs is None: - return - self.assertEqual(len(spec_blobs), len(op_blobs)) - for spec_blob, op_blob in zip(spec_blobs, op_blobs): - if spec_blob is None: - continue - self.assertEqual(spec_blob, op_blob) - - def assertArgsEqual(self, spec_args, op_args): - self.assertEqual(len(spec_args), len(op_args)) - keys = [a.name for a in op_args] - - def parse_args(args): - operator = caffe2_pb2.OperatorDef() - # Generate the expected value in the same order - for k in keys: - v = args[k] - arg = utils.MakeArgument(k, v) - operator.arg.add().CopyFrom(arg) - return operator.arg - - self.assertEqual(parse_args(spec_args), op_args) - - def assertNetContainOps(self, net, op_specs): - """ - Given a net and a list of OpSpec's, check that the net match the spec - """ - ops = net.Proto().op - self.assertEqual(len(op_specs), len(ops)) - for op, op_spec in zip(ops, op_specs): - self.assertEqual(op_spec.type, op.type) - self.assertBlobsEqual(op_spec.input, op.input) - self.assertBlobsEqual(op_spec.output, op.output) - if op_spec.arg is not None: - self.assertArgsEqual(op_spec.arg, op.arg) - return ops diff --git a/caffe2/python/layers/__init__.py b/caffe2/python/layers/__init__.py deleted file mode 100644 index 487b7751fd08..000000000000 --- a/caffe2/python/layers/__init__.py +++ /dev/null @@ -1,39 +0,0 @@ - - - - - -from importlib import import_module -import pkgutil -import sys -from . 
import layers - - -def import_recursive(package): - """ - Takes a package and imports all modules underneath it - """ - - pkg_dir = package.__path__ - module_location = package.__name__ - for (_module_loader, name, ispkg) in pkgutil.iter_modules(pkg_dir): - module_name = "{}.{}".format(module_location, name) # Module/package - module = import_module(module_name) - if ispkg: - import_recursive(module) - - -def find_subclasses_recursively(base_cls, sub_cls): - cur_sub_cls = base_cls.__subclasses__() - sub_cls.update(cur_sub_cls) - for cls in cur_sub_cls: - find_subclasses_recursively(cls, sub_cls) - - -import_recursive(sys.modules[__name__]) - -model_layer_subcls = set() -find_subclasses_recursively(layers.ModelLayer, model_layer_subcls) - -for cls in list(model_layer_subcls): - layers.register_layer(cls.__name__, cls) diff --git a/caffe2/python/layers/adaptive_weight.py b/caffe2/python/layers/adaptive_weight.py deleted file mode 100644 index 143c2df80d89..000000000000 --- a/caffe2/python/layers/adaptive_weight.py +++ /dev/null @@ -1,160 +0,0 @@ -# @package adaptive_weight -# Module caffe2.fb.python.layers.adaptive_weight - - -import numpy as np -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer -from caffe2.python.regularizer import BoundedGradientProjection, LogBarrier - - -""" -Implementation of adaptive weighting: https://arxiv.org/pdf/1705.07115.pdf -""" - - -class AdaptiveWeight(ModelLayer): - def __init__( - self, - model, - input_record, - name="adaptive_weight", - optimizer=None, - weights=None, - enable_diagnose=False, - estimation_method="log_std", - pos_optim_method="log_barrier", - reg_lambda=0.1, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - self.output_schema = schema.Scalar( - np.float32, self.get_next_blob_reference("adaptive_weight") - ) - self.data = self.input_record.field_blobs() - self.num = len(self.data) - self.optimizer = optimizer - if weights is not None: - assert len(weights) == self.num - else: - weights = [1. 
/ self.num for _ in range(self.num)]
-        assert min(weights) > 0, "initial weights must be positive"
-        self.weights = np.array(weights).astype(np.float32)
-        self.estimation_method = str(estimation_method).lower()
-        # used in the positivity-constrained parameterization when the
-        # estimation method is inv_var; the optimization method is either
-        # log barrier or gradient projection
-        self.pos_optim_method = str(pos_optim_method).lower()
-        self.reg_lambda = float(reg_lambda)
-        self.enable_diagnose = enable_diagnose
-        self.init_func = getattr(self, self.estimation_method + "_init")
-        self.weight_func = getattr(self, self.estimation_method + "_weight")
-        self.reg_func = getattr(self, self.estimation_method + "_reg")
-        self.init_func()
-        if self.enable_diagnose:
-            self.weight_i = [
-                self.get_next_blob_reference("adaptive_weight_%d" % i)
-                for i in range(self.num)
-            ]
-            for i in range(self.num):
-                self.model.add_ad_hoc_plot_blob(self.weight_i[i])
-
-    def concat_data(self, net):
-        reshaped = [net.NextScopedBlob("reshaped_data_%d" % i) for i in range(self.num)]
-        # coerce shape for single real values
-        for i in range(self.num):
-            net.Reshape(
-                [self.data[i]],
-                [reshaped[i], net.NextScopedBlob("new_shape_%d" % i)],
-                shape=[1],
-            )
-        concated = net.NextScopedBlob("concated_data")
-        net.Concat(
-            reshaped, [concated, net.NextScopedBlob("concated_new_shape")], axis=0
-        )
-        return concated
-
-    def log_std_init(self):
-        """
-        mu = 2 log sigma, where sigma is the standard deviation
-        per task objective:
-        min 1 / 2 / e^mu X + mu / 2
-        """
-        values = np.log(1. / 2. / self.weights)
-        initializer = (
-            "GivenTensorFill",
-            {"values": values, "dtype": core.DataType.FLOAT},
-        )
-        self.mu = self.create_param(
-            param_name="mu",
-            shape=[self.num],
-            initializer=initializer,
-            optimizer=self.optimizer,
-        )
-
-    def log_std_weight(self, x, net, weight):
-        """
-        min 1 / 2 / e^mu X + mu / 2
-        """
-        mu_neg = net.NextScopedBlob("mu_neg")
-        net.Negative(self.mu, mu_neg)
-        mu_neg_exp = net.NextScopedBlob("mu_neg_exp")
-        net.Exp(mu_neg, mu_neg_exp)
-        net.Scale(mu_neg_exp, weight, scale=0.5)
-
-    def log_std_reg(self, net, reg):
-        net.Scale(self.mu, reg, scale=0.5)
-
-    def inv_var_init(self):
-        """
-        k = 1 / variance
-        per task objective:
-        min 1 / 2 * k X - 1 / 2 * log k
-        """
-        values = 2.
* self.weights - initializer = ( - "GivenTensorFill", - {"values": values, "dtype": core.DataType.FLOAT}, - ) - if self.pos_optim_method == "log_barrier": - regularizer = LogBarrier(reg_lambda=self.reg_lambda) - elif self.pos_optim_method == "pos_grad_proj": - regularizer = BoundedGradientProjection(lb=0, left_open=True) - else: - raise TypeError( - "unknown positivity optimization method: {}".format( - self.pos_optim_method - ) - ) - self.k = self.create_param( - param_name="k", - shape=[self.num], - initializer=initializer, - optimizer=self.optimizer, - regularizer=regularizer, - ) - - def inv_var_weight(self, x, net, weight): - net.Scale(self.k, weight, scale=0.5) - - def inv_var_reg(self, net, reg): - log_k = net.NextScopedBlob("log_k") - net.Log(self.k, log_k) - net.Scale(log_k, reg, scale=-0.5) - - def _add_ops_impl(self, net, enable_diagnose): - x = self.concat_data(net) - weight = net.NextScopedBlob("weight") - reg = net.NextScopedBlob("reg") - weighted_x = net.NextScopedBlob("weighted_x") - weighted_x_add_reg = net.NextScopedBlob("weighted_x_add_reg") - self.weight_func(x, net, weight) - self.reg_func(net, reg) - net.Mul([weight, x], weighted_x) - net.Add([weighted_x, reg], weighted_x_add_reg) - net.SumElements(weighted_x_add_reg, self.output_schema()) - if enable_diagnose: - for i in range(self.num): - net.Slice(weight, self.weight_i[i], starts=[i], ends=[i + 1]) - - def add_ops(self, net): - self._add_ops_impl(net, self.enable_diagnose) diff --git a/caffe2/python/layers/add_bias.py b/caffe2/python/layers/add_bias.py deleted file mode 100644 index 811845944cd8..000000000000 --- a/caffe2/python/layers/add_bias.py +++ /dev/null @@ -1,44 +0,0 @@ -## @package add_bias -# Module caffe2.python.layers.add_bias - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -import math - - -class AddBias(ModelLayer): - - def __init__(self, model, input_record, bias_init=None, - bias_optim=None, name='add_bias'): - super().__init__(model, name, input_record) - assert isinstance(input_record, schema.Scalar), "Incorrect input type" - assert len(input_record.field_type().shape) > 0, ( - "AddBias expects limited dimensions of the input tensor") - - input_dims = input_record.field_type().shape[0] - assert input_dims > 0, ( - "AddBias expects input dimensions > 0, got {}".format(input_dims)) - - scale = math.sqrt(1.0 / input_dims) - bias_init = bias_init if bias_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) - - self.b = self.create_param( - param_name='b', - shape=[input_dims, ], - initializer=bias_init, - optimizer=bias_optim, - ) - - self.output_schema = schema.Scalar( - (input_record.field_type().base, (input_dims, )), - self.get_next_blob_reference('output') - ) - - def add_ops(self, net): - net.Add(self.input_record.field_blobs() + [self.b], - self.output_schema.field_blobs(), broadcast=1) diff --git a/caffe2/python/layers/arc_cosine_feature_map.py b/caffe2/python/layers/arc_cosine_feature_map.py deleted file mode 100644 index 3b52652cdbf7..000000000000 --- a/caffe2/python/layers/arc_cosine_feature_map.py +++ /dev/null @@ -1,178 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -import numpy as np - - -class ArcCosineFeatureMap(ModelLayer): - """ - A general version of the arc-cosine kernel feature map (s = 1 restores - the original arc-cosine kernel feature map). 
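For intuition, a minimal NumPy sketch of the map described below; the helper name arc_cosine_map and the dense arrays w, b are illustrative stand-ins, not part of the deleted API:

    import numpy as np

    def arc_cosine_map(x_orig, w, b, s=1):
        # FC step: x = w * x_orig + b
        x = x_orig @ w.T + b
        if s == 0:
            # pure Heaviside step H(x)
            return (x > 0).astype(x.dtype)
        # H(x) * x^s (s = 1 is plain ReLU)
        return np.where(x > 0, x ** s, 0.0)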
-
-    Applies H(x) * x^s, where H is the Heaviside step function and x is the
-    input after applying FC (such that x = w * x_orig + b).
-
-    For more information, see the original paper:
-    http://cseweb.ucsd.edu/~saul/papers/nips09_kernel.pdf
-
-    Inputs:
-        output_dims -- dimensions of the output vector
-        s -- degree to raise transformed features
-        scale -- amount to scale the standard deviation
-        weight_init -- initialization distribution for weight parameter
-        bias_init -- initialization distribution for bias parameter
-        weight_optim -- optimizer for weight params; None for random features
-        bias_optim -- optimizer for bias param; None for random features
-        set_weight_as_global_constant -- if True, initialized random parameters
-                                         will be constant across all distributed
-                                         instances of the layer
-        initialize_output_schema -- if True, initialize output schema as Scalar
-                                    from Arc Cosine; else output schema is None
-    """
-    def __init__(
-            self,
-            model,
-            input_record,
-            output_dims,
-            s=1,
-            scale=1.0,
-            weight_init=None,
-            bias_init=None,
-            weight_optim=None,
-            bias_optim=None,
-            set_weight_as_global_constant=False,
-            initialize_output_schema=True,
-            name='arc_cosine_feature_map',
-            **kwargs):
-
-        super().__init__(model, name, input_record, **kwargs)
-        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
-        self.params = []
-        self.model = model
-        self.set_weight_as_global_constant = set_weight_as_global_constant
-
-        self.input_dims = input_record.field_type().shape[0]
-        assert self.input_dims >= 1, "Expected input dimensions >= 1, got %s" \
-            % self.input_dims
-
-        if initialize_output_schema:
-            self.output_schema = schema.Scalar(
-                (np.float32, (output_dims, )),
-                model.net.NextScopedBlob(name + '_output')
-            )
-
-        self.output_dims = output_dims
-        assert self.output_dims >= 1, "Expected output dimensions >= 1, got %s" \
-            % self.output_dims
-        self.s = s
-        assert (self.s >= 0), "Expected s >= 0, got %s" % self.s
-        assert isinstance(self.s, int), "Expected s to be type int, got type %s" \
-            % type(self.s)
-
-        assert (scale > 0.0), "Expected scale > 0, got %s" % scale
-        self.stddev = scale * np.sqrt(1.0 / self.input_dims)
-
-        # Initialize train_init_net parameters
-        # Random Parameters
-        if set_weight_as_global_constant:
-            w_init = np.random.normal(scale=self.stddev,
-                                      size=(self.output_dims, self.input_dims))
-            b_init = np.random.uniform(low=-0.5 * self.stddev,
-                                       high=0.5 * self.stddev,
-                                       size=self.output_dims)
-            self.random_w = self.model.add_global_constant(
-                name=self.name + "_fixed_rand_W",
-                array=w_init
-            )
-            self.random_b = self.model.add_global_constant(
-                name=self.name + "_fixed_rand_b",
-                array=b_init
-            )
-        else:
-            (self.random_w, self.random_b) = self._initialize_params(
-                'random_w',
-                'random_b',
-                w_init=weight_init,
-                b_init=bias_init,
-                w_optim=weight_optim,
-                b_optim=bias_optim
-            )
-
-    def _initialize_params(self, w_name, b_name, w_init=None, b_init=None,
-                           w_optim=None, b_optim=None):
-        """
-        Initializes the Layer Parameters for weight and bias terms for features
-
-        Inputs:
-            w_blob -- blob to contain w values
-            b_blob -- blob to contain b values
-            w_init -- initialization distribution for weight parameter
-            b_init -- initialization distribution for bias parameter
-            w_optim -- optimizer to use for w; if None, then will use no optimizer
-            b_optim -- optimizer to use for b; if None, then will use no optimizer
-        """
-
-        w_init = w_init if w_init else (
-            'GaussianFill', {'mean': 0.0, 'std': self.stddev}
-        )
-        w_optim = w_optim if w_optim else
self.model.NoOptim - - b_init = b_init if b_init else ( - 'UniformFill', {'min': -0.5 * self.stddev, 'max': 0.5 * self.stddev} - ) - b_optim = b_optim if b_optim else self.model.NoOptim - - w_param = self.create_param(param_name=w_name, - shape=(self.output_dims, self.input_dims), - initializer=w_init, - optimizer=w_optim) - - b_param = self.create_param(param_name=b_name, - shape=[self.output_dims], - initializer=b_init, - optimizer=b_optim) - - return [w_param, b_param] - - def _heaviside_with_power(self, net, input_features, output_blob, s): - """ - Applies Heaviside step function and Relu / exponentiation to features - depending on the value of s. - - Inputs: - net -- net with operators - input_features -- features to processes - output_blob -- output blob reference - s -- degree to raise the transformed features - """ - if s == 0: - softsign_features = net.Softsign([input_features], - net.NextScopedBlob('softsign')) - return net.Relu(softsign_features, output_blob) - elif s == 1: - return net.Relu([input_features], - output_blob) - else: - relu_features = net.Relu([input_features], - net.NextScopedBlob('relu_rand')) - pow_features = net.Pow([input_features], - net.NextScopedBlob('pow_rand'), - exponent=float(s - 1)) - return net.Mul([relu_features, pow_features], - output_blob) - - def add_ops(self, net): - input_blob = self.input_record.field_blobs() - - # Random features: wx + b - random_features = net.FC(input_blob + [self.random_w, self.random_b], - net.NextScopedBlob('random_features')) - # Process random features - self._heaviside_with_power(net, - random_features, - self.output_schema.field_blobs(), - self.s) diff --git a/caffe2/python/layers/batch_huber_loss.py b/caffe2/python/layers/batch_huber_loss.py deleted file mode 100644 index 72202314fe1a..000000000000 --- a/caffe2/python/layers/batch_huber_loss.py +++ /dev/null @@ -1,119 +0,0 @@ -# @package batch_huber_loss -# Module caffe2.python.layers.batch_huber_loss - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class BatchHuberLoss(ModelLayer): - - def __init__(self, model, input_record, name='batch_huber_loss', delta=1.0, **kwargs): - super().__init__(model, name, input_record, **kwargs) - - assert delta > 0 - - self._delta = delta - - assert schema.is_schema_subset( - schema.Struct( - ('label', schema.Scalar()), - ('prediction', schema.Scalar()) - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output')) - - def add_ops(self, net): - prediction = net.Squeeze( - self.input_record.prediction(), - net.NextScopedBlob('squeezed_prediction'), - dims=[1] - ) - - label = self.input_record.label.field_blobs() - if self.input_record.label.field_type().base != ( - self.input_record.prediction.field_type().base): - label = net.Cast( - label, - net.NextScopedBlob('cast_label'), - to=schema.data_type_for_dtype( - self.input_record.prediction.field_type() - ) - ) - - const_delta = net.ConstantFill( - label, - net.NextScopedBlob("delta"), - value=self._delta, - dtype=core.DataType.FLOAT, - ) - - label = net.StopGradient( - label, - net.NextScopedBlob('stopped_label') - ) - - const_delta = net.StopGradient( - const_delta, - net.NextScopedBlob('stopped_delta') - ) - - # abs_error = np.abs(true - pred) - abs_error = net.L1Distance( - [label, prediction], net.NextScopedBlob("abs_error") - ) - - # 
quadratic = 0.5*min(abs_error, delta)^2, linear = delta*max(abs_error-delta, 0) - min_error = net.Min( - [abs_error, const_delta], net.NextScopedBlob("min_error_delta") - ) - - quadratic_term = net.Scale( - net.Sqr(min_error), scale=float(0.5) - ) - - linear_term = net.Mul( - [ - net.Sub([abs_error, min_error]), - const_delta, - ], - net.NextScopedBlob("huber_linear_term") - ) - - # huber = 0.5 * min(abs_error, delta)^2 + delta * max(abs_error-delta, 0) - huber_dist = net.Add( - [quadratic_term, linear_term], net.NextScopedBlob("huber_dist") - ) - - if 'weight' in self.input_record.fields: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - weight_blob = net.StopGradient( - [weight_blob], - [net.NextScopedBlob('weight_stop_gradient')], - ) - huber_dist = net.Mul( - [huber_dist, weight_blob], - net.NextScopedBlob("weighted_huber_distance"), - ) - - net.AveragedLoss(huber_dist, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/batch_lr_loss.py b/caffe2/python/layers/batch_lr_loss.py deleted file mode 100644 index 05d900325119..000000000000 --- a/caffe2/python/layers/batch_lr_loss.py +++ /dev/null @@ -1,332 +0,0 @@ -## @package batch_lr_loss -# Module caffe2.python.layers.batch_lr_loss - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class BatchLRLoss(ModelLayer): - def __init__( - self, - model, - input_record, - name='batch_lr_loss', - average_loss=True, - jsd_weight=0.0, - pos_label_target=1.0, - neg_label_target=0.0, - homotopy_weighting=False, - log_D_trick=False, - unjoined_lr_loss=False, - uncertainty_penalty=1.0, - focal_gamma=0.0, - stop_grad_in_focal_factor=False, - task_gamma=1.0, - task_gamma_lb=0.1, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - self.average_loss = average_loss - - assert (schema.is_schema_subset( - schema.Struct( - ('label', schema.Scalar()), - ('logit', schema.Scalar()) - ), - input_record - )) - - self.jsd_fuse = False - assert jsd_weight >= 0 and jsd_weight <= 1 - if jsd_weight > 0 or homotopy_weighting: - assert 'prediction' in input_record - self.init_weight(jsd_weight, homotopy_weighting) - self.jsd_fuse = True - self.homotopy_weighting = homotopy_weighting - - assert pos_label_target <= 1 and pos_label_target >= 0 - assert neg_label_target <= 1 and neg_label_target >= 0 - assert pos_label_target >= neg_label_target - self.pos_label_target = pos_label_target - self.neg_label_target = neg_label_target - - assert not (log_D_trick and unjoined_lr_loss) - self.log_D_trick = log_D_trick - self.unjoined_lr_loss = unjoined_lr_loss - assert uncertainty_penalty >= 0 - self.uncertainty_penalty = uncertainty_penalty - - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output') - ) - - self.focal_gamma = focal_gamma - self.stop_grad_in_focal_factor = stop_grad_in_focal_factor - - self.apply_exp_decay = False - if task_gamma < 1.0: - self.apply_exp_decay = True - self.task_gamma_cur = self.create_param( - param_name=('%s_task_gamma_cur' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': 1.0, - 'dtype': core.DataType.FLOAT - } - ), - optimizer=self.model.NoOptim, - ) - - self.task_gamma = self.create_param( - 
param_name=('%s_task_gamma' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': task_gamma, - 'dtype': core.DataType.FLOAT - } - ), - optimizer=self.model.NoOptim, - ) - - self.task_gamma_lb = self.create_param( - param_name=('%s_task_gamma_lb' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': task_gamma_lb, - 'dtype': core.DataType.FLOAT - } - ), - optimizer=self.model.NoOptim, - ) - - def init_weight(self, jsd_weight, homotopy_weighting): - if homotopy_weighting: - self.mutex = self.create_param( - param_name=('%s_mutex' % self.name), - shape=None, - initializer=('CreateMutex', ), - optimizer=self.model.NoOptim, - ) - self.counter = self.create_param( - param_name=('%s_counter' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': 0, - 'dtype': core.DataType.INT64 - } - ), - optimizer=self.model.NoOptim, - ) - self.xent_weight = self.create_param( - param_name=('%s_xent_weight' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': 1., - 'dtype': core.DataType.FLOAT - } - ), - optimizer=self.model.NoOptim, - ) - self.jsd_weight = self.create_param( - param_name=('%s_jsd_weight' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': 0., - 'dtype': core.DataType.FLOAT - } - ), - optimizer=self.model.NoOptim, - ) - else: - self.jsd_weight = self.model.add_global_constant( - '%s_jsd_weight' % self.name, jsd_weight - ) - self.xent_weight = self.model.add_global_constant( - '%s_xent_weight' % self.name, 1. - jsd_weight - ) - - def update_weight(self, net): - net.AtomicIter([self.mutex, self.counter], [self.counter]) - # iter = 0: lr = 1; - # iter = 1e6; lr = 0.5^0.1 = 0.93 - # iter = 1e9; lr = 1e-3^0.1 = 0.50 - net.LearningRate([self.counter], [self.xent_weight], base_lr=1.0, - policy='inv', gamma=1e-6, power=0.1,) - net.Sub( - [self.model.global_constants['ONE'], self.xent_weight], - [self.jsd_weight] - ) - return self.xent_weight, self.jsd_weight - - def add_ops(self, net): - # numerically stable log-softmax with crossentropy - label = self.input_record.label() - # mandatory cast to float32 - # self.input_record.label.field_type().base is np.float32 but - # label type is actually int - label = net.Cast( - label, - net.NextScopedBlob('label_float32'), - to=core.DataType.FLOAT) - label = net.ExpandDims(label, net.NextScopedBlob('expanded_label'), - dims=[1]) - if self.pos_label_target != 1.0 or self.neg_label_target != 0.0: - label = net.StumpFunc( - label, - net.NextScopedBlob('smoothed_label'), - threshold=0.5, - low_value=self.neg_label_target, - high_value=self.pos_label_target, - ) - xent = net.SigmoidCrossEntropyWithLogits( - [self.input_record.logit(), label], - net.NextScopedBlob('cross_entropy'), - log_D_trick=self.log_D_trick, - unjoined_lr_loss=self.unjoined_lr_loss - ) - - if self.focal_gamma != 0: - label = net.StopGradient( - [label], - [net.NextScopedBlob('label_stop_gradient')], - ) - - prediction = self.input_record.prediction() - # focal loss = (y(1-p) + p(1-y))^gamma * original LR loss - # y(1-p) + p(1-y) = y + p - 2 * yp - y_plus_p = net.Add( - [prediction, label], - net.NextScopedBlob("y_plus_p"), - ) - yp = net.Mul([prediction, label], net.NextScopedBlob("yp")) - two_yp = net.Scale(yp, net.NextScopedBlob("two_yp"), scale=2.0) - y_plus_p_sub_two_yp = net.Sub( - [y_plus_p, two_yp], net.NextScopedBlob("y_plus_p_sub_two_yp") - ) - focal_factor = net.Pow( - y_plus_p_sub_two_yp, - net.NextScopedBlob("y_plus_p_sub_two_yp_power"), - exponent=float(self.focal_gamma), - ) - if 
self.stop_grad_in_focal_factor is True: - focal_factor = net.StopGradient( - [focal_factor], - [net.NextScopedBlob("focal_factor_stop_gradient")], - ) - xent = net.Mul( - [xent, focal_factor], net.NextScopedBlob("focallossxent") - ) - - if self.apply_exp_decay: - net.Mul( - [self.task_gamma_cur, self.task_gamma], - self.task_gamma_cur - ) - - task_gamma_multiplier = net.Max( - [self.task_gamma_cur, self.task_gamma_lb], - net.NextScopedBlob("task_gamma_cur_multiplier") - ) - - xent = net.Mul( - [xent, task_gamma_multiplier], net.NextScopedBlob("expdecayxent") - ) - - # fuse with JSD - if self.jsd_fuse: - jsd = net.BernoulliJSD( - [self.input_record.prediction(), label], - net.NextScopedBlob('jsd'), - ) - if self.homotopy_weighting: - self.update_weight(net) - loss = net.WeightedSum( - [xent, self.xent_weight, jsd, self.jsd_weight], - net.NextScopedBlob('loss'), - ) - else: - loss = xent - - if 'log_variance' in self.input_record.fields: - # mean (0.5 * exp(-s) * loss + 0.5 * penalty * s) - log_variance_blob = self.input_record.log_variance() - - log_variance_blob = net.ExpandDims( - log_variance_blob, net.NextScopedBlob('expanded_log_variance'), - dims=[1] - ) - - neg_log_variance_blob = net.Negative( - [log_variance_blob], - net.NextScopedBlob('neg_log_variance') - ) - - # enforce less than 88 to avoid OverflowError - neg_log_variance_blob = net.Clip( - [neg_log_variance_blob], - net.NextScopedBlob('clipped_neg_log_variance'), - max=88.0 - ) - - exp_neg_log_variance_blob = net.Exp( - [neg_log_variance_blob], - net.NextScopedBlob('exp_neg_log_variance') - ) - - exp_neg_log_variance_loss_blob = net.Mul( - [exp_neg_log_variance_blob, loss], - net.NextScopedBlob('exp_neg_log_variance_loss') - ) - - penalized_uncertainty = net.Scale( - log_variance_blob, net.NextScopedBlob("penalized_unceratinty"), - scale=float(self.uncertainty_penalty) - ) - - loss_2x = net.Add( - [exp_neg_log_variance_loss_blob, penalized_uncertainty], - net.NextScopedBlob('loss') - ) - loss = net.Scale(loss_2x, net.NextScopedBlob("loss"), scale=0.5) - - if 'weight' in self.input_record.fields: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - weight_blob = net.StopGradient( - [weight_blob], - [net.NextScopedBlob('weight_stop_gradient')], - ) - loss = net.Mul( - [loss, weight_blob], - net.NextScopedBlob('weighted_cross_entropy'), - ) - - if self.average_loss: - net.AveragedLoss(loss, self.output_schema.field_blobs()) - else: - net.ReduceFrontSum(loss, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/batch_mse_loss.py b/caffe2/python/layers/batch_mse_loss.py deleted file mode 100644 index 70c73aed497a..000000000000 --- a/caffe2/python/layers/batch_mse_loss.py +++ /dev/null @@ -1,79 +0,0 @@ -## @package batch_mse_loss -# Module caffe2.python.layers.batch_mse_loss - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class BatchMSELoss(ModelLayer): - - def __init__(self, model, input_record, name='batch_mse_loss', **kwargs): - super().__init__(model, name, input_record, **kwargs) - - assert schema.is_schema_subset( - schema.Struct( - ('label', schema.Scalar()), - ('prediction', schema.Scalar()) - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - np.float32, - 
self.get_next_blob_reference('output')) - - def add_ops(self, net): - prediction = self.input_record.prediction() - label = self.input_record.label.field_blobs() - if self.input_record.label.field_type().base != ( - self.input_record.prediction.field_type().base): - - label = net.Cast( - label, - net.NextScopedBlob('cast_label'), - to=schema.data_type_for_dtype( - self.input_record.prediction.field_type() - ) - ) - - label = net.ExpandDims(label, 1, dims=[1]) - - label = net.StopGradient( - label, - net.NextScopedBlob('stopped_label') - ) - - l2dist = net.SquaredL2Distance( - [label, prediction], - net.NextScopedBlob('l2') - ) - - if 'weight' in self.input_record.fields: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - weight_blob = net.StopGradient( - [weight_blob], - [net.NextScopedBlob('weight_stop_gradient')], - ) - l2dist = net.Mul( - [l2dist, weight_blob], - net.NextScopedBlob('weighted_l2_distance'), - ) - - net.AveragedLoss(l2dist, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/batch_normalization.py b/caffe2/python/layers/batch_normalization.py deleted file mode 100644 index 0de3e6a62455..000000000000 --- a/caffe2/python/layers/batch_normalization.py +++ /dev/null @@ -1,107 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer - -import numpy as np - - -class BatchNormalization(ModelLayer): - def __init__( - self, - model, - input_record, - name='batch_normalization', - scale_optim=None, - bias_optim=None, - momentum=0.9, - order='NCHW', - scale_init_value=1.0, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert isinstance(input_record, schema.Scalar), "Incorrect input type" - - self.input_shape = input_record.field_type().shape - - if len(self.input_shape) == 3: - if order == "NCHW": - input_dims = self.input_shape[0] - elif order == "NHWC": - input_dims = self.input_shape[2] - else: - raise ValueError("Please specify a correct order") - else: - assert len(self.input_shape) == 1, ( - "This layer supports only 4D or 2D tensors") - input_dims = self.input_shape[0] - - self.output_schema = schema.Scalar( - (np.float32, self.input_shape), - self.get_next_blob_reference('output') - ) - - self.momentum = momentum - self.order = order - - self.scale = self.create_param(param_name='scale', - shape=[input_dims], - initializer=('ConstantFill', {'value': scale_init_value}), - optimizer=scale_optim) - self.bias = self.create_param(param_name='bias', - shape=[input_dims], - initializer=('ConstantFill', {'value': 0.0}), - optimizer=bias_optim) - self.rm = self.create_param(param_name='running_mean', - shape=[input_dims], - initializer=('ConstantFill', {'value': 0.0}), - optimizer=model.NoOptim) - self.riv = self.create_param(param_name='running_inv_var', - shape=[input_dims], - initializer=('ConstantFill', {'value': 1.0}), - optimizer=model.NoOptim) - - def _add_ops(self, net, is_test, out_blob=None): - original_input_blob = self.input_record.field_blobs() - input_blob = net.NextScopedBlob('expand_input') - if len(self.input_shape) == 1: - input_blob = net.ExpandDims(original_input_blob, - dims=[2, 3]) - else: - input_blob = original_input_blob[0] - - if out_blob is None: - bn_output = self.output_schema.field_blobs() - else: - bn_output = out_blob - if is_test: - output_blobs = bn_output - else: - output_blobs = bn_output + [self.rm, 
self.riv, - net.NextScopedBlob('bn_saved_mean'), - net.NextScopedBlob('bn_saved_iv')] - - net.SpatialBN([input_blob, self.scale, - self.bias, self.rm, self.riv], - output_blobs, - momentum=self.momentum, - is_test=is_test, - order=self.order) - - if len(self.input_shape) == 1: - net.Squeeze(bn_output, - bn_output, - dims=[2, 3]) - - def add_train_ops(self, net): - self._add_ops(net, is_test=False) - - def add_eval_ops(self, net): - self._add_ops(net, is_test=True) - - def add_ops(self, net): - self.add_eval_ops(net) diff --git a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py b/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py deleted file mode 100644 index 8500dcddb84c..000000000000 --- a/caffe2/python/layers/batch_sigmoid_cross_entropy_loss.py +++ /dev/null @@ -1,48 +0,0 @@ -## @package batch_sigmoid_cross_entropy_loss -# Module caffe2.python.layers.batch_sigmoid_cross_entropy_loss - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -from caffe2.python.layers.tags import Tags -import numpy as np - - -class BatchSigmoidCrossEntropyLoss(ModelLayer): - def __init__( - self, - model, - input_record, - name='batch_sigmoid_cross_entropy_loss', - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert schema.is_schema_subset( - schema.Struct( - ('label', schema.Scalar(np.float32)), - ('prediction', schema.Scalar(np.float32)), - ), - input_record - ) - assert input_record.prediction.field_type().shape == \ - input_record.label.field_type().shape, \ - "prediction and label must have the same shape" - - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - - self.output_schema = schema.Scalar( - (np.float32, tuple()), self.get_next_blob_reference('loss') - ) - - def add_ops(self, net): - sigmoid_cross_entropy = net.SigmoidCrossEntropyWithLogits( - [self.input_record.prediction(), self.input_record.label()], - net.NextScopedBlob('sigmoid_cross_entropy') - ) - - net.AveragedLoss( - sigmoid_cross_entropy, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/batch_softmax_loss.py b/caffe2/python/layers/batch_softmax_loss.py deleted file mode 100644 index a2b718d81564..000000000000 --- a/caffe2/python/layers/batch_softmax_loss.py +++ /dev/null @@ -1,127 +0,0 @@ -## @package batch_softmax_loss -# Module caffe2.python.layers.batch_softmax_loss - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer -import numpy as np - - -class BatchSoftmaxLoss(ModelLayer): - def __init__( - self, - model, - input_record, - name='batch_softmax_loss', - label_smoothing_matrix=None, - label_prob=False, - scale=1.0, - average_by_batch_size=False, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert schema.is_schema_subset( - schema.Struct( - ('label', schema.Scalar()), - ('prediction', schema.Scalar()), - ), - input_record - ) - self.label_prob = label_prob - self.scale = scale - self.average_by_batch_size = average_by_batch_size - - # label smoothing matrix: a K * K matrix where K is the label - # cardinality; (i, j) element is the value of for label i - # treated/smoothed as label j - self.label_smoothing_matrix = label_smoothing_matrix - if self.label_smoothing_matrix is not None: - self.initialize_label_smoothing_constants() - - self.output_schema = schema.Struct( - ( - 'softmax', schema.Scalar( - input_record.prediction.field_type(), - self.get_next_blob_reference('softmax') - ) - ), - ( - 'loss', schema.Scalar( - np.float32, 
self.get_next_blob_reference('loss') - ) - ), - ) - - def initialize_label_smoothing_constants(self): - assert self.label_smoothing_matrix is not None - self.label_smoothing_matrix = np.array( - self.label_smoothing_matrix).astype(np.float32) - assert len(self.label_smoothing_matrix.shape) == 2 - label_dim = self.label_smoothing_matrix.shape[0] - assert label_dim == self.label_smoothing_matrix.shape[1] - - self.label_smoothing_matrix = self.model.add_global_constant( - '%s_label_smoothing_matrix' % self.name, - array=self.label_smoothing_matrix, - dtype=np.dtype(np.float32), - ) - self.label_dim = self.model.add_global_constant( - '%s_label_dim' % self.name, - array=label_dim, - dtype=np.dtype(np.int64), - ) - # default case: label is given NOT as target distribution - # but when used in label smoothing, the label must be in probabilities - self.label_prob = True - - def compute_smoothed_label(self, net): - assert self.label_smoothing_matrix is not None - label = self.input_record.label() - original_label_type = self.input_record.label.field_type() - if original_label_type.base != np.int64: - int64_label = net.NextScopedBlob('int64_label') - net.Cast([label], [int64_label], to=core.DataType.INT64) - else: - int64_label = label - one_hot_label = net.NextScopedBlob('one_hot_label') - smoothed_label = net.NextScopedBlob('smoothed_label') - net.OneHot([int64_label, self.label_dim], [one_hot_label]) - net.MatMul([one_hot_label, self.label_smoothing_matrix], smoothed_label) - return smoothed_label - - def add_ops(self, net): - label = self.input_record.label.field_blobs() - if self.label_smoothing_matrix is not None: - label = [self.compute_smoothed_label(net)] - elif not self.label_prob: - if self.input_record.label.field_types()[0].base != np.int32: - label = [ - net.Cast(label, - net.NextScopedBlob('int32_label'), - to=core.DataType.INT32) - ] - - softmax_input = self.input_record.prediction.field_blobs() + label - - if 'weight' in self.input_record: - weight_blob = self.input_record.weight() - if self.input_record.weight.field_type().base != np.float32: - weight_blob = net.Cast( - weight_blob, - weight_blob + '_float32', - to=core.DataType.FLOAT - ) - - softmax_input += [weight_blob] - - net.SoftmaxWithLoss( - softmax_input, - self.output_schema.field_blobs(), - label_prob=self.label_prob, - scale=self.scale, - average_by_batch_size=self.average_by_batch_size, - ) diff --git a/caffe2/python/layers/blob_weighted_sum.py b/caffe2/python/layers/blob_weighted_sum.py deleted file mode 100644 index 669d4a54f0c1..000000000000 --- a/caffe2/python/layers/blob_weighted_sum.py +++ /dev/null @@ -1,73 +0,0 @@ -## @package BlobWeightedSum -# Module caffe2.python.layers.blob_weighted_sum - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer - - -class BlobWeightedSum(ModelLayer): - """ - This layer implements the weighted sum: - weighted element-wise sum of input blobs. 
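Concretely, the layer computes out = sum_i w_i * x_i over equally-shaped inputs, with each scalar w_i learned as a parameter. A minimal NumPy sketch of the forward computation (function name illustrative):

    import numpy as np

    def blob_weighted_sum(blobs, weights):
        # element-wise sum of equally-shaped arrays, each scaled by a scalar
        out = np.zeros_like(blobs[0])
        for blob, w in zip(blobs, weights):
            out = out + w * blob
        return out

    blob_weighted_sum([np.ones((2, 3)), np.full((2, 3), 2.0)], [0.5, 0.25])
    # -> 1.0 everywhere, since 0.5 * 1 + 0.25 * 2 == 1.0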
- """ - def __init__( - self, - model, - input_record, - init_weights=None, - weight_optim=None, - name='blob_weighted_sum', - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - self.blobs = self.input_record.field_blobs() - - self.num_weights = len(self.blobs) - assert self.num_weights > 1, ( - "BlobWeightedSum expects more than one input blobs" - ) - - assert len(input_record.field_types()[0].shape) > 0, ( - "BlobWeightedSum expects limited dimensions of the input tensor" - ) - - assert all( - input_record.field_types()[0].shape == input_record.field_types()[i].shape - for i in range(1, self.num_weights) - ), "Shape of input blobs should be the same shape {}".format( - input_record.field_types()[0].shape - ) - - if init_weights: - assert self.num_weights == len(init_weights), ( - "the size of init_weights should be the same as input blobs, " - "expects {}, got {}".format(self.num_weights, len(init_weights)) - ) - else: - init_weights = [1.0] * self.num_weights - - self.weights = [ - self.create_param( - param_name="w_{}".format(idx), - shape=[1], - initializer=('ConstantFill', {'value': float(init_weights[idx])}), - optimizer=weight_optim - ) for idx in range(self.num_weights) - ] - - self.output_schema = schema.Scalar( - input_record.field_types()[0], - self.get_next_blob_reference('blob_weighted_sum_out') - ) - - def add_ops(self, net): - net.WeightedSum( - [x for pair in zip(self.blobs, self.weights) for x in pair], - self.output_schema(), - grad_on_w=True, - ) diff --git a/caffe2/python/layers/bpr_loss.py b/caffe2/python/layers/bpr_loss.py deleted file mode 100644 index 5f2446404683..000000000000 --- a/caffe2/python/layers/bpr_loss.py +++ /dev/null @@ -1,49 +0,0 @@ -## @package bpr_loss -# Module caffe2.python.layers.bpr_loss - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -# ref: https://arxiv.org/pdf/1205.2618.pdf -class BPRLoss(ModelLayer): - - def __init__(self, model, input_record, name='bpr_loss', **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert schema.is_schema_subset( - schema.Struct( - ('pos_prediction', schema.Scalar()), - ('neg_prediction', schema.List(np.float32)), - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output')) - - def add_ops(self, net): - # formula: - # loss = - SUM(Ln(Sigmoid(Simlarity(u, pos) - Simlarity(u, neg)))) - neg_score = self.input_record.neg_prediction['values']() - - pos_score = net.LengthsTile( - [ - self.input_record.pos_prediction(), - self.input_record.neg_prediction['lengths']() - ], - net.NextScopedBlob('pos_score_repeated') - ) - # https://www.tensorflow.org/api_docs/python/tf/math/log_sigmoid - softplus = net.Softplus([net.Sub([neg_score, pos_score])]) - net.ReduceFrontSum(softplus, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/bucket_weighted.py b/caffe2/python/layers/bucket_weighted.py deleted file mode 100644 index c72aceaaf17d..000000000000 --- a/caffe2/python/layers/bucket_weighted.py +++ /dev/null @@ -1,73 +0,0 @@ -## @package bucket_weighted -# Module caffe2.python.layers.bucket_weighted - - - - - -import logging -import numpy as np - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ( - get_categorical_limit, - ModelLayer, -) - -from caffe2.python.layers.tags import Tags - -logger = 
logging.getLogger(__name__) - - -class BucketWeighted(ModelLayer): - def __init__(self, model, input_record, max_score=0, bucket_boundaries=None, - hash_buckets=True, weight_optim=None, name="bucket_weighted"): - super().__init__(model, name, input_record) - - assert isinstance(input_record, schema.List), "Incorrect input type" - self.bucket_boundaries = bucket_boundaries - self.hash_buckets = hash_buckets - if bucket_boundaries is not None: - self.shape = len(bucket_boundaries) + 1 - elif max_score > 0: - self.shape = max_score - else: - self.shape = get_categorical_limit(input_record) - - self.bucket_w = self.create_param(param_name='bucket_w', - shape=[self.shape, ], - initializer=('ConstantFill', {'value': 1.0}), - optimizer=weight_optim) - - self.output_schema = schema.Struct( - ('bucket_weights', - schema.Scalar((np.float32, self.shape), - self.get_next_blob_reference("bucket_w_gather"))) - ) - - self.tags.update({Tags.HANDLE_AS_SPARSE_LAYER}) - - def get_memory_usage(self): - return self.shape - - def add_ops(self, net): - if self.bucket_boundaries is not None: - buckets_int = net.Bucketize( - self.input_record.values(), - "buckets_int", - boundaries=self.bucket_boundaries - ) - else: - buckets = self.input_record.values() - buckets_int = net.Cast( - buckets, - "buckets_int", - to=core.DataType.INT32 - ) - if self.hash_buckets: - buckets_int = net.IndexHash( - buckets_int, "hashed_buckets_int", seed=0, modulo=self.shape - ) - net.Gather( - [self.bucket_w, buckets_int], - self.output_schema.bucket_weights.field_blobs()) diff --git a/caffe2/python/layers/build_index.py b/caffe2/python/layers/build_index.py deleted file mode 100644 index 2505a15f74b3..000000000000 --- a/caffe2/python/layers/build_index.py +++ /dev/null @@ -1,70 +0,0 @@ - - - - - -import numpy as np - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer - - -class MapToRange(ModelLayer): - """ - This layer aims to build a mapping from raw keys to indices within [0, max_index). - The mapping is continuously built during training. The mapping will be frozen during - evaluation and prediction. Unseen keys will be assigned to index 0. 
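A plain-Python sketch of these semantics, as a hypothetical stand-in for the underlying LongIndexCreate/IndexGet ops; it assumes index 0 is reserved for unseen keys, so live indices come from [1, max_index):

    class MapToRangeSketch:
        def __init__(self, max_index):
            self.max_index = max_index
            self.table = {}       # raw key -> index in [1, max_index)
            self.frozen = False   # True during eval/prediction

        def get(self, key):
            idx = self.table.get(key)
            if idx is not None:
                return idx
            if self.frozen or len(self.table) + 1 >= self.max_index:
                return 0  # unseen key
            self.table[key] = len(self.table) + 1
            return self.table[key]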
- """ - - def __init__( - self, model, - input_record, - max_index, - name='map_to_range', - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert max_index > 0 - assert isinstance(input_record, schema.Scalar) - - self.max_index = max_index - - self.handler = self.create_param( - param_name='handler', - shape=[], - initializer=('LongIndexCreate', {'max_elements': self.max_index}), - optimizer=model.NoOptim - ) - - self.output_schema = schema.Struct( - ('indices', schema.Scalar( - np.int64, self.get_next_blob_reference("indices") - )), - ('handler', schema.Scalar( - np.void, self.handler - )), - ) - - def add_train_ops(self, net): - if self.input_record.field_type().base != np.int64: - keys = net.Cast( - self.input_record(), - net.NextScopedBlob("indices_before_mapping"), - to=core.DataType.INT64 - ) - else: - keys = self.input_record() - - # Load keys into indices - indices = net.IndexGet([self.handler, keys], - self.output_schema.indices()) - - net.StopGradient(indices, indices) - - def add_eval_ops(self, net): - net.IndexFreeze(self.handler, self.handler) - self.add_train_ops(net) - - def add_ops(self, net): - self.add_eval_ops(net) diff --git a/caffe2/python/layers/concat.py b/caffe2/python/layers/concat.py deleted file mode 100644 index f7dabe7fd608..000000000000 --- a/caffe2/python/layers/concat.py +++ /dev/null @@ -1,136 +0,0 @@ -## @package concat -# Module caffe2.python.layers.concat - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - ModelLayer, -) -import numpy as np -from collections import defaultdict - -import logging -logger = logging.getLogger(__name__) - - -def get_concatenated_feature_to_index(blobs_to_concat): - concat_feature_to_index = defaultdict(list) - start_pos = 0 - for scalar in blobs_to_concat: - num_dims = scalar.dtype.shape[0] - if hasattr(scalar, 'metadata') \ - and hasattr(scalar.metadata, 'feature_specs') \ - and hasattr(scalar.metadata.feature_specs, 'feature_to_index') \ - and isinstance(scalar.metadata.feature_specs.feature_to_index, dict): # noqa B950 - for k, v in scalar.metadata.feature_specs.feature_to_index.items(): - concat_feature_to_index[k].extend([start_pos + vi for vi in v]) - start_pos += num_dims - return dict(concat_feature_to_index) if concat_feature_to_index.keys() else None - - -class Concat(ModelLayer): - """ - Construct Concat layer - Assume that first dimension is batch, - - Example: - - embedding_dim = 64 - input_record = self.new_record(schema.Struct( - ('input1', schema.Scalar((np.float32, (embedding_dim, )))), - ('input2', schema.Scalar((np.float32, (embedding_dim, )))), - ('input3', schema.Scalar((np.float32, (embedding_dim, )))), - )) - - output = self.model.Concat(input_record) - self.assertEqual( - schema.Scalar((np.float32, ((len(input_record.fields) * embedding_dim, )))), - output - ) - - # Note that in Concat layer we assume first dimension is batch. 
- # so input is B * embedding_dim - # add_axis=1 make it B * 1 * embedding_dim - # Concat on axis=1 make it B * N * embedding_dim - - output = self.model.Concat(input_record, axis=1, add_axis=1) - self.assertEqual( - schema.Scalar((np.float32, ((len(input_record.fields), embedding_dim)))), - output - ) - """ - - def __init__(self, model, input_record, axis=1, add_axis=0, - name='concat', **kwargs): - super().__init__(model, name, input_record, **kwargs) - self.axis = axis - self.add_axis = add_axis - assert not (axis == 0 and add_axis == 1), \ - "It's not allowed to add axis=0" - assert isinstance(input_record, schema.Struct),\ - "Incorrect input type. Expected Struct, but received: {0}".\ - format(input_record) - - shapes = [] - for field_name, field_type in input_record.fields.items(): - assert isinstance(field_type, schema.Scalar),\ - "Incorrect input type for {}. Expected Scalar, but got: {}".\ - format(field_name, field_type) - # Assume that first dimension is batch, so actual axis in shape is - # axis - 1 - shape = list(field_type.field_type().shape) - if add_axis: - shape.insert(axis - 1, 1) - assert len(shape) >= axis,\ - "Concat expects that limited dimensions of the input tensor" - shapes.append(shape) - logger.info('Concat Layer input shapes: ' + str(shapes)) - - if axis == 0: - self.output_schema = schema.from_blob_list( - input_record[0], - [self.get_next_blob_reference('output')] - ) - return - - concat_dim = 0 - for shape in shapes: - concat_dim += shape[axis - 1] - shape[axis - 1] = 0 - assert shape == shapes[0],\ - "Shapes {0} and {1} are not compatible for Concat".\ - format(shape, shapes[0]) - output_dims = shapes[0] - output_dims[axis - 1] = concat_dim - - logger.info('Concat Layer output_dims: ' + str(output_dims)) - self.output_schema = schema.Scalar( - (np.float32, output_dims), - self.get_next_blob_reference('output')) - - record_to_concat = input_record.fields.values() - concated_feature_to_index = get_concatenated_feature_to_index( - record_to_concat - ) - if concated_feature_to_index: - metadata = schema.Metadata( - feature_specs=schema.FeatureSpec( - feature_to_index=concated_feature_to_index - ) - ) - self.output_schema.set_metadata(metadata) - - - def add_ops(self, net): - net.Concat( - self.input_record.field_blobs(), - [ - self.output_schema.field_blobs()[0], - self.output_schema.field_blobs()[0] + "_concat_dims" - ], - axis=self.axis, - add_axis=self.add_axis, - ) diff --git a/caffe2/python/layers/constant_weight.py b/caffe2/python/layers/constant_weight.py deleted file mode 100644 index d160ed8206b3..000000000000 --- a/caffe2/python/layers/constant_weight.py +++ /dev/null @@ -1,44 +0,0 @@ -# @package constant_weight -# Module caffe2.fb.python.layers.constant_weight - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -import numpy as np - - -class ConstantWeight(ModelLayer): - def __init__( - self, - model, - input_record, - weights=None, - name='constant_weight', - **kwargs - ): - super(ConstantWeight, - self).__init__(model, name, input_record, **kwargs) - self.output_schema = schema.Scalar( - np.float32, self.get_next_blob_reference('constant_weight') - ) - self.data = self.input_record.field_blobs() - self.num = len(self.data) - weights = ( - weights if weights is not None else - [1. 
/ self.num for _ in range(self.num)]
-        )
-        assert len(weights) == self.num
-        self.weights = [
-            self.model.add_global_constant(
-                '%s_weight_%d' % (self.name, i), float(weights[i])
-            ) for i in range(self.num)
-        ]
-
-    def add_ops(self, net):
-        net.WeightedSum(
-            [b for x_w_pair in zip(self.data, self.weights) for b in x_w_pair],
-            self.output_schema()
-        )
diff --git a/caffe2/python/layers/conv.py b/caffe2/python/layers/conv.py
deleted file mode 100644
index 6b7e15fe9041..000000000000
--- a/caffe2/python/layers/conv.py
+++ /dev/null
@@ -1,135 +0,0 @@
-## @package conv
-# Module caffe2.python.layers.conv
-
-
-
-
-
-from caffe2.python import schema
-from caffe2.python.layers.layers import (
-    ModelLayer,
-)
-import numpy as np
-
-
-class Conv(ModelLayer):
-    """
-    Convolutional layer
-    Input:
-        - input_record: at least has the shape info of C (num_channels)
-        - output_dim: number of convolutional filters
-        - kernel_h, kernel_w: kernel size for h and w
-        - stride_h, stride_w: stride for h and w
-        - pad_b, pad_l, pad_r, pad_t: padding sizes; if stride == 1, a
-        'None' value will do auto padding
-        - order: either 'NHWC' or 'NCHW'
-    """
-
-    def __init__(self, model, input_record, output_dim, kernel_h, kernel_w,
-                 stride_h, stride_w, pad_b=None, pad_l=None, pad_r=None,
-                 pad_t=None, order='NHWC', kernel_init=None, bias_init=None,
-                 kernel_optim=None, bias_optim=None,
-                 name='conv', **kwargs):
-
-        super().__init__(model, name, input_record, **kwargs)
-        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
-        # input num_channels (C) is needed
-        input_dims = input_record.field_type().shape
-
-        assert (kernel_h > 0 and isinstance(kernel_h, int)), (
-            "kernel_h should be a positive integer")
-        assert (kernel_w > 0 and isinstance(kernel_w, int)), (
-            "kernel_w should be a positive integer")
-        self.kernel_h = kernel_h
-        self.kernel_w = kernel_w
-
-        assert (stride_h > 0 and isinstance(stride_h, int)), (
-            "stride_h should be a positive integer")
-        assert (stride_w > 0 and isinstance(stride_w, int)), (
-            "stride_w should be a positive integer")
-        self.stride_h = stride_h
-        self.stride_w = stride_w
-
-        # output_dim calculation (http://cs231n.github.io/convolutional-networks/)
-        # output_dim_w = (input_dim_w - kernel_w + pad_r + pad_l) / stride_w + 1
-        # so auto padding requires
-        # pad_r, pad_l = [(input_dim_w - 1) * stride_w - input_dim_w + kernel_w] / 2
-        # similarly for pad_t and pad_b to auto-pad kernel_h
-        # here we only do auto padding for the stride = 1 case
-        if stride_h == 1:
-            pad_t = int((kernel_h - 1) / 2) if pad_t is None else pad_t
-            pad_b = int((kernel_h - 1) / 2) if pad_b is None else pad_b
-        else:
-            pad_t = 0 if pad_t is None else pad_t
-            pad_b = 0 if pad_b is None else pad_b
-
-        if stride_w == 1:
-            pad_r = int((kernel_w - 1) / 2) if pad_r is None else pad_r
-            pad_l = int((kernel_w - 1) / 2) if pad_l is None else pad_l
-        else:
-            pad_r = 0 if pad_r is None else pad_r
-            pad_l = 0 if pad_l is None else pad_l
-
-        assert (pad_t >= 0 and isinstance(pad_t, int)), "pad_t should be int >= 0"
-        assert (pad_b >= 0 and isinstance(pad_b, int)), "pad_b should be int >= 0"
-        assert (pad_r >= 0 and isinstance(pad_r, int)), "pad_r should be int >= 0"
-        assert (pad_l >= 0 and isinstance(pad_l, int)), "pad_l should be int >= 0"
-        self.pad_t = pad_t
-        self.pad_b = pad_b
-        self.pad_r = pad_r
-        self.pad_l = pad_l
-
-        assert order in ['NHWC', 'NCHW'], "order should be either 'NHWC' or 'NCHW'"
-        self.order = order
-
-        if order == 'NHWC':
-            input_c = input_dims[-1]
-            kernel_shape = [output_dim, kernel_h,
kernel_w, input_c] - elif order == 'NCHW': - input_c = input_dims[0] - kernel_shape = [output_dim, input_c, kernel_h, kernel_w] - assert input_c > 0, ( - "Number of input channels in conv parameters should be positive") - - kernel_init = kernel_init if kernel_init else ( - 'XavierFill', {} - ) - bias_init = bias_init if bias_init else ( - 'ConstantFill', {'value': 0.0} - ) - - self.kernel = self.create_param( - param_name='conv_kernel', - shape=kernel_shape, - initializer=kernel_init, - optimizer=kernel_optim, - ) - - self.bias = self.create_param( - param_name='conv_bias', - shape=[output_dim], - initializer=bias_init, - optimizer=bias_optim, - ) - - # the output_schema only has the num of output channels - # output_h and output_w would be inferred internally - self.output_schema = schema.Scalar( - (np.float32, (output_dim,)), - self.get_next_blob_reference('output') - ) - - def add_ops(self, net): - net.Conv( - self.input_record.field_blobs() + [self.kernel, self.bias], - self.output_schema.field_blobs(), - kernel_h=self.kernel_h, - kernel_w=self.kernel_w, - stride_h=self.stride_h, - stride_w=self.stride_w, - pad_t=self.pad_t, - pad_l=self.pad_l, - pad_b=self.pad_b, - pad_r=self.pad_r, - order=self.order - ) diff --git a/caffe2/python/layers/dropout.py b/caffe2/python/layers/dropout.py deleted file mode 100644 index 27d3c91039cc..000000000000 --- a/caffe2/python/layers/dropout.py +++ /dev/null @@ -1,50 +0,0 @@ -# Module caffe2.python.layers.dropout - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer - - -class Dropout(ModelLayer): - - def __init__( - self, - model, - input_record, - name='dropout', - ratio=0.5, - dropout_for_eval=False, - **kwargs): - - super().__init__(model, name, input_record, **kwargs) - assert isinstance(input_record, schema.Scalar), "Incorrect input type" - assert (ratio >= 0 and ratio < 1.0), \ - "Expected 0 <= ratio < 1, but got ratio of %s" % ratio - - self.output_schema = input_record.clone_schema() - self.output_schema.set_value(self.get_next_blob_reference('output')) - self.dropout_for_eval = dropout_for_eval - - self.ratio = ratio - - def _add_ops(self, net, is_test): - input_blob = self.input_record.field_blobs() - output_blobs = self.output_schema.field_blobs() \ - + [net.NextScopedBlob('d_mask')] - - net.Dropout(input_blob, - output_blobs, - ratio=self.ratio, - is_test=is_test) - - def add_train_ops(self, net): - self._add_ops(net, is_test=False) - - def add_eval_ops(self, net): - self._add_ops(net, is_test=(not self.dropout_for_eval)) - - def add_ops(self, net): - self.add_eval_ops(net) diff --git a/caffe2/python/layers/fc.py b/caffe2/python/layers/fc.py deleted file mode 100644 index a67240a9cd77..000000000000 --- a/caffe2/python/layers/fc.py +++ /dev/null @@ -1,230 +0,0 @@ -## @package fc -# Module caffe2.python.layers.fc - - - - - -from caffe2.python.helpers.arg_scope import get_current_scope -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -from caffe2.python.layers.sampling_trainable_mixin import SamplingTrainableMixin -import math -import numpy as np - - -def get_fc_predictor_version(fc_version): - assert fc_version in ["fp32", "fp16"], ( - "Only support fp32 and fp16 for the fully connected layer " - "in the predictor net, the provided FC precision is {}".format(fc_version) - ) - return fc_version - - -class FC(SamplingTrainableMixin, ModelLayer): - - def __init__(self, model, input_record, output_dims, weight_init=None, - bias_init=None, weight_optim=None, 
bias_optim=None, name='fc', - weight_reg=None, bias_reg=None, clip_param=None, - max_fc_size=None, axis=1, transposed=False, - uniform_weight_init_scale_numerator=1.0, - **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert isinstance(input_record, schema.Scalar), ( - "Incorrect input type {}".format(input_record)) - assert len(input_record.field_types()[0].shape) > 0, ( - "FC expects limited dimensions of the input tensor") - assert axis >= 1, "axis {} should >= 1.".format(axis) - self.axis = axis - input_dims = np.prod(input_record.field_types()[0].shape[axis - 1:]) - - assert input_dims > 0, ( - "FC expects input dimensions > 0, got {}".format(input_dims)) - - self.clip_args = None - if (clip_param is not None): - assert len(clip_param) == 2, ( - 'clip_param must be a tuple / list ' - 'of length 2 and in the form of (clip_min, clip max)' - ) - clip_min, clip_max = clip_param - assert clip_min is not None or clip_max is not None, ( - 'clip_min, and clip_max in clip_param cannot both be None' - ) - assert ( - (clip_min is None or clip_max is None) or clip_min < clip_max - ), ( - 'clip_param = [clip_min, clip_max] must have clip_min < clip_max' - ) - self.clip_args = {} - if clip_min is not None: - self.clip_args['min'] = clip_min - if clip_max is not None: - self.clip_args['max'] = clip_max - - if uniform_weight_init_scale_numerator is None: - uniform_weight_init_scale_numerator = 1.0 - - scale = math.sqrt(uniform_weight_init_scale_numerator / input_dims) - weight_init = weight_init if weight_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) - bias_init = bias_init if bias_init else ( - 'UniformFill', {'min': -scale, 'max': scale}) - - self.output_dim_vec = FC.calculate_fc_output_dims( - max_fc_size, input_dims, output_dims) - - self.transposed = transposed - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - weight_shape = [input_dims, output_dims] if transposed else [output_dims, input_dims] - self.w = self.create_param(param_name='w', - shape=weight_shape, - initializer=weight_init, - optimizer=weight_optim, - regularizer=weight_reg) - - self.b = self.create_param(param_name='b', - shape=[output_dims, ], - initializer=bias_init, - optimizer=bias_optim, - regularizer=bias_reg) - else: - self.w_vec = [] - self.b_vec = [] - - for idx, output_dim in enumerate(self.output_dim_vec): - weight_shape = [input_dims, output_dim] if transposed else [output_dim, input_dims] - self.w_vec.append(self.create_param(param_name='w_sub_{}'.format(idx), - shape=weight_shape, - initializer=weight_init, - optimizer=weight_optim, - regularizer=weight_reg)) - - self.b_vec.append(self.create_param(param_name='b_sub_{}'.format(idx), - shape=[output_dim, ], - initializer=weight_init, - optimizer=weight_optim, - regularizer=weight_reg)) - if axis == 1: - output_shape = (output_dims, ) - else: - output_shape = list(input_record.field_types()[0].shape)[0: axis - 1] - output_shape = tuple(output_shape + [output_dims]) - - self.output_schema = schema.Scalar( - (np.float32, output_shape), - self.get_next_blob_reference('output') - ) - - @staticmethod - def calculate_fc_output_dims(max_fc_size, input_dim, output_dim): - - if not max_fc_size or max_fc_size < 0: - return None - - assert max_fc_size >= input_dim, "Currently we split along the output " \ - "dimension. So we need max_fc_size >= input_dim. 
But, max_fc_size: " \ - "{}, input_dim: {}".format(max_fc_size, input_dim) - - output_dim_allowed = int(np.floor(max_fc_size / input_dim)) - num_fc = int(np.floor((output_dim - 1) / output_dim_allowed) + 1) - - output_dim_vec = [output_dim_allowed] * (num_fc - 1) - - output_dim_vec.append(output_dim - sum(output_dim_vec)) - - return output_dim_vec - - def _insert_fc_ops(self, net, params, outputs, version): - """ - Args: - net: the caffe2 net to insert operator - params: weight and bias for FC - outputs: the output blobs - version: support fp32 and fp16 for now. - """ - if version == "fp32": - if self.transposed: - return net.FCTransposed( - self.input_record.field_blobs() + params, - outputs, - axis=self.axis, - **self.kwargs - ) - else: - return net.FC( - self.input_record.field_blobs() + params, - outputs, - axis=self.axis, - **self.kwargs - ) - elif version == "fp16": - return net.FbFCPacked( - self.input_record.field_blobs() + params, - outputs, - axis=self.axis, - **self.kwargs - ) - else: - raise Exception("unsupported FC type version {}".format(version)) - - def _add_ops(self, net, params, version): - """ - Args: - params : the weight and bias, - passed by either add_ops or add_train_ops function - version : fp16 or fp32, might support in8 in the future. - """ - if self.clip_args is not None: - clipped_params = [net.NextScopedBlob( - 'clipped_%s' % str(p)) for p in params] - for p, cp in zip(params, clipped_params): - net.Clip([p], [cp], **self.clip_args) - params = clipped_params - - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - self._insert_fc_ops(net, params, self.output_schema.field_blobs(), version) - else: - w_vec = params[:int(len(params) / 2)] - b_vec = params[int(len(params) / 2):] - - assert len(w_vec) == len(b_vec) - - output_blob_vec = [] - - for i in range(len(self.output_dim_vec)): - output_blob = net.NextScopedBlob( - 'output_sub_{}'.format(i)) - insert_ret = self._insert_fc_ops( - net, [w_vec[i], b_vec[i]], [output_blob], version - ) - output_blob_vec.append(insert_ret) - net.Concat(output_blob_vec, - self.output_schema.field_blobs() + - [self.output_schema.field_blobs()[0] + "_concat_dims"]) - - def add_ops(self, net): - """Both the predict net and the eval net will call this function - """ - version_info = get_current_scope().get( - get_fc_predictor_version.__name__, {'fc_version': 'fp32'} - ) - predictor_fc_fp_version = version_info['fc_version'] - self._add_ops(net, self.param_blobs, predictor_fc_fp_version) - - def add_train_ops(self, net): - # use the train_param_blobs to be consistent with the SamplingTrain unittest - self._add_ops(net, self.train_param_blobs, "fp32") - - def get_fp16_compatible_parameters(self): - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - return [self.w] - else: - return self.w_vec - - @property - def param_blobs(self): - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - return [self.w, self.b] - else: - return self.w_vec + self.b_vec diff --git a/caffe2/python/layers/fc_with_bootstrap.py b/caffe2/python/layers/fc_with_bootstrap.py deleted file mode 100644 index 75f5a41f51fe..000000000000 --- a/caffe2/python/layers/fc_with_bootstrap.py +++ /dev/null @@ -1,363 +0,0 @@ -## @package fc_with_bootstrap -# Module caffe2.python.layers.fc_with_bootstrap - - -import math - -import numpy as np -from caffe2.python import core, schema -from caffe2.python.helpers.arg_scope import get_current_scope -from caffe2.python.layers.layers import ModelLayer -from 
caffe2.python.layers.sampling_trainable_mixin import SamplingTrainableMixin - - -def get_fc_predictor_version(fc_version): - assert fc_version in ["fp32"], ( - "Only support fp32 for the fully connected layer " - "in the predictor net, the provided FC precision is {}".format(fc_version) - ) - return fc_version - - -class FCWithBootstrap(SamplingTrainableMixin, ModelLayer): - def __init__( - self, - model, - input_record, - output_dims, - num_bootstrap, - weight_init=None, - bias_init=None, - weight_optim=None, - bias_optim=None, - name="fc_with_bootstrap", - weight_reg=None, - bias_reg=None, - clip_param=None, - axis=1, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - assert isinstance( - input_record, schema.Scalar - ), "Incorrect input type {}".format(input_record) - assert ( - len(input_record.field_types()[0].shape) > 0 - ), "FC expects limited dimensions of the input tensor" - assert axis >= 1, "axis {} should >= 1.".format(axis) - self.axis = axis - input_dims = np.prod(input_record.field_types()[0].shape[axis - 1 :]) - - assert input_dims > 0, "FC expects input dimensions > 0, got {}".format( - input_dims - ) - - self.clip_args = None - - # attributes for bootstrapping below - self.num_bootstrap = num_bootstrap - - # input dim shape - self.input_dims = input_dims - - # bootstrapped fully-connected layers to be used in eval time - self.bootstrapped_FCs = [] - - # scalar containing batch_size blob so that we don't need to recompute - self.batch_size = None - - # we want this to be the last FC, so the output_dim should be 1, set to None - self.output_dim_vec = None - - # lower bound when creating random indices - self.lower_bound = None - - # upper bound when creating random indices - self.upper_bound = None - - if clip_param is not None: - assert len(clip_param) == 2, ( - "clip_param must be a tuple / list " - "of length 2 and in the form of (clip_min, clip max)" - ) - clip_min, clip_max = clip_param - assert ( - clip_min is not None or clip_max is not None - ), "clip_min, and clip_max in clip_param cannot both be None" - assert ( - clip_min is None or clip_max is None - ) or clip_min < clip_max, ( - "clip_param = [clip_min, clip_max] must have clip_min < clip_max" - ) - self.clip_args = {} - if clip_min is not None: - self.clip_args["min"] = clip_min - if clip_max is not None: - self.clip_args["max"] = clip_max - - scale = math.sqrt(1.0 / input_dims) - weight_init = ( - weight_init - if weight_init - else ("UniformFill", {"min": -scale, "max": scale}) - ) - bias_init = ( - bias_init if bias_init else ("UniformFill", {"min": -scale, "max": scale}) - ) - - """ - bootstrapped FCs: - Ex: [ - bootstrapped_weights_blob_1, bootstrapped_bias_blob_1, - ..., - ..., - bootstrapped_weights_blob_b, bootstrapped_bias_blob_b - ] - - output_schema: - Note: indices will always be on even indices. - Ex: Struct( - indices_0_blob, - preds_0_blob, - ... - ... 
-            indices_b_blob,
-            preds_b_blob
-        )
-        """
-        bootstrapped_FCs = []
-        output_schema = schema.Struct()
-        for i in range(num_bootstrap):
-            output_schema += schema.Struct(
-                (
-                    "bootstrap_iteration_{}/indices".format(i),
-                    self.get_next_blob_reference(
-                        "bootstrap_iteration_{}/indices".format(i)
-                    ),
-                ),
-                (
-                    "bootstrap_iteration_{}/preds".format(i),
-                    self.get_next_blob_reference(
-                        "bootstrap_iteration_{}/preds".format(i)
-                    ),
-                ),
-            )
-            self.bootstrapped_FCs.extend(
-                [
-                    self.create_param(
-                        param_name="bootstrap_iteration_{}/w".format(i),
-                        shape=[output_dims, input_dims],
-                        initializer=weight_init,
-                        optimizer=weight_optim,
-                        regularizer=weight_reg,
-                    ),
-                    self.create_param(
-                        param_name="bootstrap_iteration_{}/b".format(i),
-                        shape=[output_dims],
-                        initializer=bias_init,
-                        optimizer=bias_optim,
-                        regularizer=bias_reg,
-                    ),
-                ]
-            )
-
-        self.output_schema = output_schema
-
-        if axis == 1:
-            output_shape = (output_dims,)
-        else:
-            output_shape = list(input_record.field_types()[0].shape)[0 : axis - 1]
-            output_shape = tuple(output_shape + [output_dims])
-
-    def _generate_bootstrapped_indices(self, net, copied_cur_layer, iteration):
-        """
-        Args:
-            net: the caffe2 net to insert operator
-
-            copied_cur_layer: blob of the bootstrapped features (make sure this
-            blob has a stop_gradient on)
-
-            iteration: the bootstrap iteration to generate for. Used to correctly
-            populate the output_schema
-
-        Return:
-            A blob containing the generated indices of shape: (batch_size,)
-        """
-        with core.NameScope("bootstrap_iteration_{}".format(iteration)):
-            if iteration == 0:
-                # capture batch_size once for efficiency
-                input_shape = net.Shape(copied_cur_layer, "input_shape")
-                batch_size_index = net.Const(np.array([0]), "batch_size_index")
-                batch_size = net.Gather([input_shape, batch_size_index], "batch_size")
-                self.batch_size = batch_size
-
-                lower_bound = net.Const(np.array([0]), "lower_bound", dtype=np.int32)
-                offset = net.Const(np.array([1]), "offset", dtype=np.int32)
-                int_batch_size = net.Cast(
-                    [self.batch_size], "int_batch_size", to=core.DataType.INT32
-                )
-                upper_bound = net.Sub([int_batch_size, offset], "upper_bound")
-
-                self.lower_bound = lower_bound
-                self.upper_bound = upper_bound
-
-            indices = net.UniformIntFill(
-                [self.batch_size, self.lower_bound, self.upper_bound],
-                self.output_schema[iteration * 2].field_blobs()[0],
-                input_as_shape=1,
-            )
-
-            return indices
-
-    def _bootstrap_ops(self, net, copied_cur_layer, indices, iteration):
-        """
-        This method contains all the bootstrapping logic used to bootstrap
-        the features. Only used by the train_net.
-
-        Args:
-            net: the caffe2 net to insert bootstrapping operators
-
-            copied_cur_layer: the blob representing the current features.
-            Note, this layer should have a stop_gradient on it.
-
-        Returns:
-            bootstrapped_features: blob of the bootstrapped version of cur_layer
-            with the same dimensions
-        """
-
-        # draw features based upon the bootstrapped indices
-        bootstrapped_features = net.Gather(
-            [copied_cur_layer, indices],
-            net.NextScopedBlob("bootstrapped_features_{}".format(iteration)),
-        )
-
-        bootstrapped_features = schema.Scalar(
-            (np.float32, self.input_dims), bootstrapped_features
-        )
-
-        return bootstrapped_features
-
-    def _insert_fc_ops(self, net, features, params, outputs, version):
-        """
-        Args:
-            net: the caffe2 net to insert operator
-
-            features: Scalar containing blob of the bootstrapped features or
-            actual cur_layer features
-
-            params: weight and bias for FC
-
-            outputs: the output blobs
-
-            version: support fp32 for now.
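
For intuition, the resampling that `_generate_bootstrapped_indices` and `_bootstrap_ops` above assemble out of Shape/UniformIntFill/Gather operators reduces to a few lines of NumPy. This is an illustrative sketch only; the helper name is not part of the patch:

import numpy as np

def bootstrap_gather(features, rng=None):
    # features: (batch_size, input_dims) activations of the current layer.
    rng = rng if rng is not None else np.random.default_rng()
    batch_size = features.shape[0]
    # UniformIntFill over [0, batch_size - 1]: row indices drawn with
    # replacement, i.e. one bootstrap resample of the batch.
    indices = rng.integers(0, batch_size, size=batch_size)
    # Gather: select the resampled rows; the shape is unchanged.
    return indices, features[indices]

features = np.arange(12, dtype=np.float32).reshape(4, 3)
indices, resampled = bootstrap_gather(features)
assert resampled.shape == features.shape
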
- """ - - if version == "fp32": - pred_blob = net.FC( - features.field_blobs() + params, outputs, axis=self.axis, **self.kwargs - ) - return pred_blob - else: - raise Exception("unsupported FC type version {}".format(version)) - - def _add_ops(self, net, features, iteration, params, version): - """ - Args: - params: the weight and bias, passed by either add_ops or - add_train_ops function - - features: feature blobs to predict on. Can be the actual cur_layer - or the bootstrapped_feature blobs. - - version: currently fp32 support only - """ - - if self.clip_args is not None: - clipped_params = [net.NextScopedBlob("clipped_%s" % str(p)) for p in params] - for p, cp in zip(params, clipped_params): - net.Clip([p], [cp], **self.clip_args) - params = clipped_params - - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - self._insert_fc_ops( - net=net, - features=features, - params=params, - outputs=[self.output_schema.field_blobs()[(iteration * 2) + 1]], - version=version, - ) - - def add_ops(self, net): - """ - Both the predict net and the eval net will call this function. - - For bootstrapping approach, the goal is to pass the cur_layer feature - inputs through all the bootstrapped FCs that are stored under - self.bootstrapped_FCs. Return the preds in the same output_schema - with dummy indices (because they are not needed). - """ - - version_info = get_current_scope().get( - get_fc_predictor_version.__name__, {"fc_version": "fp32"} - ) - predictor_fc_fp_version = version_info["fc_version"] - - for i in range(self.num_bootstrap): - # these are dummy indices, not to be used anywhere - indices = self._generate_bootstrapped_indices( - net=net, - copied_cur_layer=self.input_record.field_blobs()[0], - iteration=i, - ) - - params = self.bootstrapped_FCs[i * 2 : (i * 2) + 2] - - self._add_ops( - net=net, - features=self.input_record, - params=params, - iteration=i, - version=predictor_fc_fp_version, - ) - - def add_train_ops(self, net): - # use the train_param_blobs to be consistent with the SamplingTrain unittest - - # obtain features - for i in range(self.num_bootstrap): - indices = self._generate_bootstrapped_indices( - net=net, - copied_cur_layer=self.input_record.field_blobs()[0], - iteration=i, - ) - bootstrapped_features = self._bootstrap_ops( - net=net, - copied_cur_layer=self.input_record.field_blobs()[0], - indices=indices, - iteration=i, - ) - self._add_ops( - net, - features=bootstrapped_features, - iteration=i, - params=self.train_param_blobs[i * 2 : (i * 2) + 2], - version="fp32", - ) - - def get_fp16_compatible_parameters(self): - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - return [ - blob for idx, blob in enumerate(self.bootstrapped_FCs) if idx % 2 == 0 - ] - - else: - raise Exception( - "Currently only supports functionality for output_dim_vec == 1" - ) - - @property - def param_blobs(self): - if self.output_dim_vec is None or len(self.output_dim_vec) == 1: - return self.bootstrapped_FCs - else: - raise Exception("FCWithBootstrap layer only supports output_dim_vec==1") diff --git a/caffe2/python/layers/fc_without_bias.py b/caffe2/python/layers/fc_without_bias.py deleted file mode 100644 index 15f11c83dbb0..000000000000 --- a/caffe2/python/layers/fc_without_bias.py +++ /dev/null @@ -1,63 +0,0 @@ -## @package fc_without_bias -# Module caffe2.python.layers.fc_without_bias - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer -from caffe2.python.layers.sampling_trainable_mixin import SamplingTrainableMixin - 
-import math
-import numpy as np
-
-
-class FCWithoutBias(SamplingTrainableMixin, ModelLayer):
-    def __init__(
-        self,
-        model,
-        input_record,
-        output_dims,
-        weight_init=None,
-        weight_optim=None,
-        name='fc_without_bias',
-        uniform_weight_init_scale_numerator=1.0,
-        **kwargs
-    ):
-        super().__init__(model, name, input_record, **kwargs)
-        assert isinstance(input_record, schema.Scalar), "Incorrect input type"
-        assert len(input_record.field_types()[0].shape) > 0, (
-            "FCWithoutBias expects the input tensor to have at least one dimension"
-        )
-
-        input_dims = input_record.field_types()[0].shape[0]
-        assert input_dims > 0, (
-            "FCWithoutBias expects input dimensions > 0, got {}".format(input_dims)
-        )
-
-        self.output_schema = schema.Scalar(
-            (np.float32, (output_dims, )),
-            self.get_next_blob_reference('output')
-        )
-
-        scale = math.sqrt(uniform_weight_init_scale_numerator / input_dims)
-        weight_init = weight_init if weight_init else (
-            'UniformFill', {'min': -scale,
-                            'max': scale}
-        )
-
-        self.w = self.create_param(param_name='w',
-                                   shape=[output_dims, input_dims],
-                                   initializer=weight_init,
-                                   optimizer=weight_optim)
-
-    def _add_ops(self, net, params):
-        net.MatMul(
-            self.input_record.field_blobs() + params,
-            self.output_schema.field_blobs(), trans_b=1, **self.kwargs
-        )
-
-    @property
-    def param_blobs(self):
-        return [self.w]
diff --git a/caffe2/python/layers/feature_sparse_to_dense.py b/caffe2/python/layers/feature_sparse_to_dense.py
deleted file mode 100644
index 50ccdaafa7cd..000000000000
--- a/caffe2/python/layers/feature_sparse_to_dense.py
+++ /dev/null
@@ -1,330 +0,0 @@
-# @package sparse_to_dense
-# Module caffe2.python.layers.sparse_to_dense
-
-
-from collections import defaultdict
-
-import numpy as np
-from caffe2.python import schema
-from caffe2.python.layers.layers import AccessedFeatures, ModelLayer
-
-
-class FeatureSparseToDense(ModelLayer):
-    def __init__(
-        self,
-        model,
-        input_record,
-        input_specs,
-        name="feature_sparse_to_dense",
-        default_dense_value=None,
-        **kwargs
-    ):
-        """
-        `input_specs` follows the format of FeatureSpec from schema. To be more
-        precise, it's a namedtuple that should have:
-        'feature_type', 'feature_names', 'feature_ids'
-        `default_dense_value` can only be 0.0 or float("NaN"); any other value
-        is rejected, and None falls back to 0.0.
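
To make the densification concrete: the FLOAT branch of this layer (see add_ops further down) uses SparseToDenseMask to scatter sparse (feature id, value) pairs into fixed columns, filling absent ids with the default. A rough pure-Python equivalent, with illustrative names only:

import numpy as np

def dense_floats(keys, values, lengths, feature_ids, default=0.0):
    # keys/values: flattened per-example (feature_id, float) pairs;
    # lengths[i] gives how many pairs belong to example i.
    # feature_ids fixes the column order of the dense output.
    slot = {fid: j for j, fid in enumerate(feature_ids)}
    out = np.full((len(lengths), len(feature_ids)), default, dtype=np.float32)
    pos = 0
    for i, n in enumerate(lengths):
        for k, v in zip(keys[pos:pos + n], values[pos:pos + n]):
            if k in slot:            # ids outside the mask are dropped
                out[i, slot[k]] = v
        pos += n
    return out

# Two examples with feature ids (11, 12); missing slots keep the default.
print(dense_floats([11, 12, 12], [1.0, 2.0, 3.0], [2, 1], (11, 12)))
# [[1. 2.]
#  [0. 3.]]
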
- """ - super().__init__(model, name, input_record, **kwargs) - if default_dense_value is None: - default_dense_value = 0.0 - default_dense_value = float(default_dense_value) - assert ( - np.isnan(default_dense_value) or default_dense_value == 0.0 - ), "default_dense_value can only be 0.0 or NaN" - - self.input_specs = input_specs - self.default_float_value = ( - model.global_constants["NAN"] - if np.isnan(default_dense_value) - else model.global_constants["ZERO"] - ) - self.zero_range = model.global_constants["ZERO_RANGE"] - - outputs = [] - for field, feature_specs in self.input_specs: - assert len(feature_specs.feature_names) == len(feature_specs.feature_ids) - if feature_specs.feature_type == "FLOAT": - outputs.append( - ( - field, - schema.Scalar( - (np.float32, (len(feature_specs.feature_ids),)), - self.get_next_blob_reference(field + "_output"), - ), - ) - ) - elif feature_specs.feature_type == "ID_LIST": - outputs.append( - ( - field, - schema.Struct( - ( - "ranges", - schema.Scalar( - (np.int32, (len(feature_specs.feature_ids), 2)), - self.get_next_blob_reference(field + "_ranges"), - ), - ), - ( - "values", - schema.Scalar( - np.int64, - self.get_next_blob_reference(field + "_values"), - ), - ), - ), - ) - ) - elif feature_specs.feature_type == "ID_SCORE_LIST": - outputs.append( - ( - field, - schema.Struct( - ( - "ranges", - schema.Scalar( - (np.int32, (len(feature_specs.feature_ids), 2)), - self.get_next_blob_reference(field + "_ranges"), - ), - ), - ( - "ids", - schema.Scalar( - np.int64, - self.get_next_blob_reference(field + "_ids"), - ), - ), - ( - "scores", - schema.Scalar( - np.float32, - self.get_next_blob_reference(field + "_scores"), - ), - ), - ), - ) - ) - elif feature_specs.feature_type == "EMBEDDING": - # We don't know dimensions of embeddings in input data. - # Even though they should match dimensions from feature config, - # we keep ranges blob to check input data later. - outputs.append( - ( - field, - schema.Struct( - ( - "ranges", - schema.Scalar( - (np.int32, (len(feature_specs.feature_ids), 2)), - self.get_next_blob_reference(field + "_ranges"), - ), - ), - ( - "values", - schema.Scalar( - np.float32, - self.get_next_blob_reference(field + "_values"), - ), - ), - ), - ) - ) - elif feature_specs.feature_type == "GENERIC_FEATURE": - # We don't know dimensions of embeddings in input data. - # Even though they should match dimensions from feature config, - # we keep ranges blob to check input data later. - # Currently this schema with ranges and values is only for - # generic type enum 1. If new types are implemented, we need to - # modify the ParseGeneric operator, and this part accordingly - outputs.append( - ( - field, - schema.Struct( - ( - "ranges", - schema.Scalar( - (np.int32, (len(feature_specs.feature_ids), 2)), - self.get_next_blob_reference(field + "_ranges"), - ), - ), - ( - "values", - schema.Scalar( - np.float32, - self.get_next_blob_reference(field + "_values"), - ), - ), - ), - ) - ) - else: - raise TypeError( - "Unsupported input type: {0}".format(feature_specs.feature_type) - ) - - # TODO(amalevich): This schema is producing ranges. And thus if there is - # something using it it should support ranges as well. 
It might be
-        # confusing, if we don't add better support for ranges/have it as a
-        # first layer
-        self.output_schema = schema.Struct(*outputs)
-
-        # TODO(amalevich): Consider moving this data to schema instead.
-        # Structs don't support attaching metadata to them, and cloning
-        # will break things badly, but this is the most elegant way to pass
-        # this info around. Should we change it, or is it too much work and
-        # not worth it?
-        for field, feature_specs in input_specs:
-            schema.attach_metadata_to_scalars(
-                self.output_schema[field], schema.Metadata(feature_specs=feature_specs)
-            )
-
-    # Add operators to all types that need to be densified
-    def add_ops(self, net):
-        record = self.input_record
-        for field, feature_specs in self.input_specs:
-            if feature_specs.feature_type == "FLOAT":
-                net.SparseToDenseMask(
-                    [
-                        record[field].keys(),
-                        record[field].values(),
-                        self.default_float_value,
-                        record[field].lengths(),
-                    ],
-                    [self.output_schema[field]()],
-                    mask=feature_specs.feature_ids,
-                )
-            elif feature_specs.feature_type == "ID_LIST":
-                id_list_ranges = net.LengthsToRanges(
-                    record[field].values.lengths(), net.NextScopedBlob("id_list_ranges")
-                )
-                net.SparseToDenseMask(
-                    [
-                        record[field].keys(),
-                        id_list_ranges,
-                        self.zero_range,
-                        record[field].lengths(),
-                    ],
-                    self.output_schema[field].ranges(),
-                    mask=feature_specs.feature_ids,
-                )
-                # Alias helps to enforce the fact that all SparseToDense calls
-                # produce new blobs.
-                # Reusing blob names might result in some weird consequences
-                # during the delivery time, when content of the blobs is
-                # generated based on the inputSpecs.
-                net.Alias(
-                    record[field].values.items(), self.output_schema[field].values()
-                )
-            elif feature_specs.feature_type == "ID_SCORE_LIST":
-                # TODO: merge this to the case above?
-                id_list_ranges = net.LengthsToRanges(
-                    record[field].values.lengths(),
-                    net.NextScopedBlob("id_score_list_ranges"),
-                )
-                net.SparseToDenseMask(
-                    [
-                        record[field].keys(),
-                        id_list_ranges,
-                        self.zero_range,
-                        record[field].lengths(),
-                    ],
-                    self.output_schema[field].ranges(),
-                    mask=feature_specs.feature_ids,
-                )
-                # Alias helps to enforce the fact that all SparseToDense calls
-                # produce new blobs.
-                # Reusing blob names might result in some weird consequences
-                # during the delivery time, when content of the blobs is
-                # generated based on the inputSpecs.
-                net.Alias(record[field].values.keys(), self.output_schema[field].ids())
-                net.Alias(
-                    record[field].values.values(), self.output_schema[field].scores()
-                )
-            elif feature_specs.feature_type == "EMBEDDING":
-                ranges = net.LengthsToRanges(
-                    record[field].values.lengths(),
-                    net.NextScopedBlob("embeddings_ranges"),
-                )
-                net.SparseToDenseMask(
-                    [
-                        record[field].keys(),
-                        ranges,
-                        self.zero_range,
-                        record[field].lengths(),
-                    ],
-                    self.output_schema[field].ranges(),
-                    mask=feature_specs.feature_ids,
-                )
-                # Alias helps to enforce the fact that all SparseToDense calls
-                # produce new blobs.
-                # Reusing blob names might result in some weird consequences
-                # during the delivery time, when content of the blobs is
-                # generated based on the inputSpecs.
- net.Alias( - record[field].values.items(), self.output_schema[field].values() - ) - elif feature_specs.feature_type == "GENERIC_FEATURE": - ( - feature_lengths_blob, - feature_ids_blob, - value_lengths_blob, - value_values_blob, - ) = net.ParseGeneric( - [record[field]()], - ["feature_lengths", "feature_ids", "value_lengths", "value_values"], - feature_type_enum=1, - ) - # Currently our implementation only supports - # generic type enum 1. If new types are implemented, we need to - # modify the ParseGeneric operator, the schema above, - # and this part accordingly to parse the generic feature strings - # into input_record - - ranges = net.LengthsToRanges( - value_lengths_blob, net.NextScopedBlob("generics_ranges") - ) - net.SparseToDenseMask( - [feature_ids_blob, ranges, self.zero_range, feature_lengths_blob], - self.output_schema[field].ranges(), - mask=feature_specs.feature_ids, - ) - # Alias helps to enforce the fact that all SparseToDense calls - # produce new blobs. - # Reusing blob names might result in some weird consequences - # during the delivery time, when content of the blobs is - # generated based on the inputSpecs. - net.Alias(value_values_blob, self.output_schema[field].values()) - - def get_metadata(self): - metadata = [] - for field, feature_specs in self.input_specs: - metadata.append( - ( - { - "type": feature_specs.feature_type, - "names": feature_specs.feature_names, - "ids": feature_specs.feature_ids, - }, - self.output_schema[field].field_blobs(), - self.output_schema[field].field_types(), - ) - ) - if feature_specs.feature_type == "FLOAT": - metadata[-1][0]["cardinality"] = 1 - return metadata - - def get_accessed_features(self): - accessed_features = defaultdict(list) - - # The features that are accessed are just those features that appear in - # the input specs - for field, feature_specs in self.input_specs: - accessed_features[field].append( - AccessedFeatures( - feature_specs.feature_type, set(feature_specs.feature_ids) - ) - ) - - return accessed_features diff --git a/caffe2/python/layers/functional.py b/caffe2/python/layers/functional.py deleted file mode 100644 index 4543f695337d..000000000000 --- a/caffe2/python/layers/functional.py +++ /dev/null @@ -1,123 +0,0 @@ -# @package functional -# Module caffe2.python.layers.functional - - - - - -from caffe2.python import core, schema, scope, workspace -from caffe2.python.layers.layers import ( - ModelLayer, -) -import caffe2.proto.caffe2_pb2 as caffe2_pb2 -import numpy as np -import logging - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - - -class Functional(ModelLayer): - - def __init__(self, model, input_record, output_names_or_num, function, - name='functional', output_dtypes=None, tags=None, **kwargs): - - # allow coercion - input_record = schema.as_record(input_record) - - super().__init__(model, name, input_record, tags=tags, **kwargs) - self._function = function - self._kwargs = kwargs - return_struct = ( - isinstance(output_names_or_num, list) or - (isinstance(output_names_or_num, int) and - output_names_or_num != 1) - ) - - with scope.NameScope(self.name, reset=True): - if isinstance(output_names_or_num, int): - struct_output_schema = schema.NewRecord( - model.net, schema.RawTuple(output_names_or_num)) - elif isinstance(output_names_or_num, schema.Field): - self.output_schema = output_names_or_num.clone(keep_blobs=True) - return - else: - if not isinstance(output_names_or_num, list): - output_names_or_num = [output_names_or_num] - out_tuple = [(out, np.void) for out in 
output_names_or_num] - struct_output_schema = schema.NewRecord( - model.net, schema.Struct(*out_tuple)) - - num_outputs = len(struct_output_schema.field_blobs()) - - # functional layer returns Struct if more than one outputs or output is - # a list, otherwise Scalar - if return_struct: - self.output_schema = struct_output_schema - else: - self.output_schema = struct_output_schema[0] - - # If output_dtypes is provided, use it for output schema. Otherwise - # the shape and type will be inferred. - if output_dtypes is not None: - if not isinstance(output_dtypes, list): - output_dtypes = [output_dtypes] * num_outputs - assert len(output_dtypes) == num_outputs - for dtype, scalar in zip(output_dtypes, - self.output_schema.all_scalars()): - scalar.set_type(dtype) - return - - # Fake execution of the function to infer shapes and types automatically - had_issues = False - try: - type_net = core.Net('_temp_type_and_shape_inference_net') - schema.InitEmptyRecord(type_net, input_record, enforce_types=True) - - function(type_net, self.input_record, self.output_schema, **kwargs) - (shapes, types) = workspace.InferShapesAndTypes([type_net], {}) - for i in range(num_outputs): - scalar_schema = (self.output_schema[i] if return_struct - else self.output_schema) - blob = scalar_schema() - if blob not in types or blob not in shapes: - had_issues = True - continue - if shapes[blob] == []: - # Scalar type - shape = tuple() - elif shapes[blob][0] == 0: - shape = tuple(shapes[blob][1:]) - else: - logger.warning("unexpected shape: {}".format(shapes[blob])) - # If batch dimension is not first - give up on shape - # inference for that blob - had_issues = True - continue - - # TODO(amalevich): Move it to some shared library - dtype = None - if types[blob] == caffe2_pb2.TensorProto.DOUBLE: - dtype = (np.float64, shape) - elif types[blob] == caffe2_pb2.TensorProto.FLOAT: - dtype = (np.float32, shape) - elif types[blob] == caffe2_pb2.TensorProto.INT32: - dtype = (np.int32, shape) - elif types[blob] == caffe2_pb2.TensorProto.INT64: - dtype = (np.int64, shape) - elif types[blob] == caffe2_pb2.TensorProto.FLOAT16: - dtype = (np.float16, shape) - - if dtype is not None: - scalar_schema.set_type(dtype) - except TypeError as ex: - had_issues = True - logger.warning(str(ex)) - - if had_issues: - logger.warning( - "Type inference had problems for layer: {}".format(self.name)) - - def add_ops(self, net): - self._function( - net, self.input_record, self.output_schema, **(self._kwargs)) diff --git a/caffe2/python/layers/gather_record.py b/caffe2/python/layers/gather_record.py deleted file mode 100644 index 2ed36015981a..000000000000 --- a/caffe2/python/layers/gather_record.py +++ /dev/null @@ -1,89 +0,0 @@ -## @package gather_record -# Module caffe2.python.layers.gather_record - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer - - -class GatherRecord(ModelLayer): - """ - Given 1-D `indices` tensor, gather elements at `i` in `indices` from all the - blobs in `record`. If a blob is a values blob of a list, all the elements - included by the list's lengths blob are gathered. For example, - - Input: - indices = [0, 2] - record:a = [[0, 1], [2, 3], [4, 5], [6, 7]] - record:b:lengths = [0, 1, 2, 3] - record:b:items = [0, 1, 2, 3, 4, 5] - - Output: - a = [[0, 1], [4, 5]] - b:lengths = [0, 2] - b:items = [1, 2] - - This supports nested list. 
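
The LengthsGather behaviour this docstring describes can be reproduced with a prefix sum over the lengths blob; a small sketch that matches the example above (names are illustrative, not from the patch):

import numpy as np

def lengths_gather(items, lengths, indices):
    # Gather variable-length segments: segment i of `items` has
    # lengths[i] elements, so segment offsets come from a prefix sum.
    offsets = np.concatenate(([0], np.cumsum(lengths)[:-1]))
    gathered = [items[offsets[i]:offsets[i] + lengths[i]] for i in indices]
    return np.concatenate(gathered), np.asarray([lengths[i] for i in indices])

items = np.array([0, 1, 2, 3, 4, 5])
lengths = np.array([0, 1, 2, 3])
print(lengths_gather(items, lengths, [0, 2]))
# (array([1, 2]), array([0, 2]))
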
- """ - - def __init__(self, model, input_record, name='gather_record', **kwargs): - super().__init__(model, name, input_record, **kwargs) - - assert 'indices' in input_record - assert 'record' in input_record - - self.output_schema = schema.NewRecord( - model.net, input_record.record.clone_schema()) - - self._indices = self.input_record.indices() - - def _gather_scalar(self, net, record, lengths_blob, output_record): - if lengths_blob is None: - net.Gather([record(), self._indices], output_record()) - else: - net.LengthsGather([record(), lengths_blob, self._indices], - output_record()) - - def _gather_struct(self, net, record, lengths_blob, output_record): - for name, field in record.get_children(): - self._dispatch(net, field, lengths_blob, output_record[name]) - - def _gather_list(self, net, record, lengths_blob, output_record): - self._gather_scalar( - net, record.lengths, lengths_blob, output_record.lengths) - if lengths_blob is None: - lengths_blob = record.lengths() - else: - # TODO(kittipat): This is a hacky solution until LengthsSum for int - # is implemented - lengths_float = net.Cast( - record.lengths(), - net.NextScopedBlob(str(record.lengths()) + '_float'), - to=core.DataType.FLOAT, - ) - lengths_blob_float = net.LengthsSum( - [lengths_float, lengths_blob], - net.NextScopedBlob(str(record.lengths()) + "_nested_float") - ) - lengths_blob = net.Cast( - lengths_blob_float, - net.NextScopedBlob(str(record.lengths()) + "_nested"), - to=core.DataType.INT32, - ) - self._dispatch(net, record._items, lengths_blob, output_record._items) - - def _dispatch(self, net, record, lengths_blob, output_record): - if isinstance(record, schema.Scalar): - self._gather_scalar(net, record, lengths_blob, output_record) - elif isinstance(record, schema.Struct): - self._gather_struct(net, record, lengths_blob, output_record) - elif isinstance(record, schema.List): - self._gather_list(net, record, lengths_blob, output_record) - else: - raise NotImplementedError - - def add_ops(self, net): - self._dispatch(net, self.input_record.record, None, self.output_schema) diff --git a/caffe2/python/layers/homotopy_weight.py b/caffe2/python/layers/homotopy_weight.py deleted file mode 100644 index 4c24223cbc8d..000000000000 --- a/caffe2/python/layers/homotopy_weight.py +++ /dev/null @@ -1,124 +0,0 @@ -# @package homotopy_weight -# Module caffe2.fb.python.layers.homotopy_weight - - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer -import numpy as np -import logging -logger = logging.getLogger(__name__) -''' -Homotopy Weighting between two weights x, y by doing: - alpha x + beta y -where alpha is a decreasing scalar parameter ranging from [min, max] (default, -[0, 1]), and alpha + beta = max + min, which means that beta is increasing in -the range [min, max]; - -Homotopy methods first solves an "easy" problem (one to which the solution is -well known), and is gradually transformed into the target problem -''' - - -class HomotopyWeight(ModelLayer): - def __init__( - self, - model, - input_record, - name='homotopy_weight', - min_weight=0., - max_weight=1., - half_life=1e6, - quad_life=3e6, - atomic_iter=None, - **kwargs - ): - super(HomotopyWeight, - self).__init__(model, name, input_record, **kwargs) - self.output_schema = schema.Scalar( - np.float32, self.get_next_blob_reference('homotopy_weight') - ) - data = self.input_record.field_blobs() - assert len(data) == 2 - self.x = data[0] - self.y = data[1] - # TODO: currently model building does not have access to iter 
counter or - # learning rate; it's added at optimization time; - self.use_external_iter = (atomic_iter is not None) - self.atomic_iter = ( - atomic_iter if self.use_external_iter else self.create_atomic_iter() - ) - # to map lr to [min, max]; alpha = scale * lr + offset - assert max_weight > min_weight - self.scale = float(max_weight - min_weight) - self.offset = self.model.add_global_constant( - '%s_offset_1dfloat' % self.name, float(min_weight) - ) - self.gamma, self.power = self.solve_inv_lr_params(half_life, quad_life) - - def solve_inv_lr_params(self, half_life, quad_life): - # ensure that the gamma, power is solvable - assert half_life > 0 - # convex monotonically decreasing - assert quad_life > 2 * half_life - t = float(quad_life) / float(half_life) - x = t * (1.0 + np.sqrt(2.0)) / 2.0 - np.sqrt(2.0) - gamma = (x - 1.0) / float(half_life) - power = np.log(2.0) / np.log(x) - logger.info( - 'homotopy_weighting: found lr param: gamma=%g, power=%g' % - (gamma, power) - ) - return gamma, power - - def create_atomic_iter(self): - self.mutex = self.create_param( - param_name=('%s_mutex' % self.name), - shape=None, - initializer=('CreateMutex', ), - optimizer=self.model.NoOptim, - ) - self.atomic_iter = self.create_param( - param_name=('%s_atomic_iter' % self.name), - shape=[1], - initializer=( - 'ConstantFill', { - 'value': 0, - 'dtype': core.DataType.INT64 - } - ), - optimizer=self.model.NoOptim, - ) - return self.atomic_iter - - def update_weight(self, net): - alpha = net.NextScopedBlob('alpha') - beta = net.NextScopedBlob('beta') - lr = net.NextScopedBlob('lr') - comp_lr = net.NextScopedBlob('complementary_lr') - scaled_lr = net.NextScopedBlob('scaled_lr') - scaled_comp_lr = net.NextScopedBlob('scaled_complementary_lr') - if not self.use_external_iter: - net.AtomicIter([self.mutex, self.atomic_iter], [self.atomic_iter]) - net.LearningRate( - [self.atomic_iter], - [lr], - policy='inv', - gamma=self.gamma, - power=self.power, - base_lr=1.0, - ) - net.Sub([self.model.global_constants['ONE'], lr], [comp_lr]) - net.Scale([lr], [scaled_lr], scale=self.scale) - net.Scale([comp_lr], [scaled_comp_lr], scale=self.scale) - net.Add([scaled_lr, self.offset], [alpha]) - net.Add([scaled_comp_lr, self.offset], [beta]) - return alpha, beta - - def add_ops(self, net): - alpha, beta = self.update_weight(net) - # alpha x + beta y - net.WeightedSum([self.x, alpha, self.y, beta], self.output_schema()) diff --git a/caffe2/python/layers/label_smooth.py b/caffe2/python/layers/label_smooth.py deleted file mode 100644 index 5f6f6b9961a9..000000000000 --- a/caffe2/python/layers/label_smooth.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
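
Stepping back to the HomotopyWeight layer above before the next file: its 'inv' learning-rate schedule can be checked numerically. A pure-NumPy sketch under the layer's own definitions, where gamma and power are chosen so the rate is exactly halved at half_life and roughly quartered at quad_life (function names here are illustrative):

import numpy as np

def solve_inv_lr_params(half_life, quad_life):
    # Same closed form as HomotopyWeight.solve_inv_lr_params above.
    t = float(quad_life) / float(half_life)
    x = t * (1.0 + np.sqrt(2.0)) / 2.0 - np.sqrt(2.0)
    gamma = (x - 1.0) / float(half_life)
    power = np.log(2.0) / np.log(x)
    return gamma, power

def alpha_beta(iter_, gamma, power, min_w=0.0, max_w=1.0):
    lr = (1.0 + gamma * iter_) ** (-power)   # the 'inv' learning-rate policy
    scale = max_w - min_w
    alpha = scale * lr + min_w               # decreasing from max_w
    beta = scale * (1.0 - lr) + min_w        # increasing toward max_w
    return alpha, beta                       # alpha + beta == max_w + min_w

gamma, power = solve_inv_lr_params(1e6, 3e6)
for it in (0, 1e6, 3e6):
    print(it, alpha_beta(it, gamma, power))
# alpha is 1.0 at iter 0, 0.5 at half_life, and ~0.25 at quad_life
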
-############################################################################## - -# @package label_smooth -# Module caffe2.python.layers.label_smooth - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer -import numpy as np - - -class LabelSmooth(ModelLayer): - def __init__( - self, model, label, smooth_matrix, name='label_smooth', **kwargs - ): - super().__init__(model, name, label, **kwargs) - self.label = label - # shape as a list - smooth_matrix = np.array(smooth_matrix).astype(np.float32).flatten() - self.set_dim(smooth_matrix) - self.set_smooth_matrix(smooth_matrix) - self.output_schema = schema.Scalar( - (np.float32, (self.dim, )), - self.get_next_blob_reference('smoothed_label') - ) - - def set_dim(self, smooth_matrix): - num_elements = smooth_matrix.size - self.binary_prob_label = (num_elements == 2) - if self.binary_prob_label: - self.dim = 1 - else: - assert np.sqrt(num_elements)**2 == num_elements - self.dim = int(np.sqrt(num_elements)) - - def set_smooth_matrix(self, smooth_matrix): - if not self.binary_prob_label: - self.smooth_matrix = self.model.add_global_constant( - '%s_label_smooth_matrix' % self.name, - array=smooth_matrix.reshape((self.dim, self.dim)), - dtype=np.dtype(np.float32), - ) - self.len = self.model.add_global_constant( - '%s_label_dim' % self.name, - array=self.dim, - dtype=np.dtype(np.int64), - ) - else: - self.smooth_matrix = smooth_matrix - - def add_ops_for_binary_prob_label(self, net): - if self.label.field_type().base != np.float32: - float32_label = net.NextScopedBlob('float32_label') - net.Cast([self.label()], [float32_label], to=core.DataType.FLOAT) - else: - float32_label = self.label() - net.StumpFunc( - float32_label, - self.output_schema(), - threshold=0.5, - low_value=self.smooth_matrix[0], - high_value=self.smooth_matrix[1], - ) - - def add_ops_for_categorical_label(self, net): - if self.label.field_type().base != np.int64: - int64_label = net.NextScopedBlob('int64_label') - net.Cast([self.label()], [int64_label], to=core.DataType.INT64) - else: - int64_label = self.label() - one_hot_label = net.NextScopedBlob('one_hot_label') - net.OneHot([int64_label, self.len], [one_hot_label]) - net.MatMul([one_hot_label, self.smooth_matrix], self.output_schema()) - - def add_ops(self, net): - if self.binary_prob_label: - self.add_ops_for_binary_prob_label(net) - else: - self.add_ops_for_categorical_label(net) diff --git a/caffe2/python/layers/last_n_window_collector.py b/caffe2/python/layers/last_n_window_collector.py deleted file mode 100644 index 3b44ea708031..000000000000 --- a/caffe2/python/layers/last_n_window_collector.py +++ /dev/null @@ -1,69 +0,0 @@ -## @package last_n_window_collector -# Module caffe2.python.layers.last_n_window_collector - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer - - -class LastNWindowCollector(ModelLayer): - """ - Collect last-N samples from input record. If you have complex data, - use PackRecords to pack it before using this layer. - - This layer is not thread safe. 
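
Setting the mutex bookkeeping aside, the collection semantics amount to a ring buffer over the last N rows. A single-threaded sketch of one plausible replacement order, illustrative rather than the operator's exact implementation:

class LastN:
    # Ring-buffer equivalent of LastNWindowCollector (single-threaded; the
    # real op also tracks a cursor blob and guards updates with a mutex).
    def __init__(self, num_to_collect):
        assert num_to_collect > 0
        self.n = num_to_collect
        self.buf = []            # the 'last_n' blob
        self.num_visited = 0     # the 'num_visited' blob

    def collect(self, batch):
        for row in batch:
            if len(self.buf) < self.n:
                self.buf.append(row)
            else:
                # overwrite the oldest slot once the buffer is full
                self.buf[self.num_visited % self.n] = row
            self.num_visited += 1
        return self.buf

last3 = LastN(3)
last3.collect([1, 2, 3, 4, 5])
print(last3.buf, last3.num_visited)  # [4, 5, 3] 5
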
- """ - - def __init__(self, model, input_record, num_to_collect, - name='last_n_window_collector', **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert num_to_collect > 0 - self.num_to_collect = num_to_collect - assert isinstance(input_record, schema.Scalar), \ - "Got {!r}".format(input_record) - - self.last_n = self.create_param(param_name='last_n', - shape=[0], - initializer=('ConstantFill', {}), - optimizer=model.NoOptim) - - self.next_blob = self.create_param( - param_name='next', - shape=[], - initializer=('ConstantFill', - {'value': 0, 'dtype': core.DataType.INT32}), - optimizer=model.NoOptim - ) - - self.mutex = self.create_param( - param_name='mutex', - shape=[], - initializer=('CreateMutex',), - optimizer=model.NoOptim, - ) - - self.num_visited_blob = self.create_param( - param_name='num_visited', - shape=[], - initializer=('ConstantFill', { - 'value': 0, - 'dtype': core.DataType.INT64, - }), - optimizer=model.NoOptim, - ) - - self.output_schema = schema.Struct( - ( - 'last_n', - schema.from_blob_list(input_record, [self.last_n]) - ), - ('num_visited', schema.Scalar(blob=self.num_visited_blob)), - ('mutex', schema.Scalar(blob=self.mutex)), - ) - - def add_ops(self, net): - net.LastNWindowCollector( - [self.last_n, self.next_blob, self.input_record(), self.mutex, - self.num_visited_blob], - [self.last_n, self.next_blob, self.num_visited_blob], - num_to_collect=self.num_to_collect, - ) diff --git a/caffe2/python/layers/layer_normalization.py b/caffe2/python/layers/layer_normalization.py deleted file mode 100644 index 0e722c960e39..000000000000 --- a/caffe2/python/layers/layer_normalization.py +++ /dev/null @@ -1,120 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer - -import numpy as np - - -class LayerNormalization(ModelLayer): - def __init__( - self, - model, - input_record, - name='layer_normalization', - scale_optim=None, - bias_optim=None, - epsilon=1e-4, - axis=1, - use_layer_norm_op=True, - scale_init_value=1.0, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert isinstance(input_record, schema.Scalar), ( - "Incorrect input type: {}".format(input_record)) - - self.input_shape = input_record.field_type().shape - self.axis = axis - - assert len(self.input_shape) >= 1, ( - "This layer supports only >= 2D tensors") - input_dims = self.input_shape[0] - - self.output_schema = schema.Scalar( - (np.float32, self.input_shape), - self.get_next_blob_reference('output') - ) - - self.scale = self.create_param(param_name='scale', - shape=[input_dims], - initializer=('ConstantFill', {'value': scale_init_value}), - optimizer=scale_optim) - self.bias = self.create_param(param_name='bias', - shape=[input_dims], - initializer=('ConstantFill', {'value': 0.0}), - optimizer=bias_optim) - self.use_layer_norm_op = use_layer_norm_op - - if self.use_layer_norm_op: - self.epsilon = epsilon - else: - assert len(self.input_shape) == 1, ( - "When using alternative implementation, " - "input data can only be 2D" - ) - self.epsilon = model.maybe_add_global_constant( - "%s_epsilon" % self.name, float(epsilon) - ) - - def add_ops_with_layer_norm_op(self, net): - input_blob = self.input_record.field_blobs() - ln_output = self.output_schema.field_blobs() - - output_blobs = [net.NextScopedBlob('ln_output'), net.NextScopedBlob('ln_mean'), - net.NextScopedBlob('ln_stdev')] - - normalized, mean, stdev = net.LayerNorm(input_blob, - output_blobs, - axis=self.axis, - epsilon=self.epsilon) - - scaled = net.Mul( 
- [normalized, self.scale], - [net.NextScopedBlob('ln_scaled')], - broadcast=1, - axis=self.axis, - ) - - net.Add( - [scaled, self.bias], - ln_output, - broadcast=1, - axis=self.axis, - ) - - def add_ops_without_layer_norm_op(self, net): - # two issues here: - # 1. use multiple ops to replace the function of LayerNorm - # 2. do not use legacy broadcast - ln_output = net.NextScopedBlob("ln_output") - ln_mean = net.NextScopedBlob("ln_mean") - ln_stdev = net.NextScopedBlob("ln_stdev") - ln_mean_arr = net.NextScopedBlob("ln_mean_arr") - net.ReduceBackMean(self.input_record.field_blobs(), [ln_mean_arr]) - net.ExpandDims([ln_mean_arr], [ln_mean], dims=[1]) - ln_centered = net.NextScopedBlob("ln_centered") - net.Sub(self.input_record.field_blobs() + [ln_mean], [ln_centered]) - ln_sqr = net.NextScopedBlob("ln_sqr") - net.Sqr([ln_centered], [ln_sqr]) - ln_sqr_mean = net.NextScopedBlob("ln_sqr_mean") - net.ReduceBackMean([ln_sqr], [ln_sqr_mean]) - ln_var = net.NextScopedBlob("ln_var") - net.Add([ln_sqr_mean, self.epsilon], ln_var) - ln_std_arr = net.NextScopedBlob("ln_std_arr") - net.Pow([ln_var], [ln_std_arr], exponent=0.5) - net.ExpandDims([ln_std_arr], [ln_stdev], dims=[1]) - net.Div([ln_centered, ln_stdev], [ln_output]) - ln_scaled = net.NextScopedBlob("ln_scaled") - net.Mul([ln_output, self.scale], [ln_scaled]) - net.Add([ln_scaled, self.bias], self.output_schema.field_blobs()) - - def add_ops(self, net): - if self.use_layer_norm_op: - self.add_ops_with_layer_norm_op(net) - else: - self.add_ops_without_layer_norm_op(net) diff --git a/caffe2/python/layers/layers.py b/caffe2/python/layers/layers.py deleted file mode 100644 index 30b632eef2ba..000000000000 --- a/caffe2/python/layers/layers.py +++ /dev/null @@ -1,496 +0,0 @@ -## @package layers -# Module caffe2.python.layers.layers - - -import logging -from collections import namedtuple - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, schema, scope, utils, workspace -from caffe2.python.layers.tags import TagContext - - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -# Some types to simplify descriptions of things traveling between ops -IdList = schema.List(np.int64) -IdScoreList = schema.Map(np.int64, np.float32) -IdListWithEvicted = schema.ListWithEvicted(np.int64) -IdScoreListWithEvicted = schema.MapWithEvicted(np.int64, np.float32) - - -def almost_equal_schemas( - record, - original_schema, - check_field_names=True, - check_field_types=True, - check_field_metas=False, -): - if original_schema == IdList: - return schema.equal_schemas( - record, - IdList, - check_field_names=check_field_names, - check_field_types=check_field_types, - check_field_metas=check_field_metas, - ) or schema.equal_schemas( - record, - IdListWithEvicted, - check_field_names=check_field_names, - check_field_types=check_field_types, - check_field_metas=check_field_metas, - ) - elif original_schema == IdScoreList: - return schema.equal_schemas( - record, - IdScoreList, - check_field_names=check_field_names, - check_field_types=check_field_types, - check_field_metas=check_field_metas, - ) or schema.equal_schemas( - record, - IdScoreListWithEvicted, - check_field_names=check_field_names, - check_field_types=check_field_types, - check_field_metas=check_field_metas, - ) - else: - return schema.equal_schemas(record, original_schema) - - -def get_key(record): - if almost_equal_schemas(record, IdList): - key = "values" - elif almost_equal_schemas( - record, IdScoreList, check_field_types=False - ): - key = 
"values:keys" - else: - raise NotImplementedError("Not implemented for {}".format(record)) - assert record[key].metadata is not None, "Blob {} doesn't have metadata".format( - str(record[key]()) - ) - return record[key] - - -def get_categorical_limit(record): - key = get_key(record) - return key.metadata.categorical_limit - - -def get_avg_length(record): - return record["lengths"].metadata.expected_value - - -def set_request_only(field): - for f in field.all_scalars(): - categorical_limit, expected_value = None, None - if not f.metadata: - feature_specs = schema.FeatureSpec(feature_is_request_only=True) - elif not f.metadata.feature_specs: - categorical_limit = f.metadata.categorical_limit - expected_value = f.metadata.expected_value - feature_specs = schema.FeatureSpec(feature_is_request_only=True) - else: - categorical_limit = f.metadata.categorical_limit - expected_value = f.metadata.expected_value - feature_specs = schema.FeatureSpec( - feature_type=f.metadata.feature_specs.feature_type, - feature_names=f.metadata.feature_specs.feature_names, - feature_ids=f.metadata.feature_specs.feature_ids, - feature_is_request_only=True, - desired_hash_size=f.metadata.feature_specs.desired_hash_size, - ) - - # make sure not to set categorical_limit for a non-integer field - if not np.issubdtype(f.field_type(), np.integer): - assert ( - categorical_limit is None - ), "categorical_limit shouldn't be set for no-integer field" - - f.set_metadata( - schema.Metadata( - categorical_limit=categorical_limit, - expected_value=expected_value, - feature_specs=feature_specs, - ) - ) - - -class InstantiationContext: - """ - List of contexts where layer could be instantitated - """ - - # The layers support this context will accumulate predictions, labels, - # weights. The accumulated data can later be used to compute - # calibration or for other - # purpose. 
-    ACCUMULATE_PRED = "accumulate_pred"
-    EVAL = "eval"
-    PREDICTION = "prediction"
-    TRAINING = "training"
-
-
-_LAYER_REGISTRY = {}
-
-
-def register_layer(name, layer):
-    assert name not in _LAYER_REGISTRY, "{0} already exists".format(name)
-    _LAYER_REGISTRY[name] = layer
-
-
-def layer_exists(name):
-    return name in _LAYER_REGISTRY
-
-
-def get_layer_class(name):
-    return _LAYER_REGISTRY[name]
-
-
-def create_layer(layer_name, *args, **kwargs):
-    return _LAYER_REGISTRY[layer_name](*args, **kwargs)
-
-
-LayerPsParam = namedtuple("LayerPsParam", ["sparse_key", "average_length"])
-
-
-class LayerParameter:
-    def __init__(
-        self,
-        parameter=None,
-        optimizer=None,
-        initializer=None,
-        ps_param=None,
-        regularizer=None,
-    ):
-        assert isinstance(
-            parameter, core.BlobReference
-        ), "expected {0} to be a blob reference".format(str(parameter))
-        # need to put the following line (shape) before the initializer;
-        # shape will be updated once the initializer is (re)set
-        self._shape = None
-        self.parameter = parameter
-        self.optimizer = optimizer
-        self.initializer = initializer
-        self.ps_param = ps_param
-        self.regularizer = regularizer
-
-    @property
-    def initializer(self):
-        return self._initializer
-
-    @initializer.setter
-    def initializer(self, op):
-        assert op is None or core.IsOperator(
-            getattr(op, "type", None)
-        ), "initializer expects an operator, got type: {}".format(type(op))
-        self._initializer = op
-        if op is not None:
-            self.shape = self._infer_shape_from_initializer()
-
-    @property
-    def shape(self):
-        return self._shape
-
-    @shape.setter
-    def shape(self, shape):
-        assert self.shape is None or self.shape == shape, (
-            "inconsistent shape for layer parameter:"
-            " {}, expect: {}, but got {}".format(self, self.shape, shape)
-        )
-        self._shape = shape
-
-    def _infer_shape_from_initializer(self):
-        for arg in self.initializer.arg:
-            if arg.name == "shape":
-                return list(arg.ints)
-        with workspace.WorkspaceGuard("model_init_by_loading_params"):
-            try:
-                net = core.Net("shape_checker")
-                net._net.op.extend([self.initializer])
-                shape_blob = net.NextScopedBlob(self.parameter + "_shape")
-                net.Shape([self.parameter], shape_blob)
-                workspace.RunNetOnce(net)
-                shape = workspace.FetchBlob(shape_blob).tolist()
-                # ResetWorkspace to save memory
-                workspace.ResetWorkspace()
-                return shape
-            except RuntimeError as exp:
-                logger.warning(
-                    "Cannot infer the shape of blob {} from operator {}: {}".format(
-                        self.parameter, self.initializer.type, exp
-                    )
-                )
-                workspace.ResetWorkspace()
-                return None
-
-    def __str__(self):
-        return str(self.parameter)
-
-
-def is_request_only_scalar(scalar):
-    if len(scalar.field_metadata()) == 0:
-        return False
-    for metadata in scalar.field_metadata():
-        if not (
-            metadata
-            and metadata.feature_specs
-            and getattr(metadata.feature_specs, "feature_is_request_only", False)
-        ):
-            return False
-    return True
-
-# Contains features accessed in a model layer of a given type
-# `type`: A string representing the kind of feature, consistent with FeatureSpec
-# `ids`: A set of feature IDs that are accessed in the model layer
-AccessedFeatures = namedtuple("AccessedFeatures", ["type", "ids"])
-
-class ModelLayer:
-    def __init__(
-        self,
-        model,
-        prefix,
-        input_record,
-        predict_input_record_fields=None,
-        tags=None,
-        **kwargs
-    ):
-        """
-        Base class for model layers.
A layer is an abstraction that lets the
-        model be described in terms of meta-operators, where each
-        meta-operator can have different implementations for training,
-        evaluation, and prediction that are instantiated later. For example,
-        SampledSoftmax can perform sampling-related work under supervision
-        during training, and simply apply softmax when used for
-        prediction/evaluation.
-
-        All inputs/outputs of layers are represented as a record (an instance
-        of a schema bound to blobs) and are accessible through input_record
-        and output_schema. If a layer consumes only a subset of its inputs or
-        provides only a subset of its outputs during inference, it should
-        expose predict_input_record and predict_output_schema accordingly
-        (those records are expected to be subsets of
-        input_record/output_schema).
-
-        Each layer has a list of Tags associated with it, which depends on
-        the current context and arguments. Those tags can be used at
-        instantiation time.
-
-        """
-        self.name = model.next_layer_name(prefix)
-        self.model = model
-        self.kwargs = kwargs
-        self._input_record = input_record
-        if predict_input_record_fields:
-            if not isinstance(predict_input_record_fields, list):
-                predict_input_record_fields = [predict_input_record_fields]
-            self._predict_input_record = self._input_record[predict_input_record_fields]
-        else:
-            self._predict_input_record = None
-
-        self.request_only = True
-        if len(input_record.all_scalars()) == 0:
-            self.request_only = False
-        for scalar in input_record.all_scalars():
-            if not is_request_only_scalar(scalar):
-                self.request_only = False
-                break
-
-        self.precomputation_request_only = False
-        self.precomputation_object_only = False
-
-        self._output_schema = None
-        self._predict_output_schema = None
-        self.eval_output_schema = None
-        self.tags = set(tags or [])
-        self.tags.update(TagContext.current().tags)
-        self.params = []
-        self._export_output_for_metrics = False
-        self._export_params_for_metrics = False
-
-    def get_type(self):
-        return self.__class__.__name__
-
-    def _check_output_schema(self):
-        assert self._output_schema is not None, "Schema is not initialized"
-        assert self._predict_output_schema is None or schema.is_schema_subset(
-            self._predict_output_schema, self._output_schema
-        ), "predict_output_schema is not a subset of the output_schema"
-
-    @property
-    def predict_input_record(self):
-        return self._predict_input_record or self._input_record
-
-    @property
-    def input_record(self):
-        return self._input_record
-
-    @property
-    def predict_output_schema(self):
-        self._check_output_schema()
-        return self._predict_output_schema or self._output_schema
-
-    @predict_output_schema.setter
-    def predict_output_schema(self, output_schema):
-        assert self._predict_output_schema is None
-        self._predict_output_schema = output_schema
-
-    @property
-    def output_schema(self):
-        if self.request_only:
-            set_request_only(self._output_schema)
-        self._check_output_schema()
-        return self._output_schema
-
-    @output_schema.setter
-    def output_schema(self, output_schema):
-        assert self._output_schema is None
-        self._output_schema = output_schema
-
-    def get_parameters(self):
-        return self.params
-
-    def get_fp16_compatible_parameters(self):
-        """Return a subset of parameters which can be converted to fp16"""
-        return []
-
-    def get_memory_usage(self):
-        return 0
-
-    def get_accessed_features(self):
-        """
-        Return a map from field to list of AccessedFeatures; the map should
-        contain all features accessed in the model layer
-        """
-        return {}
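For orientation, the contract described in the docstring above amounts to: call the base constructor, declare `output_schema` in `__init__`, and emit operators in `add_ops`. A minimal sketch of a custom layer against this API (the `ScaleByConstant` layer is hypothetical, for illustration only):

    import numpy as np
    from caffe2.python import schema
    from caffe2.python.layers.layers import ModelLayer

    class ScaleByConstant(ModelLayer):  # hypothetical, illustrative only
        def __init__(self, model, input_record, factor=2.0,
                     name='scale_by_constant', **kwargs):
            super().__init__(model, name, input_record, **kwargs)
            assert isinstance(input_record, schema.Scalar), "Incorrect input type"
            self.factor = factor
            # Declare the output record; its blob is scoped under this layer.
            self.output_schema = schema.Scalar(
                (np.float32, input_record.field_type().shape),
                self.get_next_blob_reference('output'),
            )

        def add_ops(self, net):
            # Prediction implementation; train/eval default to this one.
            net.Scale(
                self.input_record.field_blobs(),
                self.output_schema.field_blobs(),
                scale=self.factor,
            )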
-
-    def add_init_params(self, init_net):
-        """
-        Adds layer initialization operators to the passed net.
-        """
-        for param in self.params:
-            # TODO(amalevich): Either go back to lambdas that add all params
-            # (looks a bit safer and breaks fewer abstractions) or extend the
-            # Net interface to handle this type of operation better
-            # TODO(xlwang) init_net._net.op has type google.protobuf.\
-            # internal.containers.RepeatedCompositeFieldContainer, but
-            # the version of protobuf in fbcode does not support append,
-            # so extend is used instead
-            init_op = param.initializer
-            current_device_scope = scope.CurrentDeviceScope()
-            if not init_op:
-                continue
-
-            if not init_op.HasField("device_option") and current_device_scope:
-                init_op = caffe2_pb2.OperatorDef()
-                init_op.CopyFrom(param.initializer)
-                init_op.device_option.CopyFrom(current_device_scope)
-
-            # do not add duplicated init ops
-            if any(
-                utils.OpAlmostEqual(op, init_op, "debug_info")
-                for op in init_net._net.op
-            ):
-                continue
-
-            init_net._net.op.extend([init_op])
-
-    def create_param(
-        self, param_name, shape, initializer, optimizer, ps_param=None, regularizer=None
-    ):
-        with scope.NameScope(self.name, reset=True):
-            param = self.model.create_param(
-                param_name=param_name,
-                shape=shape,
-                initializer=initializer,
-                optimizer=optimizer,
-                ps_param=ps_param,
-                regularizer=regularizer,
-            )
-
-            # make sure we don't share parameters in the same layer
-            assert all(param.parameter != p.parameter for p in self.params)
-
-            self.params.append(param)
-            return param.parameter
-
-    def get_next_blob_reference(self, name):
-        with scope.NameScope(self.name, reset=True):
-            return self.model.net.NextScopedBlob(name)
-
-    def add_operators(self, net, init_net=None, context=InstantiationContext.TRAINING):
-        """
-        Adds layer training or initialization operators to the passed-in net.
-        init_net can be None; add_init_params can be called independently instead.
-        """
-        # The name scope below should guarantee that all intermediate blobs
-        # are associated with the layer that produces them
-        with scope.NameScope(self.name):
-            if context not in {
-                InstantiationContext.PREDICTION,
-                InstantiationContext.EVAL,
-                InstantiationContext.ACCUMULATE_PRED,
-            }:
-                assert init_net, "init_net is required outside of the prediction, eval, and accumulate_pred contexts"
-            if init_net:
-                self.add_init_params(init_net)
-            if context == InstantiationContext.TRAINING:
-                self.add_train_ops(net)
-            elif context == InstantiationContext.EVAL:
-                self.add_eval_ops(net)
-            elif context == InstantiationContext.ACCUMULATE_PRED:
-                self.add_ops_to_accumulate_pred(net)
-            else:
-                self.add_ops(net)
-
-            if (
-                context in {InstantiationContext.TRAINING, InstantiationContext.EVAL}
-                and self._export_params_for_metrics
-            ):
-                self.add_param_copy_operators(net)
-
-    def add_ops(self, net):
-        # Predict layer implementation.
-        raise NotImplementedError
-
-    def add_eval_ops(self, net):
-        # By default the eval implementation matches the
-        # predict implementation.
-        self.add_ops(net)
-
-    def add_train_ops(self, net):
-        # By default the train implementation matches the
-        # eval implementation.
-        self.add_eval_ops(net)
-
-    def add_ops_to_accumulate_pred(self, net):
-        # This adds operators to accumulate predictions/labels/weights. The
-        # accumulated data can later be used to compute calibration or for
-        # other purposes. By default this matches the eval
-        # implementation.
- self.add_eval_ops(net) - - def add_param_copy_operators(self, net): - for param in self.params: - param_copy_ref = self.model.metrics_schema[str(param.parameter)] - net.Copy([param.parameter], param_copy_ref.field_blobs()) - - def export_output_for_metrics(self): - self._export_output_for_metrics = True - - # Export output of the layer directly - export_name = self.name + "/output" - self.model.add_metric_field(export_name, self.output_schema) - - def export_params_for_metrics(self): - self._export_params_for_metrics = True - - # Export copies of parameters - for param in self.params: - param_copy_ref = self.get_next_blob_reference( - str(param).split("/")[-1] + "_copy" - ) - self.model.add_metric_field(str(param.parameter), param_copy_ref) diff --git a/caffe2/python/layers/margin_rank_loss.py b/caffe2/python/layers/margin_rank_loss.py deleted file mode 100644 index be8762938824..000000000000 --- a/caffe2/python/layers/margin_rank_loss.py +++ /dev/null @@ -1,62 +0,0 @@ -## @package random_neg_rank_loss -# Module caffe2.python.layers.random_neg_rank_loss - - - - - -from caffe2.python import schema, core -from caffe2.python.layers.layers import ( - ModelLayer, -) -from caffe2.python.layers.tags import ( - Tags -) -import numpy as np - - -class MarginRankLoss(ModelLayer): - - def __init__(self, model, input_record, name='margin_rank_loss', - margin=0.1, average_loss=False, **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert margin >= 0, ('For hinge loss, margin should be no less than 0') - self._margin = margin - self._average_loss = average_loss - assert schema.is_schema_subset( - schema.Struct( - ('pos_prediction', schema.Scalar()), - ('neg_prediction', schema.List(np.float32)), - ), - input_record - ) - self.tags.update([Tags.EXCLUDE_FROM_PREDICTION]) - self.output_schema = schema.Scalar( - np.float32, - self.get_next_blob_reference('output')) - - def add_ops(self, net): - neg_score = self.input_record.neg_prediction['values']() - - pos_score = net.LengthsTile( - [ - self.input_record.pos_prediction(), - self.input_record.neg_prediction['lengths']() - ], - net.NextScopedBlob('pos_score_repeated') - ) - const_1 = net.ConstantFill( - neg_score, - net.NextScopedBlob('const_1'), - value=1, - dtype=core.DataType.INT32 - ) - rank_loss = net.MarginRankingCriterion( - [pos_score, neg_score, const_1], - net.NextScopedBlob('rank_loss'), - margin=self._margin, - ) - if self._average_loss: - net.AveragedLoss(rank_loss, self.output_schema.field_blobs()) - else: - net.ReduceFrontSum(rank_loss, self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/merge_id_lists.py b/caffe2/python/layers/merge_id_lists.py deleted file mode 100644 index d130c48b6c4f..000000000000 --- a/caffe2/python/layers/merge_id_lists.py +++ /dev/null @@ -1,50 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - get_categorical_limit, - ModelLayer, - IdList -) - -import numpy as np - - -class MergeIdLists(ModelLayer): - """Merge multiple ID_LISTs into a single ID_LIST - - Args: - model: A layer model instance - input_record: Tuple (Struct) of ID_LIST features to be - merged - - Returns: - the merged ID_LIST feature - """ - def __init__(self, model, input_record, name='merged'): - super().__init__(model, name, input_record) - assert all(schema.equal_schemas(x, IdList) for x in input_record), \ - "Inputs to MergeIdLists should all be IdLists." 
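Stepping back to the MarginRankLoss layer above: with the constant label y = 1 produced by ConstantFill, MarginRankingCriterion reduces per pair to max(0, margin - (pos - neg)), followed by a sum or mean. A numpy sketch of that reduction (illustrative only, not the op implementation):

    import numpy as np

    def margin_rank_loss(pos_score, neg_score, margin=0.1, average_loss=False):
        # max(0, -y * (pos - neg) + margin) with y == 1 for every pair
        per_pair = np.maximum(0.0, margin - (pos_score - neg_score))
        return per_pair.mean() if average_loss else per_pair.sum()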
- - assert all(record.items.metadata is not None - for record in self.input_record), \ - "Features without metadata are not supported" - - merge_dim = max(get_categorical_limit(record) - for record in self.input_record) - assert merge_dim is not None, "Unbounded features are not supported" - - self.output_schema = schema.NewRecord( - model.net, schema.List( - schema.Scalar( - np.int64, - blob=model.net.NextBlob(name), - metadata=schema.Metadata(categorical_limit=merge_dim) - ))) - - def add_ops(self, net): - return net.MergeIdLists(self.input_record.field_blobs(), - self.output_schema.field_blobs()) diff --git a/caffe2/python/layers/pairwise_similarity.py b/caffe2/python/layers/pairwise_similarity.py deleted file mode 100644 index 0cdd0259cd08..000000000000 --- a/caffe2/python/layers/pairwise_similarity.py +++ /dev/null @@ -1,96 +0,0 @@ -## @package dot_product -# Module caffe2.python.layers.dot_product - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - ModelLayer, -) - - -class PairwiseSimilarity(ModelLayer): - - def __init__(self, model, input_record, output_dim, pairwise_similarity_func='dot', - name='pairwise_similarity', **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert isinstance(input_record, schema.Struct), ( - "Incorrect input type. Expected Struct, but received: {0}". - format(input_record)) - assert ( - ('all_embeddings' in input_record) ^ - ('x_embeddings' in input_record and 'y_embeddings' in input_record) - ), ( - "either (all_embeddings) xor (x_embeddings and y_embeddings) " + - "should be given." - ) - self.pairwise_similarity_func = pairwise_similarity_func - if 'all_embeddings' in input_record: - x_embeddings = input_record['all_embeddings'] - y_embeddings = input_record['all_embeddings'] - else: - x_embeddings = input_record['x_embeddings'] - y_embeddings = input_record['y_embeddings'] - - assert isinstance(x_embeddings, schema.Scalar), ( - "Incorrect input type for x. Expected Scalar, " + - "but received: {0}".format(x_embeddings)) - assert isinstance(y_embeddings, schema.Scalar), ( - "Incorrect input type for y. Expected Scalar, " + - "but received: {0}".format(y_embeddings) - ) - - if 'indices_to_gather' in input_record: - indices_to_gather = input_record['indices_to_gather'] - assert isinstance(indices_to_gather, schema.Scalar), ( - "Incorrect type of indices_to_gather. 
" - "Expected Scalar, but received: {0}".format(indices_to_gather) - ) - self.indices_to_gather = indices_to_gather - else: - self.indices_to_gather = None - - self.x_embeddings = x_embeddings - self.y_embeddings = y_embeddings - - dtype = x_embeddings.field_types()[0].base - - self.output_schema = schema.Scalar( - (dtype, (output_dim,)), - self.get_next_blob_reference('output') - ) - - def add_ops(self, net): - if self.pairwise_similarity_func == "cosine_similarity": - x_embeddings_norm = net.Normalize(self.x_embeddings(), axis=1) - y_embeddings_norm = net.Normalize(self.y_embeddings(), axis=1) - Y = net.BatchMatMul( - [x_embeddings_norm, y_embeddings_norm], - [self.get_next_blob_reference(x_embeddings_norm + '_matmul')], - trans_b=1, - ) - elif self.pairwise_similarity_func == "dot": - Y = net.BatchMatMul( - [self.x_embeddings(), self.y_embeddings()], - [self.get_next_blob_reference(self.x_embeddings() + '_matmul')], - trans_b=1, - ) - else: - raise NotImplementedError( - "pairwise_similarity_func={} is not valid".format( - self.pairwise_similarity_func - ) - ) - - if self.indices_to_gather: - flattened = net.Flatten( - Y, Y + '_flatten', - ) - net.BatchGather( - [flattened, self.indices_to_gather()], - self.output_schema(), - ) - else: - net.Flatten(Y, self.output_schema()) diff --git a/caffe2/python/layers/position_weighted.py b/caffe2/python/layers/position_weighted.py deleted file mode 100644 index d2c917ed0243..000000000000 --- a/caffe2/python/layers/position_weighted.py +++ /dev/null @@ -1,64 +0,0 @@ -## @package position_weighted -# Module caffe2.python.layers.position_weighted - - - - - -import logging -import numpy as np - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - get_categorical_limit, - ModelLayer, -) - -from caffe2.python.layers.tags import Tags - -logger = logging.getLogger(__name__) - - -class PositionWeighted(ModelLayer): - def __init__(self, model, input_record, weight_optim=None, - name="position_weights"): - super().__init__(model, name, input_record) - - assert isinstance(input_record, schema.List), "Incorrect input type" - length_metadata = input_record.lengths.metadata - max_length = (length_metadata.categorical_limit if length_metadata is - not None else None) - if max_length is not None: - self.shape = max_length - else: - self.shape = get_categorical_limit(input_record) - logger.warning( - '{}: categorical_limit of lengths is not available, using ' - 'categorical_limit of the keys: {}'.format( - str(input_record.lengths()), self.shape)) - - self.pos_w = self.create_param(param_name='pos_w', - shape=[self.shape, ], - initializer=('ConstantFill', {'value': 1.0}), - optimizer=weight_optim) - - self.output_schema = schema.Struct( - ('position_weights', - schema.Scalar((np.float32, self.shape), - self.get_next_blob_reference("pos_w_gather"))) - ) - - self.tags.update({Tags.HANDLE_AS_SPARSE_LAYER}) - - def get_memory_usage(self): - return self.shape - - def add_ops(self, net): - inc_seq = net.LengthsRangeFill( - [self.input_record.lengths()], - self.input_record.lengths() + '_pos_w_seq' - ) - - net.Gather( - [self.pos_w, inc_seq], - self.output_schema.position_weights.field_blobs()) diff --git a/caffe2/python/layers/random_fourier_features.py b/caffe2/python/layers/random_fourier_features.py deleted file mode 100644 index 350454b24977..000000000000 --- a/caffe2/python/layers/random_fourier_features.py +++ /dev/null @@ -1,90 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer - 
-import numpy as np - - -class RandomFourierFeatures(ModelLayer): - """ - Implementation of random fourier feature map for feature processing. - - Applies sqrt(2 / output_dims) * cos(wx+b), where: - output_dims is the output feature dimensions, and - wx + b applies FC using randomized, fixed weight and bias parameters - - For more information, see the original paper: - https://people.eecs.berkeley.edu/~brecht/papers/07.rah.rec.nips.pdf - - Inputs: - output_dims -- output feature dimensions - sigma -- bandwidth for the Gaussian kernel estimator - w_init -- initialization options for weight parameter - b_init -- initialization options for bias parameter - - """ - def __init__( - self, - model, - input_record, - output_dims, - sigma, # bandwidth - w_init=None, - b_init=None, - name='random_fourier_features', - **kwargs): - - super().__init__(model, name, input_record, **kwargs) - assert isinstance(input_record, schema.Scalar), "Incorrect input type" - - input_dims = input_record.field_type().shape[0] - assert input_dims >= 1, "Expected input dimensions >= 1, got %s" \ - % input_dims - self.output_dims = output_dims - assert self.output_dims >= 1, "Expected output dimensions >= 1, got %s" \ - % self.output_dims - - self.output_schema = schema.Scalar( - (np.float32, (self.output_dims, )), - self.get_next_blob_reference('output') - ) - - assert sigma > 0.0, "Expected bandwidth > 0, got %s" % sigma - - # Initialize train_init_net parameters - w_init = w_init if w_init else ( - 'GaussianFill', {'mean': 0.0, 'std': 1.0 / sigma} - ) - - b_init = b_init if b_init else ( - 'UniformFill', {'min': 0.0, 'max': 2 * np.pi} - ) - - self.w = self.create_param(param_name='w', - shape=[self.output_dims, input_dims], - initializer=w_init, - optimizer=model.NoOptim) - - self.b = self.create_param(param_name='b', - shape=[self.output_dims], - initializer=b_init, - optimizer=model.NoOptim) - - def add_ops(self, net): - # Random features: wx + b - cosine_arg = net.FC(self.input_record.field_blobs() + [self.w, self.b], - net.NextScopedBlob("cosine_arg")) - - # Apply cosine to new vectors - new_feature_vec = net.Cos([cosine_arg], - net.NextScopedBlob('new_feature_vec')) - - # Multiply each element in vector by sqrt(2/D) - scale = np.sqrt(2.0 / self.output_dims) - net.Scale([new_feature_vec], - self.output_schema.field_blobs(), - scale=scale) diff --git a/caffe2/python/layers/reservoir_sampling.py b/caffe2/python/layers/reservoir_sampling.py deleted file mode 100644 index fe7302c5045e..000000000000 --- a/caffe2/python/layers/reservoir_sampling.py +++ /dev/null @@ -1,88 +0,0 @@ -## @package reservoir_sampling -# Module caffe2.python.layers.reservoir_sampling - - - - - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer - - -class ReservoirSampling(ModelLayer): - """ - Collect samples from input record w/ reservoir sampling. If you have complex - data, use PackRecords to pack it before using this layer. - - This layer is not thread safe. 
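The ReservoirSampling operator used below is implemented in C++ and is not part of this diff; its semantics correspond to the classic Algorithm R. A Python sketch for intuition (illustrative only):

    import random

    def reservoir_sample(stream, num_to_collect, rng=random):
        reservoir, num_visited = [], 0
        for item in stream:
            if num_visited < num_to_collect:
                reservoir.append(item)
            else:
                # Keep the new item with probability
                # num_to_collect / (num_visited + 1).
                j = rng.randrange(num_visited + 1)
                if j < num_to_collect:
                    reservoir[j] = item
            num_visited += 1
        return reservoir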
- """ - - def __init__(self, model, input_record, num_to_collect, - name='reservoir_sampling', **kwargs): - super().__init__(model, name, input_record, **kwargs) - assert num_to_collect > 0 - self.num_to_collect = num_to_collect - - self.reservoir = self.create_param( - param_name='reservoir', - shape=[0], - initializer=('ConstantFill',), - optimizer=model.NoOptim, - ) - self.num_visited_blob = self.create_param( - param_name='num_visited', - shape=[], - initializer=('ConstantFill', { - 'value': 0, - 'dtype': core.DataType.INT64, - }), - optimizer=model.NoOptim, - ) - self.mutex = self.create_param( - param_name='mutex', - shape=[], - initializer=('CreateMutex',), - optimizer=model.NoOptim, - ) - - self.extra_input_blobs = [] - self.extra_output_blobs = [] - if 'object_id' in input_record: - object_to_pos = self.create_param( - param_name='object_to_pos', - shape=None, - initializer=('CreateMap', { - 'key_dtype': core.DataType.INT64, - 'valued_dtype': core.DataType.INT32, - }), - optimizer=model.NoOptim, - ) - pos_to_object = self.create_param( - param_name='pos_to_object', - shape=[0], - initializer=('ConstantFill', { - 'value': 0, - 'dtype': core.DataType.INT64, - }), - optimizer=model.NoOptim, - ) - self.extra_input_blobs.append(input_record.object_id()) - self.extra_input_blobs.extend([object_to_pos, pos_to_object]) - self.extra_output_blobs.extend([object_to_pos, pos_to_object]) - - self.output_schema = schema.Struct( - ( - 'reservoir', - schema.from_blob_list(input_record.data, [self.reservoir]) - ), - ('num_visited', schema.Scalar(blob=self.num_visited_blob)), - ('mutex', schema.Scalar(blob=self.mutex)), - ) - - def add_ops(self, net): - net.ReservoirSampling( - [self.reservoir, self.num_visited_blob, self.input_record.data(), - self.mutex] + self.extra_input_blobs, - [self.reservoir, self.num_visited_blob] + self.extra_output_blobs, - num_to_collect=self.num_to_collect, - ) diff --git a/caffe2/python/layers/sampling_train.py b/caffe2/python/layers/sampling_train.py deleted file mode 100644 index ac63dc054442..000000000000 --- a/caffe2/python/layers/sampling_train.py +++ /dev/null @@ -1,69 +0,0 @@ -## @package sampling_train -# Module caffe2.python.layers.sampling_train - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ModelLayer, get_layer_class -from caffe2.python.layers.sampling_trainable_mixin import SamplingTrainableMixin - - -class SamplingTrain(ModelLayer): - def __init__( - self, - model, - input_record, - prediction_layer, - output_dims, - subtract_log_odd=True, - name='sampling_train', - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - layer_class = get_layer_class(prediction_layer) - assert issubclass(layer_class, SamplingTrainableMixin) - - assert 'indices' in input_record - assert isinstance(input_record.indices, schema.Scalar),\ - "input_record.indices is expected to be a schema.Scalar" - assert 'input' in input_record - - self.subtract_log_odd = subtract_log_odd - if self.subtract_log_odd: - assert 'sampling_prob' in input_record - - self._prediction_layer = layer_class( - model, - input_record.input, - output_dims=output_dims, - **kwargs - ) - - self._prediction_layer.train_param_blobs = [ - model.net.NextBlob(str(blob) + '_sampled') - for blob in self._prediction_layer.param_blobs - ] - - self.params = self._prediction_layer.params - - self.output_schema = self._prediction_layer.output_schema - - def add_ops(self, net): - self._prediction_layer.add_ops(net) - - def add_train_ops(self, net): - for full_blob, 
sampled_blob in zip( - self._prediction_layer.param_blobs, - self._prediction_layer.train_param_blobs - ): - net.Gather([full_blob, self.input_record.indices()], sampled_blob) - self._prediction_layer.add_train_ops(net) - if not self.subtract_log_odd: - return - log_q = net.Log(self.input_record.sampling_prob(), - net.NextScopedBlob("log_q")) - net.Sub([self.output_schema(), log_q], self.output_schema(), - broadcast=1, use_grad_hack=1) diff --git a/caffe2/python/layers/sampling_trainable_mixin.py b/caffe2/python/layers/sampling_trainable_mixin.py deleted file mode 100644 index fdfbcb9e8ff4..000000000000 --- a/caffe2/python/layers/sampling_trainable_mixin.py +++ /dev/null @@ -1,53 +0,0 @@ -## @package sampling_trainable_mixin -# Module caffe2.python.layers.sampling_trainable_mixin - - - - - -import abc - - -class SamplingTrainableMixin(metaclass=abc.ABCMeta): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self._train_param_blobs = None - self._train_param_blobs_frozen = False - - @property - @abc.abstractmethod - def param_blobs(self): - """ - List of parameter blobs for prediction net - """ - pass - - @property - def train_param_blobs(self): - """ - If train_param_blobs is not set before used, default to param_blobs - """ - if self._train_param_blobs is None: - self.train_param_blobs = self.param_blobs - return self._train_param_blobs - - @train_param_blobs.setter - def train_param_blobs(self, blobs): - assert not self._train_param_blobs_frozen - assert blobs is not None - self._train_param_blobs_frozen = True - self._train_param_blobs = blobs - - @abc.abstractmethod - def _add_ops(self, net, param_blobs): - """ - Add ops to the given net, using the given param_blobs - """ - pass - - def add_ops(self, net): - self._add_ops(net, self.param_blobs) - - def add_train_ops(self, net): - self._add_ops(net, self.train_param_blobs) diff --git a/caffe2/python/layers/select_record_by_context.py b/caffe2/python/layers/select_record_by_context.py deleted file mode 100644 index e691cbce57a0..000000000000 --- a/caffe2/python/layers/select_record_by_context.py +++ /dev/null @@ -1,76 +0,0 @@ - - - - - -import logging - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - InstantiationContext, - ModelLayer, -) - - -logger = logging.getLogger(__name__) - - -class SelectRecordByContext(ModelLayer): - """ - Allowing model to follow different paths for each instantiation context and - join later at some point. 
The implementation use `Alias` because schema - sometimes clone fields internally so we need static blob name for output - """ - - def __init__( - self, - model, - input_record, - name='select_record_by_context', - check_field_metas=True, - use_copy=False, - default_output_record_field=None, - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert isinstance(input_record, schema.Struct) - assert len(input_record) > 1 - - self.use_copy = use_copy - self.default_output_record = ( - input_record[default_output_record_field] - if (default_output_record_field is not None) else None - ) - ref_record = input_record[0] - for record in input_record: - assert schema.equal_schemas(record, ref_record, - check_field_metas=check_field_metas) - - self.output_schema = schema.NewRecord(model.net, ref_record) - - def _set_output_blobs(self, net, context): - record = self.input_record.get(context, self.default_output_record) - assert record is not None, ( - "{} context is not in input record without providing default" - " output".format(context) - ) - for in_blob, out_blob in zip( - record.field_blobs(), self.output_schema.field_blobs() - ): - if self.use_copy: - net.Copy(in_blob, out_blob) - else: - net.Alias(in_blob, out_blob) - - def add_ops(self, net): - self._set_output_blobs(net, InstantiationContext.PREDICTION) - - def add_eval_ops(self, net): - self._set_output_blobs(net, InstantiationContext.EVAL) - - def add_train_ops(self, net): - self._set_output_blobs(net, InstantiationContext.TRAINING) - - def add_ops_to_accumulate_pred(self, net): - self._set_output_blobs(net, InstantiationContext.ACCUMULATE_PRED) diff --git a/caffe2/python/layers/semi_random_features.py b/caffe2/python/layers/semi_random_features.py deleted file mode 100644 index 0df5ce4190fe..000000000000 --- a/caffe2/python/layers/semi_random_features.py +++ /dev/null @@ -1,144 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.arc_cosine_feature_map import ArcCosineFeatureMap -import numpy as np - - -class SemiRandomFeatures(ArcCosineFeatureMap): - """ - Implementation of the semi-random kernel feature map. - - Applies H(x_rand) * x_rand^s * x_learned, where - H is the Heaviside step function, - x_rand is the input after applying FC with randomized parameters, - and x_learned is the input after applying FC with learnable parameters. - - If using multilayer model with semi-random layers, then input and output records - should have a 'full' and 'random' Scalar. The random Scalar will be passed as - input to process the random features. 
- - For more information, see the original paper: - https://arxiv.org/pdf/1702.08882.pdf - - Inputs : - output_dims -- dimensions of the output vector - s -- if s == 0, will obtain linear semi-random features; - else if s == 1, will obtain squared semi-random features; - else s >= 2, will obtain higher order semi-random features - scale_random -- amount to scale the standard deviation - (for random parameter initialization when weight_init or - bias_init hasn't been specified) - scale_learned -- amount to scale the standard deviation - (for learned parameter initialization when weight_init or - bias_init hasn't been specified) - - weight_init_random -- initialization distribution for random weight parameter - (if None, will use Gaussian distribution) - bias_init_random -- initialization distribution for random bias pararmeter - (if None, will use Uniform distribution) - weight_init_learned -- initialization distribution for learned weight parameter - (if None, will use Gaussian distribution) - bias_init_learned -- initialization distribution for learned bias pararmeter - (if None, will use Uniform distribution) - weight_optim -- optimizer for weight params for learned features - bias_optim -- optimizer for bias param for learned features - - set_weight_as_global_constant -- if True, initialized random parameters - will be constant across all distributed - instances of the layer - """ - def __init__( - self, - model, - input_record, - output_dims, - s=1, - scale_random=1.0, - scale_learned=1.0, - weight_init_random=None, - bias_init_random=None, - weight_init_learned=None, - bias_init_learned=None, - weight_optim=None, - bias_optim=None, - set_weight_as_global_constant=False, - name='semi_random_features', - **kwargs): - - if isinstance(input_record, schema.Struct): - schema.is_schema_subset( - schema.Struct( - ('full', schema.Scalar()), - ('random', schema.Scalar()), - ), - input_record - ) - self.input_record_full = input_record.full - self.input_record_random = input_record.random - - elif isinstance(input_record, schema.Scalar): - self.input_record_full = input_record - self.input_record_random = input_record - - super().__init__( - model, - self.input_record_full, - output_dims, - s=s, - scale=scale_random, # To initialize the random parameters - weight_init=weight_init_random, - bias_init=bias_init_random, - weight_optim=None, - bias_optim=None, - set_weight_as_global_constant=set_weight_as_global_constant, - initialize_output_schema=False, - name=name, - **kwargs) - - self.output_schema = schema.Struct( - ('full', schema.Scalar( - (np.float32, output_dims), - model.net.NextScopedBlob(name + '_full_output') - ),), - ('random', schema.Scalar( - (np.float32, output_dims), - model.net.NextScopedBlob(name + '_random_output') - ),), - ) - - # To initialize the learnable parameters - assert (scale_learned > 0.0), \ - "Expected scale (learned) > 0, got %s" % scale_learned - self.stddev = scale_learned * np.sqrt(1.0 / self.input_dims) - - # Learned Parameters - (self.learned_w, self.learned_b) = self._initialize_params( - 'learned_w', - 'learned_b', - w_init=weight_init_learned, - b_init=bias_init_learned, - w_optim=weight_optim, - b_optim=bias_optim - ) - - def add_ops(self, net): - # Learned features: wx + b - learned_features = net.FC(self.input_record_full.field_blobs() + - [self.learned_w, self.learned_b], - net.NextScopedBlob('learned_features')) - # Random features: wx + b - random_features = net.FC(self.input_record_random.field_blobs() + - [self.random_w, self.random_b], - 
net.NextScopedBlob('random_features')) - processed_random_features = self._heaviside_with_power( - net, - random_features, - self.output_schema.random.field_blobs(), - self.s - ) - net.Mul([processed_random_features, learned_features], - self.output_schema.full.field_blobs()) diff --git a/caffe2/python/layers/sparse_dropout_with_replacement.py b/caffe2/python/layers/sparse_dropout_with_replacement.py deleted file mode 100644 index e7df3b495032..000000000000 --- a/caffe2/python/layers/sparse_dropout_with_replacement.py +++ /dev/null @@ -1,98 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - IdList, - ModelLayer, -) - -# Model layer for implementing probabilistic replacement of elements in -# IdLists. Takes probabilities for train, eval and predict nets as input, as -# well as the replacement value when dropout happens. For features we may have -# available to us in train net but not in predict net, we'd set dropout -# probability for predict net to be 1.0 and set the feature to the replacement -# value given here. This way, the value is tied to the particular model and not -# to any specific logic in feature processing in serving. - -# Consider the following example where X is the values in the IdList and Lengths -# is the number of values corresponding to each example. -# X: [1, 2, 3, 4, 5] -# Lengths: [2, 3] -# This IdList contains 2 items of lengths 2, 3. Let's assume we used a ratio of -# 0.5 and ended up dropping out 2nd example, and used a replacement value of -1. -# We will end up with the following IdList. -# -# Y: [1, 2, -1] -# OutputLengths: [2, 1] -# where the 2nd item values [3,4,5] were replaced with [-1] and the length got -# set to 1. - -class SparseDropoutWithReplacement(ModelLayer): - def __init__( - self, - model, - input_record, - dropout_prob_train, - dropout_prob_eval, - dropout_prob_predict, - replacement_value, - name='sparse_dropout', - **kwargs): - - super().__init__(model, name, input_record, **kwargs) - assert schema.equal_schemas(input_record, IdList), "Incorrect input type" - - self.dropout_prob_train = float(dropout_prob_train) - self.dropout_prob_eval = float(dropout_prob_eval) - self.dropout_prob_predict = float(dropout_prob_predict) - self.replacement_value = int(replacement_value) - assert (self.dropout_prob_train >= 0 and - self.dropout_prob_train <= 1.0), \ - "Expected 0 <= dropout_prob_train <= 1, but got %s" \ - % self.dropout_prob_train - assert (self.dropout_prob_eval >= 0 and - self.dropout_prob_eval <= 1.0), \ - "Expected 0 <= dropout_prob_eval <= 1, but got %s" \ - % dropout_prob_eval - assert (self.dropout_prob_predict >= 0 and - self.dropout_prob_predict <= 1.0), \ - "Expected 0 <= dropout_prob_predict <= 1, but got %s" \ - % dropout_prob_predict - assert(self.dropout_prob_train > 0 or - self.dropout_prob_eval > 0 or - self.dropout_prob_predict > 0), \ - "Ratios all set to 0.0 for train, eval and predict" - - self.output_schema = schema.NewRecord(model.net, IdList) - if input_record.lengths.metadata: - self.output_schema.lengths.set_metadata( - input_record.lengths.metadata) - if input_record.items.metadata: - self.output_schema.items.set_metadata( - input_record.items.metadata) - - def _add_ops(self, net, ratio): - input_values_blob = self.input_record.items() - input_lengths_blob = self.input_record.lengths() - - output_lengths_blob = self.output_schema.lengths() - output_values_blob = self.output_schema.items() - - net.SparseDropoutWithReplacement([input_values_blob, - input_lengths_blob], - 
[output_values_blob, - output_lengths_blob], - ratio=ratio, - replacement_value=self.replacement_value) - - def add_train_ops(self, net): - self._add_ops(net, self.dropout_prob_train) - - def add_eval_ops(self, net): - self._add_ops(net, self.dropout_prob_eval) - - def add_ops(self, net): - self._add_ops(net, self.dropout_prob_predict) diff --git a/caffe2/python/layers/sparse_feature_hash.py b/caffe2/python/layers/sparse_feature_hash.py deleted file mode 100644 index 4b7f29a6a661..000000000000 --- a/caffe2/python/layers/sparse_feature_hash.py +++ /dev/null @@ -1,114 +0,0 @@ -## @package sparse_feature_hash -# Module caffe2.python.layers.sparse_feature_hash - - - - - -from caffe2.python import schema, core -from caffe2.python.layers.layers import ( - ModelLayer, - IdList, - IdScoreList, -) -from caffe2.python.layers.tags import ( - Tags -) - -import numpy as np - - -class SparseFeatureHash(ModelLayer): - - def __init__(self, model, input_record, seed=0, modulo=None, - use_hashing=True, use_divide_mod=False, divisor=None, name='sparse_feature_hash', **kwargs): - super().__init__(model, name, input_record, **kwargs) - - assert use_hashing + use_divide_mod < 2, "use_hashing and use_divide_mod cannot be set true at the same time." - - if use_divide_mod: - assert divisor >= 1, 'Unexpected divisor: {}'.format(divisor) - - self.divisor = self.create_param(param_name='divisor', - shape=[1], - initializer=('GivenTensorInt64Fill', {'values': np.array([divisor])}), - optimizer=model.NoOptim) - - self.seed = seed - self.use_hashing = use_hashing - self.use_divide_mod = use_divide_mod - - if schema.equal_schemas(input_record, IdList): - self.modulo = modulo or self.extract_hash_size(input_record.items.metadata) - metadata = schema.Metadata( - categorical_limit=self.modulo, - feature_specs=input_record.items.metadata.feature_specs if input_record.items.metadata else None, - expected_value=input_record.items.metadata.expected_value if input_record.items.metadata else None - ) - with core.NameScope(name): - self.output_schema = schema.NewRecord(model.net, IdList) - self.output_schema.items.set_metadata(metadata) - - elif schema.equal_schemas(input_record, IdScoreList): - self.modulo = modulo or self.extract_hash_size(input_record.keys.metadata) - metadata = schema.Metadata( - categorical_limit=self.modulo, - feature_specs=input_record.keys.metadata.feature_specs, - expected_value=input_record.keys.metadata.expected_value - ) - with core.NameScope(name): - self.output_schema = schema.NewRecord(model.net, IdScoreList) - self.output_schema.keys.set_metadata(metadata) - - else: - assert False, "Input type must be one of (IdList, IdScoreList)" - - assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo) - if input_record.lengths.metadata: - self.output_schema.lengths.set_metadata(input_record.lengths.metadata) - - # operators in this layer do not have CUDA implementation yet. - # In addition, since the sparse feature keys that we are hashing are - # typically on CPU originally, it makes sense to have this layer on CPU. 
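In effect, the constructor above picks one of two strategies for remapping raw ids into [0, modulo): hash-then-mod via IndexHash, or an optional integer divide followed by Mod. A rough Python sketch of the intent (the real IndexHash is a seeded C++ hash, so Python's hash below is only a stand-in):

    def remap_id(raw_id, modulo, seed=0, use_hashing=True,
                 use_divide_mod=False, divisor=None):
        if use_hashing:
            return hash((seed, raw_id)) % modulo  # stand-in for IndexHash
        if use_divide_mod:
            raw_id //= divisor                    # Div by the divisor blob
        return raw_id % modulo                    # Mod, sign_follow_divisor=True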
- self.tags.update([Tags.CPU_ONLY]) - - def extract_hash_size(self, metadata): - if metadata.feature_specs and metadata.feature_specs.desired_hash_size: - return metadata.feature_specs.desired_hash_size - elif metadata.categorical_limit is not None: - return metadata.categorical_limit - else: - assert False, "desired_hash_size or categorical_limit must be set" - - def add_ops(self, net): - net.Copy( - self.input_record.lengths(), - self.output_schema.lengths() - ) - if schema.equal_schemas(self.output_schema, IdList): - input_blob = self.input_record.items() - output_blob = self.output_schema.items() - elif schema.equal_schemas(self.output_schema, IdScoreList): - input_blob = self.input_record.keys() - output_blob = self.output_schema.keys() - net.Copy( - self.input_record.values(), - self.output_schema.values() - ) - else: - raise NotImplementedError() - - if self.use_hashing: - net.IndexHash( - input_blob, output_blob, seed=self.seed, modulo=self.modulo - ) - else: - if self.use_divide_mod: - quotient = net.Div([input_blob, self.divisor], [net.NextScopedBlob('quotient')]) - net.Mod( - quotient, output_blob, divisor=self.modulo, sign_follow_divisor=True - ) - else: - net.Mod( - input_blob, output_blob, divisor=self.modulo, sign_follow_divisor=True - ) diff --git a/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py b/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py deleted file mode 100644 index 8fa5ce0128b3..000000000000 --- a/caffe2/python/layers/sparse_itemwise_dropout_with_replacement.py +++ /dev/null @@ -1,103 +0,0 @@ - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - IdList, - ModelLayer, -) - -# Model layer for implementing probabilistic replacement of individual elements in -# IdLists. Takes probabilities for train, eval and predict nets as input, as -# well as the replacement value when dropout happens. For features we may have -# available to us in train net but not in predict net, we'd set dropout -# probability for predict net to be 1.0 and set the feature to the replacement -# value given here. This way, the value is tied to the particular model and not -# to any specific logic in feature processing in serving. - -# Consider the following example where X is the values in the IdList and Lengths -# is the number of values corresponding to each example. -# X: [1, 2, 3, 4, 5] -# Lengths: [2, 3] -# This IdList contains 2 IdList features of lengths 2, 3. Let's assume we used a -# ratio of 0.5 and ended up dropping out 2nd item in 2nd IdList feature, and used a -# replacement value of -1. We will end up with the following IdList. - -# Y: [1, 2, 3, -1, 5] -# OutputLengths: [2, 3] -# where the 2nd item in 2nd IdList feature [4] was replaced with [-1]. 
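A plain-Python sketch of the itemwise semantics described above (illustrative only; the real SparseItemwiseDropoutWithReplacement kernel draws its Bernoulli samples in C++):

    import random

    def sparse_itemwise_dropout(values, lengths, ratio, replacement_value,
                                rng=random):
        # Each individual id is replaced with probability `ratio`;
        # lengths are preserved, unlike the per-example variant above.
        out_values = [replacement_value if rng.random() < ratio else v
                      for v in values]
        return out_values, list(lengths)

    # e.g. sparse_itemwise_dropout([1, 2, 3, 4, 5], [2, 3], 0.5, -1)
    # may return ([1, 2, 3, -1, 5], [2, 3])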
- -class SparseItemwiseDropoutWithReplacement(ModelLayer): - def __init__( - self, - model, - input_record, - dropout_prob_train, - dropout_prob_eval, - dropout_prob_predict, - replacement_value, - name='sparse_itemwise_dropout', - **kwargs): - - super().__init__(model, name, input_record, **kwargs) - assert schema.equal_schemas(input_record, IdList), "Incorrect input type" - - self.dropout_prob_train = float(dropout_prob_train) - self.dropout_prob_eval = float(dropout_prob_eval) - self.dropout_prob_predict = float(dropout_prob_predict) - self.replacement_value = int(replacement_value) - assert (self.dropout_prob_train >= 0 and - self.dropout_prob_train <= 1.0), \ - "Expected 0 <= dropout_prob_train <= 1, but got %s" \ - % self.dropout_prob_train - assert (self.dropout_prob_eval >= 0 and - self.dropout_prob_eval <= 1.0), \ - "Expected 0 <= dropout_prob_eval <= 1, but got %s" \ - % dropout_prob_eval - assert (self.dropout_prob_predict >= 0 and - self.dropout_prob_predict <= 1.0), \ - "Expected 0 <= dropout_prob_predict <= 1, but got %s" \ - % dropout_prob_predict - assert(self.dropout_prob_train > 0 or - self.dropout_prob_eval > 0 or - self.dropout_prob_predict > 0), \ - "Ratios all set to 0.0 for train, eval and predict" - - self.output_schema = schema.NewRecord(model.net, IdList) - if input_record.lengths.metadata: - self.output_schema.lengths.set_metadata( - input_record.lengths.metadata) - if input_record.items.metadata: - self.output_schema.items.set_metadata( - input_record.items.metadata) - - def _add_ops(self, net, ratio): - input_values_blob = self.input_record.items() - input_lengths_blob = self.input_record.lengths() - - output_lengths_blob = self.output_schema.lengths() - output_values_blob = self.output_schema.items() - - net.SparseItemwiseDropoutWithReplacement( - [ - input_values_blob, - input_lengths_blob - ], - [ - output_values_blob, - output_lengths_blob - ], - ratio=ratio, - replacement_value=self.replacement_value - ) - - def add_train_ops(self, net): - self._add_ops(net, self.dropout_prob_train) - - def add_eval_ops(self, net): - self._add_ops(net, self.dropout_prob_eval) - - def add_ops(self, net): - self._add_ops(net, self.dropout_prob_predict) diff --git a/caffe2/python/layers/sparse_lookup.py b/caffe2/python/layers/sparse_lookup.py deleted file mode 100644 index cff997152e5d..000000000000 --- a/caffe2/python/layers/sparse_lookup.py +++ /dev/null @@ -1,557 +0,0 @@ -## @package sparse_lookup -# Module caffe2.python.layers.sparse_lookup - - - - - -from caffe2.python.optimizer import FP16_ENGINES, Optimizer -from caffe2.python.helpers.arg_scope import get_current_scope -from caffe2.python import schema -from caffe2.python.layers.layers import ( - get_categorical_limit, - get_key, - IdList, - IdScoreList, - IdListWithEvicted, - IdScoreListWithEvicted, - LayerPsParam, - ModelLayer, - almost_equal_schemas, -) -import collections -import functools -import logging -import math -import numpy as np -import operator - -logger = logging.getLogger(__name__) - - -def get_trainer_version_based_on_optim(optim_def): - if isinstance(optim_def, Optimizer) and hasattr(optim_def, "engine"): - logger.info( - "Attempting to set trainer version for engine {}".format(optim_def.engine) - ) - if optim_def.engine in FP16_ENGINES: - logger.info("Setting FP16 trainer for engine {}".format(optim_def.engine)) - return "fp16" - else: - logger.info("Setting FP32 trainer for engine {}".format(optim_def.engine)) - return "fp32" - else: - return "fp32" - - -def get_sparse_lookup_predictor_version( - 
version, - blob_size=None, - min_blob_size_4bits=None, - embedding_dim=None, - sparse_feature_name=None, -): - assert version in { - 'fp32', 'fp16', 'uint8rowwise', 'fused_uint8rowwise', 'fused_uint4rowwise' - }, "Unexpected version of sparse_lookup layer {0}".format(version) - if version == 'fused_uint4rowwise': - if ( - blob_size is not None - and min_blob_size_4bits is not None - and embedding_dim is not None - ): - if blob_size < min_blob_size_4bits: - logger.info( - "{} fall back to uint8 because lookup table size {} < min_blob_size_4bits {}".format( - sparse_feature_name, - blob_size, - min_blob_size_4bits, - ) - ) - version = 'fused_uint8rowwise' - - if embedding_dim % 2 == 1: - logger.info( - "{} fall back to uint8 because lookup table dimension {} is not divisible by 2".format( - sparse_feature_name, embedding_dim - ) - ) - version = 'fused_uint8rowwise' - else: - raise ValueError( - ( - "When 4 bit quantization is enabled for {}, " - "(i.e., Sparse lookup predictor version:{}), " - "requires arguments blob_size:{}, " - "min_blob_size_4bits:{}, embedding_dim:{}" - ).format( - sparse_feature_name, - version, - blob_size, - min_blob_size_4bits, - embedding_dim - ) - ) - return version - - -def get_sparse_lookup_trainer_version(version): - assert version in {'fp32', 'fp16'},\ - "Unexpected version of sparse_lookup layer {0}".format(version) - return version - -def _is_id_list(input_record): - return almost_equal_schemas(input_record, IdList) - - -def _is_id_score_list(input_record): - return almost_equal_schemas(input_record, - IdScoreList, - check_field_types=False) - - -class SparseLookup(ModelLayer): - _id_list_supported_reducers = [ - 'LogMeanExp', 'LogSumExp', 'Max', 'Mean', 'Sum', - 'WeightedSum', 'WeightedMean', 'Sqrt', 'None'] - - _id_score_list_supported_reducers = [ - 'PositionWeighted', 'RecencyWeighted', 'Mean', 'Sum', 'WeightedSum', - 'WeightedMean', 'None' - ] - - _fp16_compatible_init_op_types = [ - 'Float16UniformFill' - ] - - _fp16_compatible_reducers = [ - 'Sum', 'Mean', 'Sqrt', 'PositionWeighted', 'RecencyWeighted', - ] - - def __init__(self, model, input_record, inner_shape, reducer, - weight_init=None, weight_optim=None, - name='sparse_lookup', regularizer=None, use_external_weights=False, - uniform_weight_init_scale_numerator=1.0, **kwargs): - - super().__init__(model, name, input_record, **kwargs) - - self.sparse_key = get_key(self.input_record)() - logger.info("Setup the sparse lookup layer for " + self.sparse_key) - - # TODO Add some asserts about input type - if isinstance(inner_shape, int): - inner_shape = [inner_shape] - assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\ - "Unexpected type for inner_shape, expected list or tuple, got {0} for {1}".\ - format(type(inner_shape), self.sparse_key) - - if reducer == "PositionWeighted": - assert _is_id_score_list(self.input_record), ( - "PositionWeighted only support IdScoreList, but got {} for {}" - + "please use PositionWeighted layer to convert IdList " - + "to IdScoreList" - ).format(repr(self.input_record), self.sparse_key) - self.external_weights = self.input_record.values() - - elif reducer == "RecencyWeighted": - assert _is_id_score_list(self.input_record), ( - "RecencyWeighted only supports IdScoreList, " - "while the sparse feature {} is not.".format(self.sparse_key) - ) - self.external_weights = self.input_record.values() - # TODO: create a new type of reducer with external weights to wrap - # this and the above two cases since essentially their input formats - # are the same. 
-        elif use_external_weights:
-            assert _is_id_score_list(self.input_record), (
-                "use_external_weights only supports IdScoreList, "
-                "while the sparse feature {} is not.".format(self.sparse_key)
-            )
-            assert reducer in ["Sum", "WeightedSum"], (
-                "use_external_weights only supports the Sum and WeightedSum "
-                "reducers, but got {}.".format(reducer)
-            )
-            self.external_weights = self.input_record.values()
-        self.reducer = reducer
-        self.use_external_weights = use_external_weights
-
-        input_dim = get_categorical_limit(self.input_record)
-        assert input_dim > 0, "{} should have categorical limit > 0, but got {}".format(
-            self.sparse_key, input_dim
-        )
-
-        self.input_dim = input_dim
-        self.shape = [input_dim] + inner_shape
-
-        self.trainer_version = get_trainer_version_based_on_optim(
-            weight_optim
-        )
-
-        self.uniform_weight_init_scale_numerator = uniform_weight_init_scale_numerator
-        default_init_op = self._get_default_init_op()
-
-        self.weight_init = weight_init or default_init_op
-
-        self.evicted_values = None
-        if schema.equal_schemas(
-            self.input_record, IdListWithEvicted
-        ) or schema.equal_schemas(
-            self.input_record, IdScoreListWithEvicted, check_field_types=False
-        ):
-            self.evicted_values = self.input_record._evicted_values
-
-        # If fp16 is used, make sure an fp16 init op is used
-        if self.trainer_version == "fp16":
-            assert self.reducer in self._fp16_compatible_reducers or use_external_weights, (
-                "Fp16 training is enabled. The reducer specified is not supported. "
-                "Got {}. Supported reducers: {}. Right now, in general, sum, mean, and "
-                "positional pooling are supported. Attention is not. Please check "
-                "whether any fp16-trained sparse features use advanced pooling.".format(
-                    self.reducer, self._fp16_compatible_reducers)
-            )
-
-            # if the init op is UniformFill, we replace it directly
-            if self.weight_init[0] == "UniformFill":
-                self.weight_init = ("Float16UniformFill", self.weight_init[1])
-            assert self.weight_init[0] in self._fp16_compatible_init_op_types, (
-                "Fp16 training is enabled. Init op for the weight parameter must be fp16 "
-                "compatible. Got {}. Supported ops: {}".format(
-                    self.weight_init[0],
-                    self._fp16_compatible_init_op_types)
-            )
-
-            assert regularizer is None, "Regularizer is not compatible with fp16"
-
-        if self.input_record.lengths.metadata:
-            avg_length = self.input_record.lengths.metadata.expected_value
-        else:
-            avg_length = None
-
-        self.w = self.create_param(
-            param_name='w',
-            shape=self.shape,
-            initializer=self.weight_init,
-            optimizer=weight_optim,
-            ps_param=LayerPsParam(
-                sparse_key=self.sparse_key,
-                average_length=avg_length),
-            regularizer=regularizer
-        )
-        if self.evicted_values:
-            self.reinit_vec = self.create_param(
-                param_name="reinit_vec",
-                shape=inner_shape,
-                initializer=self.weight_init,
-                optimizer=model.NoOptim,
-                regularizer=None,
-            )
-
-        self.scale_bias_init = ('ConstantFill', {'value': 0.0})
-
-        self.scale_bias = self.create_param(
-            param_name='scale_bias',
-            shape=[],
-            initializer=self.scale_bias_init,
-            optimizer=model.NoOptim,
-        )
-
-        self.output_schema = schema.Scalar(
-            (np.float32, inner_shape),
-            self.get_next_blob_reference('output'),
-        )
-
-    def get_memory_usage(self):
-        return functools.reduce(operator.mul, self.shape) * 4
-
-    def get_fp16_compatible_parameters(self):
-        return [self.w]
-
-    def support_8bit(self):
-        # Rowwise quantization only makes sense if the shape is a 2D matrix
-        # with second dimension >= 8
-        if len(self.shape) != 2 or self.shape[1] < 8:
-            return False
-        return True
-
-    def get_8bits_compatible_parameters(self, fused=True):
-        if not self.support_8bit():
-            return []
-        if fused:
-            RowwiseQuantized8BitsWeight = collections.namedtuple(
-                'RowwiseQuantized8BitsWeight', 'w'
-            )
-            return [RowwiseQuantized8BitsWeight(self.w)]
-        else:
-            RowwiseQuantized8BitsWeight = collections.namedtuple(
-                'RowwiseQuantized8BitsWeight', 'w, scale_bias'
-            )
-            return [RowwiseQuantized8BitsWeight(self.w, self.scale_bias)]
-
-    def _get_default_init_op(self):
-        scale = math.sqrt(self.uniform_weight_init_scale_numerator / self.input_dim)
-
-        if self.trainer_version == 'fp32':
-            default_weight_init = ('UniformFill', {'min': -scale, 'max': scale})
-        elif self.trainer_version == 'fp16':
-            default_weight_init = ("Float16UniformFill", {'min': -scale, 'max': scale})
-        else:
-            raise NotImplementedError(
-                "Train version {} is not currently supported for sparse feature {}".format(
-                    self.trainer_version, self.sparse_key
-                )
-            )
-
-        return default_weight_init
-
-    def _gather_wrapper(self, net, version, in_indices, out):
-        # Gather can work on all kinds of input data types, and outputs
-        # data of the same type. Convert the output of Gather to float,
-        # because the follow-up Ops expect fp32.
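For reference on the quantized branches below: the fused 8-bit rowwise format stores, per row, the uint8 data followed by an fp32 scale and an fp32 bias, and dequantization is x = q * scale + bias. A numpy sketch under that layout assumption (illustrative only):

    import numpy as np

    def fused_uint8_rowwise_to_float(q):
        # q: (rows, dim + 8) uint8; the last 8 bytes of each row hold
        # an fp32 scale followed by an fp32 bias.
        data = q[:, :-8].astype(np.float32)
        scale = q[:, -8:-4].copy().view(np.float32)  # shape (rows, 1)
        bias = q[:, -4:].copy().view(np.float32)     # shape (rows, 1)
        return data * scale + bias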
-        if version == 'fp32':
-            return net.Gather([self.w, in_indices], out)
-        elif version == 'fp16':
-            gathered_w = net.Gather([self.w, in_indices], 'gathered_w')
-            return net.HalfToFloat(gathered_w, out)
-        elif version == 'uint8rowwise':
-            gathered_w = net.Gather([self.w, in_indices], 'gathered_w')
-            gathered_scale_bias = net.Gather(
-                [self.scale_bias, in_indices],
-                'gathered_scale_bias'
-            )
-
-            return net.Rowwise8BitQuantizedToFloat(
-                [gathered_w, gathered_scale_bias], out)
-        elif version == 'fused_uint8rowwise':
-            gathered_w = net.Gather([self.w, in_indices], 'gathered_w')
-            return net.Fused8BitRowwiseQuantizedToFloat(gathered_w, out)
-        elif version == 'fused_uint4rowwise':
-            gathered_w = net.Gather([self.w, in_indices], 'gathered_w')
-            return net.Fused4BitRowwiseQuantizedToFloat(gathered_w, out)
-
-        else:
-            raise ValueError(
-                "Unsupported version of operators in SparseLookup "
-                "layer: {0} for sparse feature {1}".format(
-                    version, self.sparse_key
-                )
-            )
-
-    def _sparse_lengths_weighted_reducer(
-        self,
-        in_indices,
-        weights,
-        reducer,
-        net,
-        version,
-        grad_on_weights=0,
-    ):
-        op_input = [
-            self.w,
-            weights,
-            in_indices,
-            self.input_record.lengths(),
-        ]
-        layer_name = 'SparseLengths' + reducer
-
-        if version in ['fp32', 'fp16']:
-            # SparseLengths* Ops will accept either an fp16 or fp32 embedding
-            # matrix and output an fp32 pooled embedding.
-            # A special case here is that we need the FP16 engine for
-            # SparseLengthsWeightedSum when FP16 embeddings are used, for
-            # correct backward updates
-            if reducer == "WeightedSum" and version == "fp16":
-                net.SparseLengthsWeightedSum(
-                    op_input,
-                    self.output_schema.field_blobs(),
-                    grad_on_weights=grad_on_weights,
-                    engine='FP16',
-                )
-            else:
-                net.__getattr__(layer_name)(
-                    op_input,
-                    self.output_schema.field_blobs(),
-                    grad_on_weights=grad_on_weights,
-                )
-        elif version == 'uint8rowwise':
-            op_input.insert(len(op_input), self.scale_bias)
-            net.__getattr__(layer_name + '8BitsRowwise')(
-                op_input, self.output_schema.field_blobs())
-        elif version == 'fused_uint8rowwise':
-            net.__getattr__(layer_name + 'Fused8BitRowwise')(
-                op_input, self.output_schema.field_blobs())
-        elif version == 'fused_uint4rowwise':
-            net.__getattr__(layer_name + 'Fused4BitRowwise')(
-                op_input, self.output_schema.field_blobs())
-        else:
-            raise ValueError(
-                "Unsupported version of operator in SparseLookup "
-                "layer: {0} for sparse feature {1}".format(
-                    version, self.sparse_key
-                )
-            )
-
-    # deal with sparse features of id_list type
-    def _add_ops_id_list(self, net, version):
-        assert self.reducer in self._id_list_supported_reducers, (
-            "Unsupported reducer: {} for ID_LIST {}".format(
-                self.reducer, self.sparse_key
-            )
-        )
-        if self.reducer in ['Sum', 'Mean', 'WeightedSum', 'WeightedMean']:
-            op_input = [self.w,
-                        self.input_record.items(),
-                        self.input_record.lengths()]
-
-            # For id list features, the behaviors of 'Sum' and
-            # 'WeightedSum' are identical, since we can regard the weight on
-            # each id as 1. Similarly for 'Mean' and 'WeightedMean'.
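The SparseLengths* operators used throughout this method gather rows of the embedding table and pool them per example, segmenting the flat id list by lengths. Roughly, in numpy (illustrative only):

    import numpy as np

    def sparse_lengths_pool(w, ids, lengths, reducer='Sum', weights=None):
        out, offset = [], 0
        for n in lengths:
            rows = w[ids[offset:offset + n]]               # Gather
            if weights is not None:                        # WeightedSum
                rows = rows * weights[offset:offset + n, None]
            out.append(rows.sum(axis=0) if reducer == 'Sum'
                       else rows.mean(axis=0))
            offset += n
        return np.stack(out)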
- if self.reducer == 'WeightedSum': - self.reducer = 'Sum' - elif self.reducer == 'WeightedMean': - self.reducer = 'Mean' - - layer_name = 'SparseLengths' + self.reducer - if version in ['fp32', 'fp16']: - # SparseLengths* Ops will accept either fp16 or fp32 embedding - # matrix and output fp32 pooled embedding - net.__getattr__(layer_name)( - op_input, - self.output_schema.field_blobs(), - ) - elif version == 'uint8rowwise': - op_input.insert(len(op_input), self.scale_bias) - net.__getattr__(layer_name + '8BitsRowwise')( - op_input, self.output_schema.field_blobs()) - elif version == 'fused_uint8rowwise': - net.__getattr__(layer_name + 'Fused8BitRowwise')( - op_input, self.output_schema.field_blobs()) - elif version == 'fused_uint4rowwise': - net.__getattr__(layer_name + 'Fused4BitRowwise')( - op_input, self.output_schema.field_blobs()) - else: - raise ValueError( - "Unsupported version of operator in SparseLookup " - "layer: {0} for sparse feature {1}".format( - version, self.sparse_key - ) - ) - - elif self.reducer == 'Sqrt': - sqrt_weight = net.LengthsToWeights( - [self.input_record.lengths()], - [net.NextScopedBlob('lengths_sqrt')], - power=0.5, - ) - self._sparse_lengths_weighted_reducer( - self.input_record.items(), - sqrt_weight, - 'WeightedSum', net, version) - - elif self.reducer == 'None': - # Gather operator will gather the embedding for each id of - # each IdList. - self._gather_wrapper(net, version, self.input_record.items(), - self.output_schema.field_blobs()) - - else: - table_rows = self._gather_wrapper( - net, version, self.input_record.items(), 'table_rows') - - segment_ids = net.LengthsToSegmentIds( - self.input_record.lengths(), - net.NextScopedBlob(self.input_record.lengths() + '_sid')) - net.__getattr__('SortedSegmentRange' + self.reducer)( - [table_rows, segment_ids], - self.output_schema.field_blobs(), - ) - - # deal with sparse features of id_score_list type - def _add_ops_id_score_list(self, net, version): - assert self.reducer in self._id_score_list_supported_reducers, ( - "Unsupported reducer: {} for ID_SCORE_LIST {}".format( - self.reducer, self.sparse_key - ) - ) - if self.reducer in ['WeightedSum', 'WeightedMean']: - self._sparse_lengths_weighted_reducer( - self.input_record.keys(), - self.input_record.values(), - self.reducer, net, version) - - elif self.reducer in ['PositionWeighted', 'RecencyWeighted'] or self.use_external_weights: - self._sparse_lengths_weighted_reducer( - self.input_record.keys(), - self.external_weights, - 'WeightedSum', net, version, grad_on_weights=1) - - elif self.reducer in ['Sum', 'Mean']: - op_input = [self.w, - self.input_record.keys(), - self.input_record.lengths()] - - layer_name = 'SparseLengths' + self.reducer - - if version in ['fp32', 'fp16']: - net.__getattr__(layer_name)( - op_input, - self.output_schema.field_blobs(), - ) - elif version == 'uint8rowwise': - net.__getattr__(layer_name + '8BitsRowwise')( - op_input, self.output_schema.field_blobs()) - elif version == 'fused_uint8rowwise': - net.__getattr__(layer_name + 'Fused8BitRowwise')( - op_input, self.output_schema.field_blobs()) - elif version == 'fused_uint4rowwise': - net.__getattr__(layer_name + 'Fused4BitRowwise')( - op_input, self.output_schema.field_blobs()) - else: - raise ValueError( - "Unsupported version of operator in SparseLookup " - "layer: {0} for sparse feature {1}".format( - version, self.sparse_key - ) - ) - - elif self.reducer == 'None': - # Gather operator will gather the embedding for each id (key) of - # each IdScoreList.
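As a numpy sketch of what the 'None' reducer yields (illustrative only), the output keeps one embedding row per id, with no pooling over the segment:

    import numpy as np

    w = np.random.rand(10, 4).astype(np.float32)   # 10 x 4 embedding table
    keys = np.array([7, 0, 7])                     # ids drawn from the id_score_list

    out = w[keys]                  # shape (3, 4): one row per id, duplicates kept
    assert out.shape == (len(keys), 4)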
- self._gather_wrapper(net, version, self.input_record.keys(), - self.output_schema.field_blobs()) - else: - raise ValueError( - "Only Sum, Mean, None are supported for IdScoreList input. " - "Trying to create with {} for sparse feature {}".format( - self.reducer, self.sparse_key - ) - ) - - def _add_ops(self, net, version='fp32', is_train=True): - if self.evicted_values and is_train: - net.CopyRowsToTensor( - [self.w, self.evicted_values.get(), self.reinit_vec], [self.w]) - if _is_id_list(self.input_record): - self._add_ops_id_list(net, version=version) - elif _is_id_score_list(self.input_record): - self._add_ops_id_score_list(net, version=version) - else: - raise ValueError("Unsupported input type {0}".format(self.input_record)) - - def add_train_ops(self, net): - self._add_ops(net, self.trainer_version, is_train=True) - - def add_ops(self, net): - version_info = get_current_scope().get( - get_sparse_lookup_predictor_version.__name__, {'version': 'fp32'} - ) - lookup_table_blob_size = self.shape[0] * self.shape[1] - version = get_sparse_lookup_predictor_version( - version_info['version'], - blob_size=lookup_table_blob_size, - min_blob_size_4bits=( - version_info['min_blob_size_4bits'] - if 'min_blob_size_4bits' in version_info - else None - ), - embedding_dim=self.shape[1], - sparse_feature_name=self.sparse_key, - ) - - # TODO(amalevich): Layer should not be responsible for decision about - # quantization. - if not self.support_8bit() and version in {'uint8rowwise', - 'fused_uint8rowwise', - 'fused_uint4rowwise'}: - version = 'fp16' - - self._add_ops(net, version, is_train=False) diff --git a/caffe2/python/layers/split.py b/caffe2/python/layers/split.py deleted file mode 100644 index c70bdc21b474..000000000000 --- a/caffe2/python/layers/split.py +++ /dev/null @@ -1,75 +0,0 @@ -## @package split -# Module caffe2.python.layers.split - - - - - -from caffe2.python import schema -from caffe2.python.layers.layers import ( - ModelLayer, -) - - -class Split(ModelLayer): - - def __init__(self, model, input_record, num_splits=1, axis=1, - name='split', split=None, **kwargs): - super().__init__(model, name, input_record, **kwargs) - self.axis = axis - # Assume that first dimension is batch, so actual axis in shape is - # axis - 1 - axis -= 1 - assert axis >= 0 - - assert isinstance(input_record, schema.Scalar),\ - "Incorrect input type. 
Expected Scalar, but received: {0}".\ - format(input_record) - - input_shape = input_record.field_type().shape - assert len(input_shape) >= axis - if split is None: - assert input_shape[axis] % num_splits == 0 - else: - num_splits = len(split) - assert input_shape[axis] == sum(split) - - if split is None: - output_shape = list(input_shape) - output_shape[axis] = int(output_shape[axis] / num_splits) - else: - output_shape = [] - for i in range(num_splits): - output_shape_i = list(input_shape) - output_shape_i[axis] = split[i] - output_shape.append(output_shape_i) - - data_type = input_record.field_type().base - - - if split is None: - output_scalars = [ - schema.Scalar( - (data_type, output_shape), - self.get_next_blob_reference('output_{}'.format(i)), - ) - for i in range(num_splits) - ] - else: - output_scalars = [ - schema.Scalar( - (data_type, output_shape[i]), - self.get_next_blob_reference('output_{}'.format(i)), - ) - for i in range(num_splits) - ] - self.output_schema = schema.Tuple(*output_scalars) - self.split = split - - def add_ops(self, net): - net.Split( - self.input_record.field_blobs(), - self.output_schema.field_blobs(), - split=self.split, - axis=self.axis, - ) diff --git a/caffe2/python/layers/tags.py b/caffe2/python/layers/tags.py deleted file mode 100644 index 7fbea3be9f7e..000000000000 --- a/caffe2/python/layers/tags.py +++ /dev/null @@ -1,122 +0,0 @@ -## @package tags -# Module caffe2.python.layers.tags - - - - - -import functools - -from caffe2.python import context - - -class TagContext(context.DefaultManaged): - """ - Scope driven way to provide tags to the layers. - """ - - def __init__(self, tags=None): - # Tags is expected to be a list, to keep the order of adding/removing things - self.tags = tags or [] - - def add_tags(self, tags): - self.tags.extend(tags) - - def remove_tags(self, tags): - assert self.tags[-len(tags):] == tags - self.tags = self.tags[:-len(tags)] - - -class Tags: - # TODO(amalevich): Tags might need to live in their own contexts, add this - # split later - EXCLUDE_FROM_TRAIN = 'exclude_from_train' - EXCLUDE_FROM_EVAL = 'exclude_from_eval' - EXCLUDE_FROM_PREDICTION = 'exclude_from_prediction' - EXCLUDE_FROM_ACCUMULATE_PRED = 'exclude_from_accumulate_pred' - PREPROCESSING = 'preprocessing' - HANDLE_AS_SPARSE_LAYER = 'handle_as_sparse_layer' - PREFER_GPU = 'prefer_gpu' - CPU_ONLY = 'cpu_only' - LOCAL = 'local' - - # The following three tags are hints to **distributed training framework**. - """ - Indicates a layer contains a sparse shardable parameter. The parameter - should be sharded and operators on those parameters should be done on - distributed parameter servers. - """ - SPARSE_SHARDED = 'sparse_sharded' - """ - Indicates a layer contains sparse parameters among others, and that the - parameters should not be sharded (i.e. should be placed together on a node). - """ - SPARSE_DONT_SHARD = 'sparse_dont_shard' - """ - Used to manually indicate a component for an operator. Parameters for - all operators with the same component should be colocated on the same - parameter server. - """ - COMPONENT = 'component:' - PIPELINE = 'pipeline:' - """ - Indicates it's a dense layer or dense param init, - but we use hogwild across multiple trainers - """ - HOGWILD_DENSE = "hogwild_dense" - """ - Valid tag prefixes for distributed training framework. - """ - """ - Used to pass on info to the 'extra_info' field in the net - Proto. Typically to provide info for distributed training.
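A brief usage sketch of these tag scopes (assuming the caffe2 package is importable; the '0' payload is a made-up example value):

    from caffe2.python.layers.tags import Tags, TagContext

    # Prefix tags such as COMPONENT take a payload after the colon,
    # e.g. 'component:0' to colocate one component's params on one server.
    with Tags(Tags.COMPONENT + '0'):
        assert 'component:0' in TagContext.current().tags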
- """ - EXTRA_INFO = 'extra_info:' - """ - An empty tag, used to make conditional statement on with(Tags) block more concise - """ - EMPTY_TAG = 'empty_tag' - - DT_TAGS = (SPARSE_SHARDED, SPARSE_DONT_SHARD, COMPONENT, HOGWILD_DENSE) - - # In certain cases we want to have different schema for training and - # prediction, as an example in prediction we might need to have only - # subset of ids present in the original schema. This tag is one of the ways - # to mark operators that will be removed from prediction and should - # override schema for predictors. - PREDICTION_SCHEMA = 'prediction_schema' - - # This is to mark layers in the feature transform process. - FEATURE_TRANSFORM = 'feature_transform' - # This is to mark the output layers in the feature transform process - FEATURE_TRANSFORM_SCHEMA = 'feature_transform_schema' - - def __init__(self, tags): - if not isinstance(tags, list): - tags = [tags] - self.tags = tags - - def __enter__(self): - TagContext.current().add_tags(self.tags) - return self - - def __exit__(self, type, value, traceback): - TagContext.current().remove_tags(self.tags) - - def __call__(self, func): - @functools.wraps(func) - def wrapper(*args, **kwargs): - with self: - return func(*args, **kwargs) - return wrapper - - -# pyre-fixme[16]: Tags has no attribute `TRAIN_ONLY` -Tags.TRAIN_ONLY = [Tags.EXCLUDE_FROM_PREDICTION, Tags.EXCLUDE_FROM_EVAL, - Tags.EXCLUDE_FROM_ACCUMULATE_PRED] -# pyre-fixme[16]: Tags has no attribute `EVAL_ONLY` -Tags.EVAL_ONLY = [Tags.EXCLUDE_FROM_PREDICTION, Tags.EXCLUDE_FROM_TRAIN, - Tags.EXCLUDE_FROM_ACCUMULATE_PRED] -# pyre-fixme[16]: Tags has no attribute `PREDICTION_ONLY` -Tags.PREDICTION_ONLY = [Tags.EXCLUDE_FROM_TRAIN, Tags.EXCLUDE_FROM_EVAL, - Tags.EXCLUDE_FROM_ACCUMULATE_PRED] diff --git a/caffe2/python/layers/uniform_sampling.py b/caffe2/python/layers/uniform_sampling.py deleted file mode 100644 index 76631b09bdd6..000000000000 --- a/caffe2/python/layers/uniform_sampling.py +++ /dev/null @@ -1,81 +0,0 @@ -## @package uniform_sampling -# Module caffe2.python.layers.uniform_sampling - - - - - -import numpy as np - -from caffe2.python import core, schema -from caffe2.python.layers.layers import ModelLayer - - -class UniformSampling(ModelLayer): - """ - Uniform sampling `num_samples - len(input_record)` unique elements from the - range [0, num_elements). `samples` is the concatenation of input_record and - the samples. input_record is expected to be unique. 
- """ - - def __init__( - self, - model, - input_record, - num_samples, - num_elements, - name='uniform_sampling', - **kwargs - ): - super().__init__(model, name, input_record, **kwargs) - - assert num_elements > num_samples > 0 - assert isinstance(input_record, schema.Scalar) - - self.num_elements = num_elements - - num_examples_init = ('GivenTensorInt64Fill', - {'values': [num_samples]}) - self.num_samples = self.create_param(param_name='num_examples', - shape=(1,), - initializer=num_examples_init, - optimizer=model.NoOptim) - - sampling_blob_init = ('ConstantFill', - {'value': float(num_samples) / num_elements, - 'dtype': core.DataType.FLOAT}) - self.sampling_prob = self.create_param(param_name='prob', - shape=(num_samples,), - initializer=sampling_blob_init, - optimizer=model.NoOptim) - - self.output_schema = schema.Struct( - ( - 'samples', schema.Scalar( - np.int32, self.get_next_blob_reference("samples") - ) - ), - ('sampling_prob', schema.Scalar(np.float32, self.sampling_prob)), - ) - - def add_ops(self, net): - net.StopGradient(self.sampling_prob, self.sampling_prob) - - shape = net.Shape([self.input_record()], net.NextScopedBlob("shape")) - shape = net.Sub([self.num_samples, shape], shape) - samples = net.UniqueUniformFill( - [shape, self.input_record()], - net.NextScopedBlob("samples_before_concat"), - min=0, - max=self.num_elements - 1, - input_as_shape=True - ) - - net.Concat( - [self.input_record(), samples], - [self.output_schema.samples(), net.NextScopedBlob("split_info")], - axis=0 - ) - net.StopGradient( - self.output_schema.samples(), self.output_schema.samples() - ) diff --git a/caffe2/python/layers_test.py b/caffe2/python/layers_test.py deleted file mode 100644 index 8449a66db770..000000000000 --- a/caffe2/python/layers_test.py +++ /dev/null @@ -1,2516 +0,0 @@ - - - - - -import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu - -from caffe2.python import ( - layer_model_instantiator, - core, - schema, - workspace, -) -from caffe2.python.layers.layers import ( - AccessedFeatures, - almost_equal_schemas, - get_key, - IdList, - IdScoreList, - InstantiationContext, - is_request_only_scalar, - set_request_only, -) -from caffe2.python.layers.tags import Tags -from caffe2.python.layer_test_util import ( - LayersTestCase, - OpSpec, -) -import logging -logger = logging.getLogger(__name__) - - -class TestLayers(LayersTestCase): - def testSparseDropoutWithReplacement(self): - input_record = schema.NewRecord(self.model.net, IdList) - self.model.output_schema = schema.Struct() - - lengths_blob = input_record.field_blobs()[0] - values_blob = input_record.field_blobs()[1] - lengths = np.array([1] * 10).astype(np.int32) - values = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64) - workspace.FeedBlob(lengths_blob, lengths) - workspace.FeedBlob(values_blob, values) - - out = self.model.SparseDropoutWithReplacement( - input_record, 0.0, 0.5, 1.0, -1, output_names_or_num=1) - self.assertEqual(schema.List(schema.Scalar(np.int64,)), out) - - train_init_net, train_net = self.get_training_nets() - eval_net = self.get_eval_net() - predict_net = self.get_predict_net() - - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - out_values = workspace.FetchBlob(out.items()) - out_lengths = workspace.FetchBlob(out.lengths()) - self.assertBlobsEqual(out_values, values) - self.assertBlobsEqual(out_lengths, lengths) - - workspace.RunNetOnce(eval_net) - - 
workspace.RunNetOnce(predict_net) - predict_values = workspace.FetchBlob("values_auto_0") - predict_lengths = workspace.FetchBlob("lengths_auto_0") - self.assertBlobsEqual(predict_values, np.array([-1] * 10).astype(np.int64)) - self.assertBlobsEqual(predict_lengths, lengths) - - def testAddLoss(self): - input_record_LR = self.new_record( - schema.Struct( - ('label', schema.Scalar((np.float64, (1, )))), - ('logit', schema.Scalar((np.float32, (2, )))), - ('weight', schema.Scalar((np.float64, (1, )))) - ) - ) - loss_LR = self.model.BatchLRLoss(input_record_LR) - - self.model.add_loss(loss_LR) - assert 'unnamed' in self.model.loss - self.assertEqual( - schema.Scalar((np.float32, tuple())), self.model.loss.unnamed - ) - self.assertEqual(loss_LR, self.model.loss.unnamed) - - self.model.add_loss(loss_LR, 'addLoss') - assert 'addLoss' in self.model.loss - self.assertEqual( - schema.Scalar((np.float32, tuple())), self.model.loss.addLoss - ) - self.assertEqual(loss_LR, self.model.loss.addLoss) - - self.model.add_loss( - schema.Scalar( - dtype=np.float32, blob=core.BlobReference('loss_blob_1') - ), 'addLoss' - ) - assert 'addLoss_auto_0' in self.model.loss - self.assertEqual( - schema.Scalar((np.float32, tuple())), self.model.loss.addLoss_auto_0 - ) - assert core.BlobReference('loss_blob_1') in self.model.loss.field_blobs() - - self.model.add_loss( - schema.Struct( - ( - 'structName', schema.Scalar( - dtype=np.float32, - blob=core.BlobReference('loss_blob_2') - ) - ) - ), 'addLoss' - ) - assert 'addLoss_auto_1' in self.model.loss - self.assertEqual( - schema.Struct(('structName', schema.Scalar((np.float32, tuple())))), - self.model.loss.addLoss_auto_1 - ) - assert core.BlobReference('loss_blob_2') in self.model.loss.field_blobs() - - loss_in_tuple_0 = schema.Scalar( - dtype=np.float32, blob=core.BlobReference('loss_blob_in_tuple_0') - ) - - loss_in_tuple_1 = schema.Scalar( - dtype=np.float32, blob=core.BlobReference('loss_blob_in_tuple_1') - ) - - loss_tuple = schema.NamedTuple( - 'loss_in_tuple', * [loss_in_tuple_0, loss_in_tuple_1] - ) - self.model.add_loss(loss_tuple, 'addLoss') - assert 'addLoss_auto_2' in self.model.loss - self.assertEqual( - schema.Struct( - ('loss_in_tuple_0', schema.Scalar((np.float32, tuple()))), - ('loss_in_tuple_1', schema.Scalar((np.float32, tuple()))) - ), self.model.loss.addLoss_auto_2 - ) - assert core.BlobReference('loss_blob_in_tuple_0')\ - in self.model.loss.field_blobs() - assert core.BlobReference('loss_blob_in_tuple_1')\ - in self.model.loss.field_blobs() - - def testFilterMetricSchema(self): - self.model.add_metric_field("a:b", schema.Scalar()) - self.model.add_metric_field("a:c", schema.Scalar()) - self.model.add_metric_field("d", schema.Scalar()) - - self.assertEqual( - self.model.metrics_schema, - schema.Struct( - ("a", schema.Struct( - ("b", schema.Scalar()), - ("c", schema.Scalar()), - )), - ("d", schema.Scalar()), - )) - - self.model.filter_metrics_schema({"a:b", "d"}) - self.assertEqual( - self.model.metrics_schema, - schema.Struct( - ("a", schema.Struct( - ("b", schema.Scalar()), - )), - ("d", schema.Scalar()), - )) - - def testAddOutputSchema(self): - # add the first field - self.model.add_output_schema('struct', schema.Struct()) - expected_output_schema = schema.Struct(('struct', schema.Struct())) - self.assertEqual( - self.model.output_schema, - expected_output_schema, - ) - - # add the second field - self.model.add_output_schema('scalar', schema.Scalar(np.float64)) - expected_output_schema = schema.Struct( - ('struct', schema.Struct()), - ('scalar', 
schema.Scalar(np.float64)), - ) - self.assertEqual( - self.model.output_schema, - expected_output_schema, - ) - - # overwrite a field should raise - with self.assertRaises(AssertionError): - self.model.add_output_schema('scalar', schema.Struct()) - - def _test_net(self, net, ops_list): - ''' - Helper function to assert the net contains some set of operations and - then to run the net. - - Inputs: - net -- the network to test and run - ops_list -- the list of operation specifications to check for - in the net - ''' - ops_output = self.assertNetContainOps(net, ops_list) - workspace.RunNetOnce(net) - return ops_output - - def testFCWithoutBias(self): - output_dims = 2 - fc_without_bias = self.model.FCWithoutBias( - self.model.input_feature_schema.float_features, output_dims) - self.model.output_schema = fc_without_bias - - self.assertEqual( - schema.Scalar((np.float32, (output_dims, ))), - fc_without_bias - ) - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("UniformFill", None, None), - ] - ) - - mat_mul_spec = OpSpec( - "MatMul", - [ - self.model.input_feature_schema.float_features(), - init_ops[0].output[0], - ], - fc_without_bias.field_blobs() - ) - - self.assertNetContainOps(train_net, [mat_mul_spec]) - - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [mat_mul_spec]) - - def testFCWithBootstrap(self): - output_dims = 1 - fc_with_bootstrap = self.model.FCWithBootstrap( - self.model.input_feature_schema.float_features, - output_dims=output_dims, - num_bootstrap=2, - max_fc_size=-1 - ) - self.model.output_schema = fc_with_bootstrap - - - self.assertEqual(len(fc_with_bootstrap), 4) - - # must be in this order - assert ( - core.BlobReference("fc_with_bootstrap/bootstrap_iteration_0/indices") == fc_with_bootstrap[0].field_blobs()[0] - ) - assert ( - core.BlobReference("fc_with_bootstrap/bootstrap_iteration_0/preds") == fc_with_bootstrap[1].field_blobs()[0] - ) - assert ( - core.BlobReference("fc_with_bootstrap/bootstrap_iteration_1/indices") == fc_with_bootstrap[2].field_blobs()[0] - ) - assert ( - core.BlobReference("fc_with_bootstrap/bootstrap_iteration_1/preds") == fc_with_bootstrap[3].field_blobs()[0] - ) - - train_init_net, train_net = self.get_training_nets() - predict_net = layer_model_instantiator.generate_predict_net(self.model) - - train_proto = train_net.Proto() - eval_proto = predict_net.Proto() - - train_ops = train_proto.op - eval_ops = eval_proto.op - - master_train_ops = [ - "Shape", - "GivenTensorInt64Fill", - "Gather", - "GivenTensorIntFill", - "GivenTensorIntFill", - "Cast", - "Sub", - "UniformIntFill", - "Gather", - "FC", - "UniformIntFill", - "Gather", - "FC", - ] - - master_eval_ops = [ - "Shape", - "GivenTensorInt64Fill", - "Gather", - "GivenTensorIntFill", - "GivenTensorIntFill", - "Cast", - "Sub", - "UniformIntFill", - "FC", - "UniformIntFill", - "FC", - ] - - assert len(train_ops) == len(master_train_ops) - assert len(eval_ops) == len(master_eval_ops) - - assert train_proto.external_input == eval_proto.external_input - assert train_proto.external_output == list() - - # make sure all the ops are present and unchanged for train_net and eval_net - for idx, op in enumerate(master_train_ops): - assert train_ops[idx].type == op - - for idx, op in enumerate(master_eval_ops): - assert eval_ops[idx].type == op - - - def testFCwithAxis2(self): - input_dim = 10 - output_dim = 30 - max_length = 20 - input_record = self.new_record( - schema.Struct( - ('history_sequence', 
schema.Scalar((np.float32, (max_length, - input_dim)))), - ) - ) - fc_out = self.model.FC( - input_record.history_sequence, output_dim, - axis=2) - self.model.output_schema = fc_out - self.assertEqual( - schema.Scalar((np.float32, (max_length, output_dim))), - fc_out - ) - - train_init_net, train_net = self.get_training_nets() - - def testFCTransposed(self): - input_dim = 10 - output_dim = 30 - max_length = 20 - input_record = self.new_record( - schema.Struct( - ('history_sequence', schema.Scalar((np.float32, (max_length, - input_dim)))), - ) - ) - fc_transposed_out = self.model.FC( - input_record.history_sequence, output_dim, - axis=2, transposed=True) - self.model.output_schema = fc_transposed_out - self.assertEqual( - schema.Scalar((np.float32, (max_length, output_dim))), - fc_transposed_out - ) - - train_init_net, train_net = self.get_training_nets() - - def testFCTransposedWithMaxFCSize(self): - input_dim = 10 - output_dim = 30 - max_length = 20 - input_record = self.new_record( - schema.Struct( - ('history_sequence', schema.Scalar((np.float32, (max_length, - input_dim)))), - ) - ) - fc_transposed_out = self.model.FC( - input_record.history_sequence, output_dim, - max_fc_size=input_dim * output_dim // 2, - axis=2, transposed=True) - self.model.output_schema = fc_transposed_out - self.assertEqual( - schema.Scalar((np.float32, (max_length, output_dim))), - fc_transposed_out - ) - - train_init_net, train_net = self.get_training_nets() - - def testSparseLookupSumPoolingWithEviction(self): - # Create test embedding table of 1 row - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('sparse_feature_0', schema.ListWithEvicted( - schema.Scalar(np.int64, - metadata=schema.Metadata(categorical_limit=1)),)),)), - )) - embedding_dim = 8 - lengths_blob = record.sparse.sparse_feature_0.lengths.get() - values_blob = record.sparse.sparse_feature_0.items.get() - evicted_values_blob = record.sparse.sparse_feature_0._evicted_values.get() - lengths = np.array([1]).astype(np.int32) - values = np.array([0]).astype(np.int64) - # Need to reset row 0 - evicted_values = np.array([0]).astype(np.int64) - workspace.FeedBlob(lengths_blob, lengths) - workspace.FeedBlob(values_blob, values) - workspace.FeedBlob(evicted_values_blob, evicted_values) - - embedding_after_pooling = self.model.SparseLookup( - record.sparse.sparse_feature_0, [embedding_dim], 'Sum', weight_init=("ConstantFill", {"value": 1.0})) - - self.model.output_schema = schema.Struct() - self.assertEqual( - schema.Scalar((np.float32, (embedding_dim, ))), - embedding_after_pooling - ) - train_init_net, train_net = self.get_training_nets() - workspace.RunNetOnce(train_init_net) - embedding_after_init = workspace.FetchBlob("sparse_lookup/w") - # Change row 0's value before reset - new_values = np.array([[2, 2, 2, 2, 2, 2, 2, 2]]).astype(np.float32) - workspace.FeedBlob("sparse_lookup/w", new_values) - workspace.RunNetOnce(train_net.Proto()) - embedding_after_training = workspace.FetchBlob("sparse_lookup/w") - # Verify row 0's value does not change after reset - self.assertEqual(embedding_after_training.all(), embedding_after_init.all()) - - - def testSparseLookupSumPooling(self): - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('sparse_feature_0', schema.List( - schema.Scalar(np.int64, - metadata=schema.Metadata(categorical_limit=1000)))), - )), - )) - embedding_dim = 64 - embedding_after_pooling = self.model.SparseLookup( - record.sparse.sparse_feature_0, [embedding_dim], 
'Sum') - self.model.output_schema = schema.Struct() - self.assertEqual( - schema.Scalar((np.float32, (embedding_dim, ))), - embedding_after_pooling - ) - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("UniformFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - sparse_lookup_op_spec = OpSpec( - 'SparseLengthsSum', - [ - init_ops[0].output[0], - record.sparse.sparse_feature_0.items(), - record.sparse.sparse_feature_0.lengths(), - ], - [embedding_after_pooling()] - ) - self.assertNetContainOps(train_net, [sparse_lookup_op_spec]) - - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [sparse_lookup_op_spec]) - - @given( - use_hashing=st.booleans(), - modulo=st.integers(min_value=100, max_value=200), - use_divide_mod=st.booleans(), - divisor=st.integers(min_value=10, max_value=20), - ) - def testSparseFeatureHashIdList(self, use_hashing, modulo, use_divide_mod, divisor): - record = schema.NewRecord( - self.model.net, - schema.List(schema.Scalar( - np.int64, - metadata=schema.Metadata(categorical_limit=60000) - )) - ) - use_divide_mod = use_divide_mod if use_hashing is False else False - output_schema = self.model.SparseFeatureHash( - record, - modulo=modulo, - use_hashing=use_hashing, - use_divide_mod=use_divide_mod, - divisor=divisor, - ) - - self.model.output_schema = output_schema - - self.assertEqual(len(self.model.layers), 1) - self.assertEqual(output_schema._items.metadata.categorical_limit, - modulo) - train_init_net, train_net = self.get_training_nets() - if use_divide_mod: - self.assertEqual(len(train_net.Proto().op), 3) - else: - self.assertEqual(len(train_net.Proto().op), 2) - - @given( - use_hashing=st.booleans(), - modulo=st.integers(min_value=100, max_value=200), - ) - def testSparseFeatureHashIdScoreList(self, use_hashing, modulo): - record = schema.NewRecord(self.model.net, - schema.Map(schema.Scalar(np.int64, - metadata=schema.Metadata( - categorical_limit=60000)), - np.float32)) - - output_schema = self.model.SparseFeatureHash( - record, - modulo=modulo, - use_hashing=use_hashing) - - self.model.output_schema = output_schema - - self.assertEqual(len(self.model.layers), 1) - self.assertEqual(output_schema._items.keys.metadata.categorical_limit, - modulo) - train_init_net, train_net = self.get_training_nets() - - def testSparseLookupIncorrectPositionWeightedOnIdList(self): - ''' - Currently the implementation of SparseLookup assumes the input is an id_score_list - when using PositionWeighted. 
- ''' - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('sparse_feature_0', schema.List( - schema.Scalar(np.int64, - metadata=schema.Metadata(categorical_limit=1000)))), - )), - )) - - embedding_dim = 64 - with self.assertRaises(AssertionError): - self.model.SparseLookup( - record.sparse.sparse_feature_0, [embedding_dim], 'PositionWeighted') - - def testSparseLookupPositionWeightedOnIdList(self): - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('sparse_feature_0', schema.List( - schema.Scalar(np.int64, - metadata=schema.Metadata(categorical_limit=1000)))), - )), - )) - - # convert id_list to id_score_list with PositionWeighted layer - sparse_segment = record.sparse.sparse_feature_0 - pos_w_layer = self.model.PositionWeighted(sparse_segment) - - sparse_segment = schema.Map( - keys=get_key(sparse_segment), - values=pos_w_layer.position_weights, - lengths_blob=sparse_segment.lengths - ) - - embedding_dim = 64 - embedding_after_pooling = self.model.SparseLookup( - sparse_segment, [embedding_dim], 'PositionWeighted') - self.model.output_schema = schema.Struct() - self.assertEqual( - schema.Scalar((np.float32, (embedding_dim, ))), - embedding_after_pooling - ) - - train_init_net, train_net = self.get_training_nets() - - self.assertNetContainOps( - train_init_net, - [ - OpSpec("ConstantFill", None, None), # position_weights/pos_w - OpSpec("UniformFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - self.assertNetContainOps(train_net, [ - OpSpec("LengthsRangeFill", None, None), - OpSpec("Gather", None, None), - OpSpec("SparseLengthsWeightedSum", None, None), - ]) - - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [ - OpSpec("LengthsRangeFill", None, None), - OpSpec("Gather", None, None), - OpSpec("SparseLengthsWeightedSum", None, None), - ]) - - def testSparseLookupPositionWeightedOnIdScoreList(self): - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('id_score_list_0', schema.Map( - schema.Scalar( - np.int64, - metadata=schema.Metadata( - categorical_limit=1000 - ), - ), - np.float32 - )), - )), - )) - - embedding_dim = 64 - embedding_after_pooling = self.model.SparseLookup( - record.sparse.id_score_list_0, [embedding_dim], 'PositionWeighted') - self.model.output_schema = schema.Struct() - self.assertEqual( - schema.Scalar((np.float32, (embedding_dim, ))), - embedding_after_pooling - ) - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("UniformFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - sparse_lookup_op_spec = OpSpec( - 'SparseLengthsWeightedSum', - [ - init_ops[0].output[0], - record.sparse.id_score_list_0.values(), - record.sparse.id_score_list_0.keys(), - record.sparse.id_score_list_0.lengths(), - ], - [embedding_after_pooling()] - ) - self.assertNetContainOps(train_net, [sparse_lookup_op_spec]) - - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [sparse_lookup_op_spec]) - - def testSparseLookupIncorrectRecencyWeightedOnIdList(self): - ''' - Currently the implementation of SparseLookup assumes the input is an id_score_list - when using RecencyWeighted. 
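For reference, a sketch of the id_score_list layout that these weighted reducers expect (values invented for illustration):

    import numpy as np

    # id_score_list == parallel (keys, values) arrays segmented by lengths:
    lengths = np.array([2, 1], dtype=np.int32)             # two segments
    keys = np.array([5, 9, 5], dtype=np.int64)             # ids
    values = np.array([0.3, 0.7, 1.0], dtype=np.float32)   # per-id scores/weights
    assert lengths.sum() == len(keys) == len(values)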
- ''' - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('sparse_feature_0', schema.List( - schema.Scalar(np.int64, - metadata=schema.Metadata(categorical_limit=1000)))), - )), - )) - - embedding_dim = 64 - with self.assertRaises(AssertionError): - self.model.SparseLookup( - record.sparse.sparse_feature_0, [embedding_dim], 'RecencyWeighted') - - def testSparseLookupRecencyWeightedOnIdScoreList(self): - record = schema.NewRecord(self.model.net, schema.Struct( - ('sparse', schema.Struct( - ('id_score_list_0', schema.Map( - schema.Scalar( - np.int64, - metadata=schema.Metadata( - categorical_limit=1000 - ), - ), - np.float32 - )), - )), - )) - - embedding_dim = 64 - embedding_after_pooling = self.model.SparseLookup( - record.sparse.id_score_list_0, [embedding_dim], 'RecencyWeighted') - self.model.output_schema = schema.Struct() - self.assertEqual( - schema.Scalar((np.float32, (embedding_dim, ))), - embedding_after_pooling - ) - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("UniformFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - sparse_lookup_op_spec = OpSpec( - 'SparseLengthsWeightedSum', - [ - init_ops[0].output[0], - record.sparse.id_score_list_0.values(), - record.sparse.id_score_list_0.keys(), - record.sparse.id_score_list_0.lengths(), - ], - [embedding_after_pooling()] - ) - self.assertNetContainOps(train_net, [sparse_lookup_op_spec]) - - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [sparse_lookup_op_spec]) - - def testPairwiseSimilarityWithAllEmbeddings(self): - embedding_dim = 64 - N = 5 - record = schema.NewRecord(self.model.net, schema.Struct( - ('all_embeddings', schema.Scalar( - ((np.float32, (N, embedding_dim))) - )), - )) - current = self.model.PairwiseSimilarity( - record, N * N) - - self.assertEqual( - schema.Scalar((np.float32, (N * N, ))), - current - ) - - train_init_net, train_net = self.get_training_nets() - self.assertNetContainOps(train_init_net, []) - self.assertNetContainOps(train_net, [ - OpSpec("BatchMatMul", None, None), - OpSpec("Flatten", None, None), - ]) - - def testPairwiseSimilarityWithXandYEmbeddings(self): - embedding_dim = 64 - record = schema.NewRecord(self.model.net, schema.Struct( - ('x_embeddings', schema.Scalar( - ((np.float32, (5, embedding_dim))) - )), - ('y_embeddings', schema.Scalar( - ((np.float32, (6, embedding_dim))) - )), - )) - current = self.model.PairwiseSimilarity( - record, 5 * 6) - - self.assertEqual( - schema.Scalar((np.float32, (5 * 6, ))), - current - ) - - train_init_net, train_net = self.get_training_nets() - self.assertNetContainOps(train_init_net, []) - self.assertNetContainOps(train_net, [ - OpSpec("BatchMatMul", None, None), - OpSpec("Flatten", None, None), - ]) - - def testPairwiseSimilarityWithXandYEmbeddingsAndGather(self): - embedding_dim = 64 - - output_idx = [1, 3, 5] - output_idx_blob = self.model.add_global_constant( - str(self.model.net.NextScopedBlob('pairwise_dot_product_gather')), - output_idx, - dtype=np.int32, - ) - indices_to_gather = schema.Scalar( - (np.int32, len(output_idx)), - output_idx_blob, - ) - - record = schema.NewRecord(self.model.net, schema.Struct( - ('x_embeddings', schema.Scalar( - ((np.float32, (5, embedding_dim))) - )), - ('y_embeddings', schema.Scalar( - ((np.float32, (6, embedding_dim))) - )), - ('indices_to_gather', indices_to_gather), - )) - current = self.model.PairwiseSimilarity( - record, len(output_idx)) - - # This 
assert is not necessary, - # output size is passed into PairwiseSimilarity - self.assertEqual( - schema.Scalar((np.float32, (len(output_idx), ))), - current - ) - - train_init_net, train_net = self.get_training_nets() - self.assertNetContainOps(train_init_net, []) - self.assertNetContainOps(train_net, [ - OpSpec("BatchMatMul", None, None), - OpSpec("Flatten", None, None), - OpSpec("BatchGather", None, None), - ]) - - def testPairwiseSimilarityIncorrectInput(self): - embedding_dim = 64 - record = schema.NewRecord(self.model.net, schema.Struct( - ('x_embeddings', schema.Scalar( - ((np.float32, (5, embedding_dim))) - )), - )) - with self.assertRaises(AssertionError): - self.model.PairwiseSimilarity( - record, 25) - - record = schema.NewRecord(self.model.net, schema.Struct( - ('all_embeddings', schema.List(np.float32)) - )) - with self.assertRaises(AssertionError): - self.model.PairwiseSimilarity( - record, 25) - - def testConcat(self): - embedding_dim = 64 - input_record = self.new_record(schema.Struct( - ('input1', schema.Scalar((np.float32, (embedding_dim, )))), - ('input2', schema.Scalar((np.float32, (embedding_dim, )))), - ('input3', schema.Scalar((np.float32, (embedding_dim, )))), - )) - - output = self.model.Concat(input_record) - self.assertEqual( - schema.Scalar((np.float32, ((len(input_record.fields) * embedding_dim, )))), - output - ) - - # Note that in Concat layer we assume first dimension is batch. - # so input is B * embedding_dim - # add_axis=1 make it B * 1 * embedding_dim - # concat on axis=1 make it B * N * embedding_dim - output = self.model.Concat(input_record, axis=1, add_axis=1) - self.assertEqual( - schema.Scalar((np.float32, ((len(input_record.fields), embedding_dim)))), - output - ) - - def testSamplingTrain(self): - output_dims = 1000 - - indices = self.new_record(schema.Scalar((np.int32, (10,)))) - sampling_prob = self.new_record(schema.Scalar((np.float32, (10, )))) - - sampled_fc = self.model.SamplingTrain( - schema.Struct( - ('input', self.model.input_feature_schema.float_features), - ('indices', indices), - ('sampling_prob', sampling_prob), - ), - "FC", - output_dims, - ) - self.model.output_schema = sampled_fc - - # Check that we don't add prediction layer into the model - self.assertEqual(1, len(self.model.layers)) - - self.assertEqual( - schema.Scalar((np.float32, (output_dims, ))), - sampled_fc - ) - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("UniformFill", None, None), - OpSpec("UniformFill", None, None), - ] - ) - - sampled_fc_layer = self.model.layers[0] - - gather_w_spec = OpSpec( - "Gather", - [ - init_ops[0].output[0], - indices(), - ], - [ - sampled_fc_layer._prediction_layer.train_param_blobs[0] - ] - ) - gather_b_spec = OpSpec( - "Gather", - [ - init_ops[1].output[0], - indices(), - ], - [ - sampled_fc_layer._prediction_layer.train_param_blobs[1] - ] - ) - train_fc_spec = OpSpec( - "FC", - [ - self.model.input_feature_schema.float_features(), - ] + sampled_fc_layer._prediction_layer.train_param_blobs, - sampled_fc.field_blobs() - ) - log_spec = OpSpec("Log", [sampling_prob()], [None]) - sub_spec = OpSpec( - "Sub", - [sampled_fc.field_blobs()[0], None], - sampled_fc.field_blobs() - ) - - train_ops = self.assertNetContainOps( - train_net, - [gather_w_spec, gather_b_spec, train_fc_spec, log_spec, sub_spec]) - - self.assertEqual(train_ops[3].output[0], train_ops[4].input[1]) - - predict_net = self.get_predict_net() - self.assertNetContainOps( - predict_net, - [ - OpSpec( 
- "FC", - [ - self.model.input_feature_schema.float_features(), - init_ops[0].output[0], - init_ops[1].output[0], - ], - sampled_fc.field_blobs() - ) - ] - ) - - def testBatchLRLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float64, (1,)))), - ('logit', schema.Scalar((np.float32, (2,)))), - ('weight', schema.Scalar((np.float64, (1,)))) - )) - loss = self.model.BatchLRLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testBatchLRLossWithUncertainty(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float64, (1,)))), - ('logit', schema.Scalar((np.float32, (2,)))), - ('weight', schema.Scalar((np.float64, (1,)))), - ('log_variance', schema.Scalar((np.float64, (1,)))), - )) - loss = self.model.BatchLRLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testMarginRankLoss(self): - input_record = self.new_record(schema.Struct( - ('pos_prediction', schema.Scalar((np.float32, (1,)))), - ('neg_prediction', schema.List(np.float32)), - )) - pos_items = np.array([0.1, 0.2, 0.3], dtype=np.float32) - neg_lengths = np.array([1, 2, 3], dtype=np.int32) - neg_items = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6], dtype=np.float32) - schema.FeedRecord( - input_record, - [pos_items, neg_lengths, neg_items] - ) - loss = self.model.MarginRankLoss(input_record) - self.run_train_net_forward_only() - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testBPRLoss(self): - input_record = self.new_record(schema.Struct( - ('pos_prediction', schema.Scalar((np.float32, (1,)))), - ('neg_prediction', schema.List(np.float32)), - )) - pos_items = np.array([0.8, 0.9], dtype=np.float32) - neg_lengths = np.array([1, 2], dtype=np.int32) - neg_items = np.array([0.1, 0.2, 0.3], dtype=np.float32) - schema.FeedRecord( - input_record, - [pos_items, neg_lengths, neg_items] - ) - loss = self.model.BPRLoss(input_record) - self.run_train_net_forward_only() - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - result = workspace.FetchBlob('bpr_loss/output') - np.testing.assert_array_almost_equal(np.array(1.24386, dtype=np.float32), result) - - def testBatchMSELoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float64, (1,)))), - ('prediction', schema.Scalar((np.float32, (2,)))), - )) - loss = self.model.BatchMSELoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testBatchHuberLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, (1,)))), - ('prediction', schema.Scalar((np.float32, (2,)))), - )) - loss = self.model.BatchHuberLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testBatchSigmoidCrossEntropyLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, (32,)))), - ('prediction', schema.Scalar((np.float32, (32,)))) - )) - loss = self.model.BatchSigmoidCrossEntropyLoss(input_record) - self.assertEqual(schema.Scalar((np.float32, tuple())), loss) - - def testBatchSoftmaxLoss(self): - input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, tuple()))), - ('prediction', schema.Scalar((np.float32, (32,)))) - )) - loss = self.model.BatchSoftmaxLoss(input_record) - self.assertEqual(schema.Struct( - ('softmax', schema.Scalar((np.float32, (32,)))), - ('loss', schema.Scalar(np.float32)), - ), loss) - - def testBatchSoftmaxLossWeight(self): - 
input_record = self.new_record(schema.Struct( - ('label', schema.Scalar((np.float32, tuple()))), - ('prediction', schema.Scalar((np.float32, (32,)))), - ('weight', schema.Scalar((np.float64, (1,)))) - )) - loss = self.model.BatchSoftmaxLoss(input_record) - self.assertEqual(schema.Struct( - ('softmax', schema.Scalar((np.float32, (32,)))), - ('loss', schema.Scalar(np.float32)), - ), loss) - - @given( - X=hu.arrays(dims=[2, 5]), - ) - def testBatchNormalization(self, X): - input_record = self.new_record(schema.Scalar((np.float32, (5,)))) - schema.FeedRecord(input_record, [X]) - bn_output = self.model.BatchNormalization(input_record) - self.assertEqual(schema.Scalar((np.float32, (5,))), bn_output) - self.model.output_schema = schema.Struct() - - train_init_net, train_net = self.get_training_nets() - - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("ConstantFill", None, None), - OpSpec("ConstantFill", None, None), - OpSpec("ConstantFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - - input_blob = input_record.field_blobs()[0] - output_blob = bn_output.field_blobs()[0] - - expand_dims_spec = OpSpec( - "ExpandDims", - [input_blob], - None, - ) - - train_bn_spec = OpSpec( - "SpatialBN", - [None, init_ops[0].output[0], init_ops[1].output[0], - init_ops[2].output[0], init_ops[3].output[0]], - [output_blob, init_ops[2].output[0], init_ops[3].output[0], None, None], - {'is_test': 0, 'order': 'NCHW', 'momentum': 0.9}, - ) - - test_bn_spec = OpSpec( - "SpatialBN", - [None, init_ops[0].output[0], init_ops[1].output[0], - init_ops[2].output[0], init_ops[3].output[0]], - [output_blob], - {'is_test': 1, 'order': 'NCHW', 'momentum': 0.9}, - ) - - squeeze_spec = OpSpec( - "Squeeze", - [output_blob], - [output_blob], - ) - - self.assertNetContainOps( - train_net, - [expand_dims_spec, train_bn_spec, squeeze_spec] - ) - - eval_net = self.get_eval_net() - - self.assertNetContainOps( - eval_net, - [expand_dims_spec, test_bn_spec, squeeze_spec] - ) - - predict_net = self.get_predict_net() - - self.assertNetContainOps( - predict_net, - [expand_dims_spec, test_bn_spec, squeeze_spec] - ) - - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - - schema.FeedRecord(input_record, [X]) - workspace.RunNetOnce(eval_net) - - schema.FeedRecord(input_record, [X]) - workspace.RunNetOnce(predict_net) - - @given( - X=hu.arrays(dims=[2, 5, 6]), - use_layer_norm_op=st.booleans(), - ) - def testLayerNormalization(self, X, use_layer_norm_op): - expect = (5, 6,) - if not use_layer_norm_op: - X = X.reshape(10, 6) - expect = (6,) - input_record = self.new_record(schema.Scalar((np.float32, expect))) - schema.FeedRecord(input_record, [X]) - ln_output = self.model.LayerNormalization( - input_record, use_layer_norm_op=use_layer_norm_op - ) - self.assertEqual(schema.Scalar((np.float32, expect)), ln_output) - self.model.output_schema = schema.Struct() - - train_init_net, train_net = self.get_training_nets(add_constants=True) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - - @given( - X=hu.arrays(dims=[5, 2]), - num_to_collect=st.integers(min_value=1, max_value=10), - ) - def testLastNWindowCollector(self, X, num_to_collect): - input_record = self.new_record(schema.Scalar(np.float32)) - schema.FeedRecord(input_record, [X]) - last_n = self.model.LastNWindowCollector(input_record, num_to_collect) - self.run_train_net_forward_only() - output_record = schema.FetchRecord(last_n.last_n) - start = max(0, 5 - num_to_collect) - npt.assert_array_equal(X[start:], 
output_record()) - num_visited = schema.FetchRecord(last_n.num_visited) - npt.assert_array_equal([5], num_visited()) - - @given( - X=hu.arrays(dims=[5, 2]), - num_to_collect=st.integers(min_value=3, max_value=3), - ) - @settings(deadline=1000) - def testReservoirSamplingWithID(self, X, num_to_collect): - ID = np.array([1, 2, 3, 1, 2], dtype=np.int64) - input_record = self.new_record( - schema.Struct( - ('record', schema.Struct( - ('dense', schema.Scalar()), - )), - ('object_id', schema.Scalar(np.int64)), - ) - ) - schema.FeedRecord(input_record, [X, ID]) - packed_record = self.model.PackRecords( - input_record.record, 1, fields=input_record.record.field_names()) - reservoir_input = schema.Struct( - ('data', packed_record), - ('object_id', input_record.object_id), - ) - reservoir = self.model.ReservoirSampling( - reservoir_input, num_to_collect) - self.model.output_schema = schema.Struct() - train_init_net, train_net = \ - layer_model_instantiator.generate_training_nets_forward_only( - self.model) - workspace.RunNetOnce(train_init_net) - workspace.CreateNet(train_net) - workspace.RunNet(train_net.Proto().name, num_iter=2) - num_visited = schema.FetchRecord(reservoir.num_visited) - npt.assert_array_equal([3], num_visited()) - for param in self.model.params: - serialized = workspace.SerializeBlob(str(param)) - workspace.DeserializeBlob(str(param), serialized) - ID = np.array([3, 5, 3, 3, 5], dtype=np.int64) - schema.FeedRecord(input_record.object_id, [ID]) - workspace.RunNet(train_net.Proto().name, num_iter=2) - num_visited = schema.FetchRecord(reservoir.num_visited) - npt.assert_array_equal([2], num_visited()) - - def testUniformSampling(self): - input_record = self.new_record(schema.Scalar(np.int32)) - input_array = np.array([3, 10, 11, 15, 20, 99], dtype=np.int32) - schema.FeedRecord(input_record, [input_array]) - num_samples = 20 - num_elements = 100 - uniform_sampling_output = self.model.UniformSampling( - input_record, num_samples, num_elements) - self.model.loss = uniform_sampling_output - self.run_train_net() - samples = workspace.FetchBlob(uniform_sampling_output.samples()) - sampling_prob = workspace.FetchBlob( - uniform_sampling_output.sampling_prob()) - self.assertEqual(num_samples, len(samples)) - np.testing.assert_array_equal(input_array, samples[:len(input_array)]) - np.testing.assert_almost_equal( - np.array([float(num_samples) / num_elements] * num_samples, - dtype=np.float32), - sampling_prob - ) - - def testUniformSamplingWithIncorrectSampleSize(self): - input_record = self.new_record(schema.Scalar(np.int32)) - num_samples = 200 - num_elements = 100 - with self.assertRaises(AssertionError): - self.model.UniformSampling(input_record, num_samples, num_elements) - - def testGatherRecord(self): - indices = np.array([1, 3, 4], dtype=np.int32) - dense = np.array(list(range(20)), dtype=np.float32).reshape(10, 2) - lengths = np.array(list(range(10)), dtype=np.int32) - items = np.array(list(range(lengths.sum())), dtype=np.int64) - items_lengths = np.array(list(range(lengths.sum())), dtype=np.int32) - items_items = np.array(list(range(items_lengths.sum())), dtype=np.int64) - record = self.new_record(schema.Struct( - ('dense', schema.Scalar(np.float32)), - ('sparse', schema.Struct( - ('list', schema.List(np.int64)), - ('list_of_list', schema.List(schema.List(np.int64))), - )), - ('empty_struct', schema.Struct()) - )) - indices_record = self.new_record(schema.Scalar(np.int32)) - input_record = schema.Struct( - ('indices', indices_record), - ('record', record), - ) - schema.FeedRecord( 
- input_record, - [indices, dense, lengths, items, lengths, items_lengths, - items_items]) - gathered_record = self.model.GatherRecord(input_record) - self.assertTrue(schema.equal_schemas(gathered_record, record)) - - self.run_train_net_forward_only() - gathered_dense = workspace.FetchBlob(gathered_record.dense()) - np.testing.assert_array_equal( - np.concatenate([dense[i:i + 1] for i in indices]), gathered_dense) - gathered_lengths = workspace.FetchBlob( - gathered_record.sparse.list.lengths()) - np.testing.assert_array_equal( - np.concatenate([lengths[i:i + 1] for i in indices]), - gathered_lengths) - gathered_items = workspace.FetchBlob( - gathered_record.sparse.list.items()) - offsets = lengths.cumsum() - lengths - np.testing.assert_array_equal( - np.concatenate([ - items[offsets[i]: offsets[i] + lengths[i]] - for i in indices - ]), gathered_items) - - gathered_items_lengths = workspace.FetchBlob( - gathered_record.sparse.list_of_list.items.lengths()) - np.testing.assert_array_equal( - np.concatenate([ - items_lengths[offsets[i]: offsets[i] + lengths[i]] - for i in indices - ]), - gathered_items_lengths - ) - - nested_offsets = [] - nested_lengths = [] - nested_offset = 0 - j = 0 - for l in lengths: - nested_offsets.append(nested_offset) - nested_length = 0 - for _i in range(l): - nested_offset += items_lengths[j] - nested_length += items_lengths[j] - j += 1 - nested_lengths.append(nested_length) - - gathered_items_items = workspace.FetchBlob( - gathered_record.sparse.list_of_list.items.items()) - np.testing.assert_array_equal( - np.concatenate([ - items_items[nested_offsets[i]: - nested_offsets[i] + nested_lengths[i]] - for i in indices - ]), - gathered_items_items - ) - - def testMapToRange(self): - input_record = self.new_record(schema.Scalar(np.int32)) - indices_blob = self.model.MapToRange(input_record, - max_index=100).indices - self.model.output_schema = schema.Struct() - - train_init_net, train_net = self.get_training_nets() - - schema.FeedRecord( - input_record, - [np.array([10, 3, 20, 99, 15, 11, 3, 11], dtype=np.int32)] - ) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - indices = workspace.FetchBlob(indices_blob()) - np.testing.assert_array_equal( - np.array([1, 2, 3, 4, 5, 6, 2, 6], dtype=np.int32), - indices - ) - - schema.FeedRecord( - input_record, - [np.array([10, 3, 23, 35, 60, 15, 10, 15], dtype=np.int32)] - ) - workspace.RunNetOnce(train_net) - indices = workspace.FetchBlob(indices_blob()) - np.testing.assert_array_equal( - np.array([1, 2, 7, 8, 9, 5, 1, 5], dtype=np.int32), - indices - ) - - eval_net = self.get_eval_net() - - schema.FeedRecord( - input_record, - [np.array([10, 3, 23, 35, 60, 15, 200], dtype=np.int32)] - ) - workspace.RunNetOnce(eval_net) - indices = workspace.FetchBlob(indices_blob()) - np.testing.assert_array_equal( - np.array([1, 2, 7, 8, 9, 5, 0], dtype=np.int32), - indices - ) - - schema.FeedRecord( - input_record, - [np.array([10, 3, 23, 15, 101, 115], dtype=np.int32)] - ) - workspace.RunNetOnce(eval_net) - indices = workspace.FetchBlob(indices_blob()) - np.testing.assert_array_equal( - np.array([1, 2, 7, 5, 0, 0], dtype=np.int32), - indices - ) - - predict_net = self.get_predict_net() - - schema.FeedRecord( - input_record, - [np.array([3, 3, 20, 23, 151, 35, 60, 15, 200], dtype=np.int32)] - ) - workspace.RunNetOnce(predict_net) - indices = workspace.FetchBlob(indices_blob()) - np.testing.assert_array_equal( - np.array([2, 2, 3, 7, 0, 8, 9, 5, 0], dtype=np.int32), - indices - ) - - def 
testSelectRecordByContext(self): - float_features = self.model.input_feature_schema.float_features - - float_array = np.array([1.0, 2.0], dtype=np.float32) - - schema.FeedRecord(float_features, [float_array]) - - with Tags(Tags.EXCLUDE_FROM_PREDICTION): - log_float_features = self.model.Log(float_features, 1) - joined = self.model.SelectRecordByContext( - schema.Struct( - (InstantiationContext.PREDICTION, float_features), - (InstantiationContext.TRAINING, log_float_features), - # TODO: TRAIN_ONLY layers are also generated in eval - (InstantiationContext.EVAL, log_float_features), - ) - ) - - # model.output_schema has to be a struct - self.model.output_schema = schema.Struct(( - 'joined', joined - )) - predict_net = layer_model_instantiator.generate_predict_net(self.model) - workspace.RunNetOnce(predict_net) - predict_output = schema.FetchRecord(predict_net.output_record()) - npt.assert_array_equal(float_array, - predict_output['joined']()) - eval_net = layer_model_instantiator.generate_eval_net(self.model) - workspace.RunNetOnce(eval_net) - eval_output = schema.FetchRecord(eval_net.output_record()) - npt.assert_array_equal(np.log(float_array), - eval_output['joined']()) - _, train_net = ( - layer_model_instantiator.generate_training_nets_forward_only( - self.model - ) - ) - workspace.RunNetOnce(train_net) - train_output = schema.FetchRecord(train_net.output_record()) - npt.assert_array_equal(np.log(float_array), - train_output['joined']()) - - def testFunctionalLayer(self): - def normalize(net, in_record, out_record): - mean = net.ReduceFrontMean(in_record(), 1) - net.Sub( - [in_record(), mean], - out_record(), - broadcast=1) - normalized = self.model.Functional( - self.model.input_feature_schema.float_features, 1, - normalize, name="normalizer") - - # Attach metadata to one of the outputs and use it in FC - normalized.set_type((np.float32, 32)) - self.model.output_schema = self.model.FC(normalized, 2) - - predict_net = layer_model_instantiator.generate_predict_net( - self.model) - ops = predict_net.Proto().op - assert len(ops) == 3 - assert ops[0].type == "ReduceFrontMean" - assert ops[1].type == "Sub" - assert ops[2].type == "FC" - assert len(ops[0].input) == 1 - assert ops[0].input[0] ==\ - self.model.input_feature_schema.float_features() - assert len(ops[1].output) == 1 - assert ops[1].output[0] in ops[2].input - - def testFunctionalLayerHelper(self): - mean = self.model.ReduceFrontMean( - self.model.input_feature_schema.float_features, 1) - normalized = self.model.Sub( - schema.Tuple( - self.model.input_feature_schema.float_features, mean), - 1, broadcast=1) - # Attach metadata to one of the outputs and use it in FC - normalized.set_type((np.float32, (32,))) - self.model.output_schema = self.model.FC(normalized, 2) - - predict_net = layer_model_instantiator.generate_predict_net( - self.model) - ops = predict_net.Proto().op - assert len(ops) == 3 - assert ops[0].type == "ReduceFrontMean" - assert ops[1].type == "Sub" - assert ops[2].type == "FC" - assert len(ops[0].input) == 1 - assert ops[0].input[0] ==\ - self.model.input_feature_schema.float_features() - assert len(ops[1].output) == 1 - assert ops[1].output[0] in ops[2].input - - def testFunctionalLayerHelperAutoInference(self): - softsign = self.model.Softsign( - schema.Tuple(self.model.input_feature_schema.float_features), - 1) - assert softsign.field_type().base == np.float32 - assert softsign.field_type().shape == (32,) - self.model.output_schema = self.model.FC(softsign, 2) - - predict_net = 
layer_model_instantiator.generate_predict_net( - self.model) - ops = predict_net.Proto().op - assert len(ops) == 2 - assert ops[0].type == "Softsign" - assert ops[1].type == "FC" - assert len(ops[0].input) == 1 - assert ops[0].input[0] ==\ - self.model.input_feature_schema.float_features() - assert len(ops[0].output) == 1 - assert ops[0].output[0] in ops[1].input - - def testHalfToFloatTypeInference(self): - input = self.new_record(schema.Scalar((np.float32, (32,)))) - - output = self.model.FloatToHalf(input, 1) - assert output.field_type().base == np.float16 - assert output.field_type().shape == (32, ) - - output = self.model.HalfToFloat(output, 1) - assert output.field_type().base == np.float32 - assert output.field_type().shape == (32, ) - - def testFunctionalLayerHelperAutoInferenceScalar(self): - loss = self.model.AveragedLoss(self.model.input_feature_schema, 1) - self.assertEqual(1, len(loss.field_types())) - self.assertEqual(np.float32, loss.field_types()[0].base) - self.assertEqual(tuple(), loss.field_types()[0].shape) - - def testFunctionalLayerInputCoercion(self): - one = self.model.global_constants['ONE'] - two = self.model.Add([one, one], 1) - self.model.loss = two - self.run_train_net() - data = workspace.FetchBlob(two.field_blobs()[0]) - np.testing.assert_array_equal([2.0], data) - - def testFunctionalLayerWithOutputNames(self): - k = 3 - topk = self.model.TopK( - self.model.input_feature_schema, - output_names_or_num=['values', 'indices'], - k=k, - ) - self.assertEqual(2, len(topk.field_types())) - self.assertEqual(np.float32, topk.field_types()[0].base) - self.assertEqual((k,), topk.field_types()[0].shape) - self.assertEqual(np.int32, topk.field_types()[1].base) - self.assertEqual((k,), topk.field_types()[1].shape) - self.assertEqual(['TopK/values', 'TopK/indices'], topk.field_blobs()) - - def testFunctionalLayerSameOperatorOutputNames(self): - Con1 = self.model.ConstantFill([], 1, value=1) - Con2 = self.model.ConstantFill([], 1, value=2) - self.assertNotEqual(str(Con1), str(Con2)) - - def testFunctionalLayerWithOutputDtypes(self): - loss = self.model.AveragedLoss( - self.model.input_feature_schema, - 1, - output_dtypes=(np.float32, (1,)), - ) - self.assertEqual(1, len(loss.field_types())) - self.assertEqual(np.float32, loss.field_types()[0].base) - self.assertEqual((1,), loss.field_types()[0].shape) - - def testPropagateRequestOnly(self): - # test case when output is request only - input_record = self.new_record(schema.Struct( - ('input1', schema.Scalar((np.float32, (32, )))), - ('input2', schema.Scalar((np.float32, (64, )))), - ('input3', schema.Scalar((np.float32, (16, )))), - )) - - set_request_only(input_record) - concat_output = self.model.Concat(input_record) - self.assertEqual(is_request_only_scalar(concat_output), True) - - # test case when output is not request only - input_record2 = self.new_record(schema.Struct( - ('input4', schema.Scalar((np.float32, (100, )))) - )) + input_record - - concat_output2 = self.model.Concat(input_record2) - self.assertEqual(is_request_only_scalar(concat_output2), False) - - def testSetRequestOnly(self): - input_record = schema.Scalar(np.int64) - schema.attach_metadata_to_scalars( - input_record, - schema.Metadata( - categorical_limit=100000000, - expected_value=99, - feature_specs=schema.FeatureSpec( - feature_ids=[1, 100, 1001] - ) - ) - ) - - set_request_only(input_record) - self.assertEqual(input_record.metadata.categorical_limit, 100000000) - self.assertEqual(input_record.metadata.expected_value, 99) - self.assertEqual( - 
-
-    @given(
-        X=hu.arrays(dims=[5, 5]),  # Shape of X is irrelevant
-        dropout_for_eval=st.booleans(),
-    )
-    def testDropout(self, X, dropout_for_eval):
-        input_record = self.new_record(schema.Scalar((np.float32, (1,))))
-        schema.FeedRecord(input_record, [X])
-        d_output = self.model.Dropout(
-            input_record,
-            dropout_for_eval=dropout_for_eval
-        )
-        self.assertEqual(schema.Scalar((np.float32, (1,))), d_output)
-        self.model.output_schema = schema.Struct()
-
-        train_init_net, train_net = self.get_training_nets()
-
-        input_blob = input_record.field_blobs()[0]
-        output_blob = d_output.field_blobs()[0]
-
-        with_d_spec = OpSpec(
-            "Dropout",
-            [input_blob],
-            [output_blob, None],
-            {'is_test': 0, 'ratio': 0.5}
-        )
-
-        without_d_spec = OpSpec(
-            "Dropout",
-            [input_blob],
-            [output_blob, None],
-            {'is_test': 1, 'ratio': 0.5}
-        )
-
-        self.assertNetContainOps(
-            train_net,
-            [with_d_spec]
-        )
-
-        eval_net = self.get_eval_net()
-        predict_net = self.get_predict_net()
-
-        if dropout_for_eval:
-            self.assertNetContainOps(
-                eval_net,
-                [with_d_spec]
-            )
-            self.assertNetContainOps(
-                predict_net,
-                [with_d_spec]
-            )
-        else:
-            self.assertNetContainOps(
-                eval_net,
-                [without_d_spec]
-            )
-            self.assertNetContainOps(
-                predict_net,
-                [without_d_spec]
-            )
-
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-
-        schema.FeedRecord(input_record, [X])
-        workspace.RunNetOnce(eval_net)
-
-        schema.FeedRecord(input_record, [X])
-        workspace.RunNetOnce(predict_net)
-
-    @given(
-        num_inputs=st.integers(1, 3),
-        batch_size=st.integers(5, 10)
-    )
-    def testMergeIdListsLayer(self, num_inputs, batch_size):
-        inputs = []
-        for _ in range(num_inputs):
-            lengths = np.random.randint(5, size=batch_size).astype(np.int32)
-            size = lengths.sum()
-            values = np.random.randint(1, 10, size=size).astype(np.int64)
-            inputs.append(lengths)
-            inputs.append(values)
-        input_schema = schema.Tuple(
-            *[schema.List(
-                schema.Scalar(dtype=np.int64, metadata=schema.Metadata(
-                    categorical_limit=20
-                ))) for _ in range(num_inputs)]
-        )
-
-        input_record = schema.NewRecord(self.model.net, input_schema)
-        schema.FeedRecord(input_record, inputs)
-        output_schema = self.model.MergeIdLists(input_record)
-        assert schema.equal_schemas(
-            output_schema, IdList,
-            check_field_names=False)
-
-    @given(
-        batch_size=st.integers(min_value=2, max_value=10),
-        input_dims=st.integers(min_value=5, max_value=10),
-        output_dims=st.integers(min_value=5, max_value=10),
-        bandwidth=st.floats(min_value=0.1, max_value=5),
-    )
-    def testRandomFourierFeatures(self, batch_size, input_dims, output_dims, bandwidth):
-
-        def _rff_hypothesis_test(rff_output, X, W, b, scale):
-            '''
-            Runs hypothesis test for the Random Fourier Features layer.
-
-            Inputs:
-                rff_output -- output of net after running random fourier features layer
-                X -- input data
-                W -- weight parameter from train_init_net
-                b -- bias parameter from train_init_net
-                scale -- value by which to scale the output vector
-            '''
-            output = workspace.FetchBlob(rff_output)
-            output_ref = scale * np.cos(np.dot(X, np.transpose(W)) + b)
-            npt.assert_allclose(output, output_ref, rtol=1e-3, atol=1e-3)
-
-        X = np.random.random((batch_size, input_dims)).astype(np.float32)
-        scale = np.sqrt(2.0 / output_dims)
-        input_record = self.new_record(schema.Scalar((np.float32, (input_dims,))))
-        schema.FeedRecord(input_record, [X])
-        input_blob = input_record.field_blobs()[0]
-        rff_output = self.model.RandomFourierFeatures(input_record,
-                                                      output_dims,
-                                                      bandwidth)
-        self.model.output_schema = schema.Struct()
-
-        self.assertEqual(
-            schema.Scalar((np.float32, (output_dims, ))),
-            rff_output
-        )
-
-        train_init_net, train_net = self.get_training_nets()
-
-        # Init net assertions
-        init_ops_list = [
-            OpSpec("GaussianFill", None, None),
-            OpSpec("UniformFill", None, None),
-        ]
-        init_ops = self._test_net(train_init_net, init_ops_list)
-        W = workspace.FetchBlob(self.model.layers[0].w)
-        b = workspace.FetchBlob(self.model.layers[0].b)
-
-        # Operation specifications
-        fc_spec = OpSpec("FC", [input_blob, init_ops[0].output[0],
-                                init_ops[1].output[0]], None)
-        cosine_spec = OpSpec("Cos", None, None)
-        scale_spec = OpSpec("Scale", None, rff_output.field_blobs(),
-                            {'scale': scale})
-        ops_list = [
-            fc_spec,
-            cosine_spec,
-            scale_spec
-        ]
-
-        # Train net assertions
-        self._test_net(train_net, ops_list)
-        _rff_hypothesis_test(rff_output(), X, W, b, scale)
-
-        # Eval net assertions
-        eval_net = self.get_eval_net()
-        self._test_net(eval_net, ops_list)
-        _rff_hypothesis_test(rff_output(), X, W, b, scale)
-
-        # Predict net assertions
-        predict_net = self.get_predict_net()
-        self._test_net(predict_net, ops_list)
-        _rff_hypothesis_test(rff_output(), X, W, b, scale)
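The reference map being checked is small enough to state standalone. A NumPy sketch of the same computation, with arbitrary illustrative shapes (nothing below comes from the layer itself):

    import numpy as np

    batch_size, input_dims, output_dims = 4, 6, 8
    X = np.random.random((batch_size, input_dims)).astype(np.float32)
    W = np.random.randn(output_dims, input_dims).astype(np.float32)  # stands in for the FC weight
    b = np.random.random(output_dims).astype(np.float32)             # stands in for the FC bias
    scale = np.sqrt(2.0 / output_dims)
    # Same formula the hypothesis test asserts against:
    features = scale * np.cos(np.dot(X, W.T) + b)  # shape (batch_size, output_dims)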
-
-    @given(
-        batch_size=st.integers(min_value=2, max_value=10),
-        input_dims=st.integers(min_value=5, max_value=10),
-        output_dims=st.integers(min_value=5, max_value=10),
-        s=st.integers(min_value=0, max_value=3),
-        scale=st.floats(min_value=0.1, max_value=5),
-        set_weight_as_global_constant=st.booleans()
-    )
-    def testArcCosineFeatureMap(self, batch_size, input_dims, output_dims, s, scale,
-                                set_weight_as_global_constant):
-
-        def _arc_cosine_hypothesis_test(ac_output, X, W, b, s):
-            '''
-            Runs hypothesis test for Arc Cosine layer.
-
-            Inputs:
-                ac_output -- output of net after running arc cosine layer
-                X -- input data
-                W -- weight parameter from train_init_net
-                b -- bias parameter from train_init_net
-                s -- degree parameter
-            '''
-            # Get output from net
-            net_output = workspace.FetchBlob(ac_output)
-
-            # Computing output directly
-            x_rand = np.matmul(X, np.transpose(W)) + b
-            x_pow = np.power(x_rand, s)
-            if s > 0:
-                h_rand_features = np.piecewise(x_rand,
-                                               [x_rand <= 0, x_rand > 0],
-                                               [0, 1])
-            else:
-                h_rand_features = np.piecewise(x_rand,
-                                               [x_rand <= 0, x_rand > 0],
-                                               [0, lambda x: x / (1 + x)])
-            output_ref = np.multiply(x_pow, h_rand_features)
-
-            # Comparing net output and computed output
-            npt.assert_allclose(net_output, output_ref, rtol=1e-3, atol=1e-3)
-
-        X = np.random.normal(size=(batch_size, input_dims)).astype(np.float32)
-        input_record = self.new_record(schema.Scalar((np.float32, (input_dims,))))
-        schema.FeedRecord(input_record, [X])
-        input_blob = input_record.field_blobs()[0]
-
-        ac_output = self.model.ArcCosineFeatureMap(
-            input_record,
-            output_dims,
-            s=s,
-            scale=scale,
-            set_weight_as_global_constant=set_weight_as_global_constant
-        )
-        self.model.output_schema = schema.Struct()
-        self.assertEqual(
-            schema.Scalar((np.float32, (output_dims, ))),
-            ac_output
-        )
-
-        train_init_net, train_net = self.get_training_nets()
-
-        # Run create_init_net to initialize the global constants, and W and b
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(self.model.create_init_net(name='init_net'))
-
-        if set_weight_as_global_constant:
-            W = workspace.FetchBlob(
-                self.model.global_constants['arc_cosine_feature_map_fixed_rand_W']
-            )
-            b = workspace.FetchBlob(
-                self.model.global_constants['arc_cosine_feature_map_fixed_rand_b']
-            )
-        else:
-            W = workspace.FetchBlob(self.model.layers[0].random_w)
-            b = workspace.FetchBlob(self.model.layers[0].random_b)
-
-        # Operation specifications
-        fc_spec = OpSpec("FC", [input_blob, None, None], None)
-        softsign_spec = OpSpec("Softsign", None, None)
-        relu_spec = OpSpec("Relu", None, None)
-        relu_spec_output = OpSpec("Relu", None, ac_output.field_blobs())
-        pow_spec = OpSpec("Pow", None, None, {'exponent': float(s - 1)})
-        mul_spec = OpSpec("Mul", None, ac_output.field_blobs())
-
-        if s == 0:
-            ops_list = [
-                fc_spec,
-                softsign_spec,
-                relu_spec_output,
-            ]
-        elif s == 1:
-            ops_list = [
-                fc_spec,
-                relu_spec_output,
-            ]
-        else:
-            ops_list = [
-                fc_spec,
-                relu_spec,
-                pow_spec,
-                mul_spec,
-            ]
-
-        # Train net assertions
-        self._test_net(train_net, ops_list)
-        _arc_cosine_hypothesis_test(ac_output(), X, W, b, s)
-
-        # Eval net assertions
-        eval_net = self.get_eval_net()
-        self._test_net(eval_net, ops_list)
-        _arc_cosine_hypothesis_test(ac_output(), X, W, b, s)
-
-        # Predict net assertions
-        predict_net = self.get_predict_net()
-        self._test_net(predict_net, ops_list)
-        _arc_cosine_hypothesis_test(ac_output(), X, W, b, s)
-
-    @given(
-        batch_size=st.integers(min_value=2, max_value=10),
-        input_dims=st.integers(min_value=5, max_value=10),
-        output_dims=st.integers(min_value=5, max_value=10),
-        s=st.integers(min_value=0, max_value=3),
-        scale=st.floats(min_value=0.1, max_value=5),
-        set_weight_as_global_constant=st.booleans(),
-        use_struct_input=st.booleans(),
-    )
-    def testSemiRandomFeatures(self, batch_size, input_dims, output_dims, s, scale,
-                               set_weight_as_global_constant, use_struct_input):
-
-        def _semi_random_hypothesis_test(srf_output, X_full, X_random, rand_w,
-                                         rand_b, s):
-            '''
-            Runs hypothesis test for Semi Random Features layer.
-
-            Inputs:
-                srf_output -- output of net after running semi random features layer
-                X_full -- full input data
-                X_random -- random-output input data
-                rand_w -- random-initialized weight parameter from train_init_net
-                rand_b -- random-initialized bias parameter from train_init_net
-                s -- degree parameter
-
-            '''
-            # Get output from net
-            net_output = workspace.FetchBlob(srf_output)
-
-            # Fetch learned parameter blobs
-            learned_w = workspace.FetchBlob(self.model.layers[0].learned_w)
-            learned_b = workspace.FetchBlob(self.model.layers[0].learned_b)
-
-            # Computing output directly
-            x_rand = np.matmul(X_random, np.transpose(rand_w)) + rand_b
-            x_learn = np.matmul(X_full, np.transpose(learned_w)) + learned_b
-            x_pow = np.power(x_rand, s)
-            if s > 0:
-                h_rand_features = np.piecewise(x_rand,
-                                               [x_rand <= 0, x_rand > 0],
-                                               [0, 1])
-            else:
-                h_rand_features = np.piecewise(x_rand,
-                                               [x_rand <= 0, x_rand > 0],
-                                               [0, lambda x: x / (1 + x)])
-            output_ref = np.multiply(np.multiply(x_pow, h_rand_features), x_learn)
-
-            # Comparing net output and computed output
-            npt.assert_allclose(net_output, output_ref, rtol=1e-3, atol=1e-3)
-
-        X_full = np.random.normal(size=(batch_size, input_dims)).astype(np.float32)
-        if use_struct_input:
-            X_random = np.random.normal(size=(batch_size, input_dims)).\
-                astype(np.float32)
-            input_data = [X_full, X_random]
-            input_record = self.new_record(schema.Struct(
-                ('full', schema.Scalar(
-                    (np.float32, (input_dims,))
-                )),
-                ('random', schema.Scalar(
-                    (np.float32, (input_dims,))
-                ))
-            ))
-        else:
-            X_random = X_full
-            input_data = [X_full]
-            input_record = self.new_record(schema.Scalar(
-                (np.float32, (input_dims,))
-            ))
-
-        schema.FeedRecord(input_record, input_data)
-        srf_output = self.model.SemiRandomFeatures(
-            input_record,
-            output_dims,
-            s=s,
-            scale_random=scale,
-            scale_learned=scale,
-            set_weight_as_global_constant=set_weight_as_global_constant
-        )
-
-        self.model.output_schema = schema.Struct()
-
-        self.assertEqual(
-            schema.Struct(
-                ('full', schema.Scalar(
-                    (np.float32, (output_dims,))
-                )),
-                ('random', schema.Scalar(
-                    (np.float32, (output_dims,))
-                ))
-            ),
-            srf_output
-        )
-
-        init_ops_list = [
-            OpSpec("GaussianFill", None, None),
-            OpSpec("UniformFill", None, None),
-            OpSpec("GaussianFill", None, None),
-            OpSpec("UniformFill", None, None),
-        ]
-        train_init_net, train_net = self.get_training_nets()
-
-        # Need to run to initialize the global constants for layer
-        workspace.RunNetOnce(self.model.create_init_net(name='init_net'))
-
-        if set_weight_as_global_constant:
-            # If weight params are global constants, they won't be in train_init_net
-            init_ops = self._test_net(train_init_net, init_ops_list[:2])
-            rand_w = workspace.FetchBlob(
-                self.model.global_constants['semi_random_features_fixed_rand_W']
-            )
-            rand_b = workspace.FetchBlob(
-                self.model.global_constants['semi_random_features_fixed_rand_b']
-            )
-
-            # Operation specifications
-            fc_random_spec = OpSpec("FC", [None, None, None], None)
-            fc_learned_spec = OpSpec("FC", [None, init_ops[0].output[0],
-                                            init_ops[1].output[0]], None)
-        else:
-            init_ops = self._test_net(train_init_net, init_ops_list)
-            rand_w = workspace.FetchBlob(self.model.layers[0].random_w)
-            rand_b = workspace.FetchBlob(self.model.layers[0].random_b)
-
-            # Operation specifications
-            fc_random_spec = OpSpec("FC", [None, init_ops[0].output[0],
-                                           init_ops[1].output[0]], None)
-            fc_learned_spec = OpSpec("FC", [None, init_ops[2].output[0],
-                                            init_ops[3].output[0]], None)
-
-        softsign_spec = OpSpec("Softsign", None, None)
-        relu_spec = OpSpec("Relu", None, None)
OpSpec("Relu", None, None) - relu_output_spec = OpSpec("Relu", None, srf_output.random.field_blobs()) - pow_spec = OpSpec("Pow", None, None, {'exponent': float(s - 1)}) - mul_interim_spec = OpSpec("Mul", None, srf_output.random.field_blobs()) - mul_spec = OpSpec("Mul", None, srf_output.full.field_blobs()) - - if s == 0: - ops_list = [ - fc_learned_spec, - fc_random_spec, - softsign_spec, - relu_output_spec, - mul_spec, - ] - elif s == 1: - ops_list = [ - fc_learned_spec, - fc_random_spec, - relu_output_spec, - mul_spec, - ] - else: - ops_list = [ - fc_learned_spec, - fc_random_spec, - relu_spec, - pow_spec, - mul_interim_spec, - mul_spec, - ] - - # Train net assertions - self._test_net(train_net, ops_list) - _semi_random_hypothesis_test(srf_output.full(), X_full, X_random, - rand_w, rand_b, s) - - # Eval net assertions - eval_net = self.get_eval_net() - self._test_net(eval_net, ops_list) - _semi_random_hypothesis_test(srf_output.full(), X_full, X_random, - rand_w, rand_b, s) - - # Predict net assertions - predict_net = self.get_predict_net() - self._test_net(predict_net, ops_list) - _semi_random_hypothesis_test(srf_output.full(), X_full, X_random, - rand_w, rand_b, s) - - def testConv(self): - batch_size = 50 - H = 1 - W = 10 - C = 50 - output_dims = 32 - kernel_h = 1 - kernel_w = 3 - stride_h = 1 - stride_w = 1 - pad_t = 0 - pad_b = 0 - pad_r = None - pad_l = None - - input_record = self.new_record(schema.Scalar((np.float32, (H, W, C)))) - X = np.random.random((batch_size, H, W, C)).astype(np.float32) - schema.FeedRecord(input_record, [X]) - conv = self.model.Conv( - input_record, - output_dims, - kernel_h=kernel_h, - kernel_w=kernel_w, - stride_h=stride_h, - stride_w=stride_w, - pad_t=pad_t, - pad_b=pad_b, - pad_r=pad_r, - pad_l=pad_l, - order='NHWC' - ) - - self.assertEqual( - schema.Scalar((np.float32, (output_dims,))), - conv - ) - - self.run_train_net_forward_only() - output_record = schema.FetchRecord(conv) - # check the number of output channels is the same as input in this example - assert output_record.field_types()[0].shape == (H, W, output_dims) - assert output_record().shape == (batch_size, H, W, output_dims) - - train_init_net, train_net = self.get_training_nets() - # Init net assertions - init_ops = self.assertNetContainOps( - train_init_net, - [ - OpSpec("XavierFill", None, None), - OpSpec("ConstantFill", None, None), - ] - ) - conv_spec = OpSpec( - "Conv", - [ - input_record.field_blobs()[0], - init_ops[0].output[0], - init_ops[1].output[0], - ], - conv.field_blobs() - ) - - # Train net assertions - self.assertNetContainOps(train_net, [conv_spec]) - - # Predict net assertions - predict_net = self.get_predict_net() - self.assertNetContainOps(predict_net, [conv_spec]) - - # Eval net assertions - eval_net = self.get_eval_net() - self.assertNetContainOps(eval_net, [conv_spec]) - - @given( - num=st.integers(min_value=10, max_value=100), - feed_weight=st.booleans(), - use_inv_var_parameterization=st.booleans(), - use_log_barrier=st.booleans(), - enable_diagnose=st.booleans(), - **hu.gcs - ) - @settings(deadline=1000) - def testAdaptiveWeight( - self, num, feed_weight, use_inv_var_parameterization, use_log_barrier, - enable_diagnose, gc, dc - ): - input_record = self.new_record(schema.RawTuple(num)) - data = np.random.random(num) - schema.FeedRecord( - input_record, [np.array(x).astype(np.float32) for x in data] - ) - weights = np.random.random(num) if feed_weight else None - result = self.model.AdaptiveWeight( - input_record, - weights=weights, - estimation_method=( - 'inv_var' 
-
-    @given(
-        num=st.integers(min_value=10, max_value=100),
-        feed_weight=st.booleans(),
-        use_inv_var_parameterization=st.booleans(),
-        use_log_barrier=st.booleans(),
-        enable_diagnose=st.booleans(),
-        **hu.gcs
-    )
-    @settings(deadline=1000)
-    def testAdaptiveWeight(
-        self, num, feed_weight, use_inv_var_parameterization, use_log_barrier,
-        enable_diagnose, gc, dc
-    ):
-        input_record = self.new_record(schema.RawTuple(num))
-        data = np.random.random(num)
-        schema.FeedRecord(
-            input_record, [np.array(x).astype(np.float32) for x in data]
-        )
-        weights = np.random.random(num) if feed_weight else None
-        result = self.model.AdaptiveWeight(
-            input_record,
-            weights=weights,
-            estimation_method=(
-                'inv_var' if use_inv_var_parameterization else 'log_std'
-            ),
-            pos_optim_method=(
-                'log_barrier' if use_log_barrier else 'pos_grad_proj'
-            ),
-            enable_diagnose=enable_diagnose
-        )
-        train_init_net, train_net = self.get_training_nets(True)
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-        result = workspace.FetchBlob(result())
-        if not feed_weight:
-            weights = np.array([1. / num for _ in range(num)])
-        expected = np.sum(weights * data + 0.5 * np.log(1. / 2. / weights))
-        npt.assert_allclose(expected, result, atol=1e-4, rtol=1e-4)
-        if enable_diagnose:
-            assert len(self.model.ad_hoc_plot_blobs) == num
-            reconst_weights_from_ad_hoc = np.array(
-                [workspace.FetchBlob(b) for b in self.model.ad_hoc_plot_blobs]
-            ).flatten()
-            npt.assert_allclose(
-                reconst_weights_from_ad_hoc, weights, atol=1e-4, rtol=1e-4
-            )
-        else:
-            assert len(self.model.ad_hoc_plot_blobs) == 0
-
-    @given(num=st.integers(min_value=10, max_value=100), **hu.gcs)
-    def testConstantWeight(self, num, gc, dc):
-        input_record = self.new_record(schema.RawTuple(num))
-        data = np.random.random(num)
-        schema.FeedRecord(
-            input_record, [np.array(x).astype(np.float32) for x in data]
-        )
-        weights = np.random.random(num)
-        result = self.model.ConstantWeight(input_record, weights=weights)
-        train_init_net, train_net = self.get_training_nets(True)
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-        result = workspace.FetchBlob(result())
-        expected = np.sum(weights * data)
-        npt.assert_allclose(expected, result, atol=1e-4, rtol=1e-4)
-
-    @given(**hu.gcs)
-    @settings(deadline=10000)
-    def testHomotopyWeight(self, gc, dc):
-        input_record = self.new_record(schema.RawTuple(2))
-        data = np.random.random(2)
-        schema.FeedRecord(
-            input_record, [np.array(x).astype(np.float32) for x in data]
-        )
-        # ensure: quad_life > 2 * half_life
-        half_life = int(np.random.random() * 1e2 + 1)
-        quad_life = int(np.random.random() * 1e3 + 2 * half_life + 1)
-        min_weight = np.random.random()
-        max_weight = np.random.random() + min_weight + 1e-5
-        result = self.model.HomotopyWeight(
-            input_record,
-            min_weight=min_weight,
-            max_weight=max_weight,
-            half_life=half_life,
-            quad_life=quad_life,
-        )
-        train_init_net, train_net = self.get_training_nets(True)
-        workspace.RunNetOnce(train_init_net)
-        workspace.CreateNet(train_net)
-        workspace.RunNet(train_net.Name(), num_iter=half_life)
-        half_life_result = workspace.FetchBlob(result())
-        workspace.RunNet(train_net.Name(), num_iter=quad_life - half_life)
-        quad_life_result = workspace.FetchBlob(result())
-
-        alpha = (min_weight + max_weight) / 2.
-        beta = (min_weight + max_weight) / 2.
-        expected_half_life_result = alpha * data[0] + beta * data[1]
-        alpha = (3 * min_weight + max_weight) / 4.
-        beta = (min_weight + 3 * max_weight) / 4.
-        expected_quad_life_result = alpha * data[0] + beta * data[1]
-        npt.assert_allclose(
-            expected_half_life_result, half_life_result, atol=1e-2, rtol=1e-2
-        )
-        npt.assert_allclose(
-            expected_quad_life_result, quad_life_result, atol=1e-2, rtol=1e-2
-        )
-
-    def _testLabelSmooth(self, categories, binary_prob_label, bsz):
-        label = self.new_record(schema.Scalar((np.float32, (1, ))))
-        label_np = np.random.randint(categories, size=bsz).astype(np.float32)
-        schema.FeedRecord(label, [label_np])
-        smooth_matrix_shape = (
-            2 if binary_prob_label else (categories, categories)
-        )
-        smooth_matrix = np.random.random(smooth_matrix_shape)
-        smoothed_label = self.model.LabelSmooth(label, smooth_matrix)
-        train_init_net, train_net = self.get_training_nets(True)
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-        smoothed_label_np = workspace.FetchBlob(smoothed_label())
-        if binary_prob_label:
-            expected = np.array(
-                [
-                    smooth_matrix[0] if x == 0.0 else smooth_matrix[1]
-                    for x in label_np
-                ]
-            )
-        else:
-            expected = np.array([smooth_matrix[int(x)] for x in label_np])
-        npt.assert_allclose(expected, smoothed_label_np, atol=1e-4, rtol=1e-4)
-
-    @given(
-        categories=st.integers(min_value=2, max_value=10),
-        bsz=st.integers(min_value=10, max_value=100),
-        **hu.gcs
-    )
-    def testLabelSmoothForCategoricalLabel(self, categories, bsz, gc, dc):
-        self._testLabelSmooth(categories, False, bsz)
-
-    @given(
-        bsz=st.integers(min_value=10, max_value=100),
-        **hu.gcs
-    )
-    def testLabelSmoothForBinaryProbLabel(self, bsz, gc, dc):
-        self._testLabelSmooth(2, True, bsz)
-
-    @given(
-        num_inputs=st.integers(min_value=2, max_value=10),
-        batch_size=st.integers(min_value=2, max_value=10),
-        input_dim=st.integers(min_value=5, max_value=10),
-        seed=st.integers(1, 10),
-    )
-    def testBlobWeightedSum(self, num_inputs, batch_size, input_dim, seed):
-
-        def get_blob_weighted_sum():
-            weights = []
-            for i in range(num_inputs):
-                w_blob_name = 'blob_weighted_sum/w_{0}'.format(i)
-                assert workspace.HasBlob(w_blob_name), (
-                    "cannot find blob {}".format(w_blob_name)
-                )
-                w = workspace.FetchBlob(w_blob_name)
-                weights.append(w)
-
-            result = np.sum([
-                input_data[idx] * weights[idx] for idx in range(num_inputs)
-            ], axis=0)
-            return result
-
-        np.random.seed(seed)
-        expected_output_schema = schema.Scalar((np.float32, (input_dim,)))
-        input_schema = schema.Tuple(
-            *[expected_output_schema for _ in range(num_inputs)]
-        )
-        input_data = [
-            np.random.random((batch_size, input_dim)).astype(np.float32)
-            for _ in range(num_inputs)
-        ]
-        input_record = self.new_record(input_schema)
-        schema.FeedRecord(input_record, input_data)
-
-        # test output schema
-        ws_output = self.model.BlobWeightedSum(input_record)
-        self.assertEqual(len(self.model.layers), 1)
-        assert schema.equal_schemas(ws_output, expected_output_schema)
-
-        # test train net
-        train_init_net, train_net = self.get_training_nets()
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-        output = workspace.FetchBlob(ws_output())
-        npt.assert_almost_equal(get_blob_weighted_sum(), output, decimal=5)
-
-        self.run_train_net_forward_only()
-        output = workspace.FetchBlob(ws_output())
-        npt.assert_almost_equal(get_blob_weighted_sum(), output, decimal=5)
-
-        # test eval net
-        eval_net = self.get_eval_net()
-        workspace.RunNetOnce(eval_net)
-        output = workspace.FetchBlob(ws_output())
-        npt.assert_almost_equal(get_blob_weighted_sum(), output, decimal=5)
-
-        # test pred net
-        pred_net = self.get_predict_net()
-        workspace.RunNetOnce(pred_net)
-        output = workspace.FetchBlob(ws_output())
-        npt.assert_almost_equal(get_blob_weighted_sum(), output, decimal=5)
-
-    def testFeatureSparseToDenseGetAccessedFeatures(self):
-        float_features_column = "float_features"
-        float_features_type = "FLOAT"
-        float_features_ids = [1, 2, 3]
-
-        id_list_features_column = "id_list_features"
-        id_list_features_type = "ID_LIST"
-        id_list_features_ids = [4, 5, 6]
-
-        id_score_list_features_column = "id_score_list_features"
-        id_score_list_features_type = "ID_SCORE_LIST"
-        id_score_list_features_ids = [7, 8, 9]
-
-        feature_names = ["a", "b", "c"]
-
-        input_record = self.new_record(schema.Struct(
-            (float_features_column, schema.Map(np.int32, np.float32)),
-            (id_list_features_column,
-             schema.Map(np.int32, schema.List(np.int64))),
-            (id_score_list_features_column,
-             schema.Map(np.int32, schema.Map(np.int64, np.float32))),
-        ))
-
-        input_specs = [
-            (
-                float_features_column,
-                schema.FeatureSpec(
-                    feature_type=float_features_type,
-                    feature_ids=float_features_ids,
-                    feature_names=feature_names,
-                ),
-            ),
-            (
-                id_list_features_column,
-                schema.FeatureSpec(
-                    feature_type=id_list_features_type,
-                    feature_ids=id_list_features_ids,
-                    feature_names=feature_names,
-                ),
-            ),
-            (
-                id_score_list_features_column,
-                schema.FeatureSpec(
-                    feature_type=id_score_list_features_type,
-                    feature_ids=id_score_list_features_ids,
-                    feature_names=feature_names,
-                ),
-            ),
-        ]
-
-        self.model.FeatureSparseToDense(input_record, input_specs)
-
-        expected_accessed_features = {
-            float_features_column: [
-                AccessedFeatures(float_features_type, set(float_features_ids))],
-            id_list_features_column: [
-                AccessedFeatures(id_list_features_type, set(id_list_features_ids))],
-            id_score_list_features_column: [
-                AccessedFeatures(id_score_list_features_type, set(id_score_list_features_ids))],
-        }
-
-        self.assertEqual(len(self.model.layers), 1)
-        self.assertEqual(
-            self.model.layers[0].get_accessed_features(),
-            expected_accessed_features
-        )
-
-    def test_get_key(self):
-        def _is_id_list(input_record):
-            return almost_equal_schemas(input_record, IdList)
-
-        def _is_id_score_list(input_record):
-            return almost_equal_schemas(input_record,
-                                        IdScoreList,
-                                        check_field_types=False)
-
-        def old_get_sparse_key_logic(input_record):
-            if _is_id_list(input_record):
-                sparse_key = input_record.items()
-            elif _is_id_score_list(input_record):
-                sparse_key = input_record.keys()
-            else:
-                raise NotImplementedError()
-            return sparse_key
-
-        id_score_list_record = schema.NewRecord(
-            self.model.net,
-            schema.Map(
-                schema.Scalar(
-                    np.int64,
-                    metadata=schema.Metadata(
-                        categorical_limit=1000
-                    ),
-                ),
-                np.float32
-            )
-        )
-
-        self.assertEqual(
-            get_key(id_score_list_record)(),
-            old_get_sparse_key_logic(id_score_list_record)
-        )
-
-        id_list_record = schema.NewRecord(
-            self.model.net,
-            schema.List(
-                schema.Scalar(
-                    np.int64,
-                    metadata=schema.Metadata(categorical_limit=1000)
-                )
-            )
-        )
-
-        self.assertEqual(
-            get_key(id_list_record)(),
-            old_get_sparse_key_logic(id_list_record)
-        )
-
-    def testSparseLookupWithAttentionWeightOnIdScoreList(self):
-        record = schema.NewRecord(
-            self.model.net,
-            schema.Map(
-                schema.Scalar(
-                    np.int64,
-                    metadata=schema.Metadata(categorical_limit=1000),
-                ),
-                np.float32,
-            ),
-        )
-        embedding_dim = 64
-        embedding_after_pooling = self.model.SparseLookup(
-            record, [embedding_dim], "Sum", use_external_weights=True
-        )
-        self.model.output_schema = schema.Struct()
-        self.assertEqual(
-            schema.Scalar((np.float32, (embedding_dim,))), embedding_after_pooling
-        )
-
-        train_init_net, train_net = self.get_training_nets()
-
-        init_ops = self.assertNetContainOps(
-            train_init_net,
-            [OpSpec("UniformFill", None, None), OpSpec("ConstantFill", None, None)],
-        )
-        sparse_lookup_op_spec = OpSpec(
-            "SparseLengthsWeightedSum",
-            [
-                init_ops[0].output[0],
-                record.values(),
-                record.keys(),
-                record.lengths(),
-            ],
-            [embedding_after_pooling()],
-        )
-        self.assertNetContainOps(train_net, [sparse_lookup_op_spec])
-
-        predict_net = self.get_predict_net()
-        self.assertNetContainOps(predict_net, [sparse_lookup_op_spec])
-
-    def testSparseItemwiseDropoutWithReplacement(self):
-        input_record = schema.NewRecord(self.model.net, IdList)
-        self.model.output_schema = schema.Struct()
-
-        lengths_blob = input_record.field_blobs()[0]
-        values_blob = input_record.field_blobs()[1]
-        lengths = np.array([1] * 10).astype(np.int32)
-        values = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64)
-        workspace.FeedBlob(lengths_blob, lengths)
-        workspace.FeedBlob(values_blob, values)
-
-        out = self.model.SparseItemwiseDropoutWithReplacement(
-            input_record, 0.0, 0.5, 1.0, -1, output_names_or_num=1)
-        self.assertEqual(schema.List(schema.Scalar(np.int64,)), out)
-
-        train_init_net, train_net = self.get_training_nets()
-        eval_net = self.get_eval_net()
-        predict_net = self.get_predict_net()
-
-        workspace.RunNetOnce(train_init_net)
-        workspace.RunNetOnce(train_net)
-        out_values = workspace.FetchBlob(out.items())
-        out_lengths = workspace.FetchBlob(out.lengths())
-        self.assertBlobsEqual(out_values, values)
-        self.assertBlobsEqual(out_lengths, lengths)
-
-        workspace.RunNetOnce(eval_net)
-
-        workspace.RunNetOnce(predict_net)
-        predict_values = workspace.FetchBlob("values_auto_0")
-        predict_lengths = workspace.FetchBlob("lengths_auto_0")
-        self.assertBlobsEqual(predict_values, np.array([-1] * 10).astype(np.int64))
-        self.assertBlobsEqual(predict_lengths, lengths)
diff --git a/caffe2/python/lazy.py b/caffe2/python/lazy.py
deleted file mode 100644
index 8e61f41767c8..000000000000
--- a/caffe2/python/lazy.py
+++ /dev/null
@@ -1,14 +0,0 @@
-## @package lazy
-# Module caffe2.python.lazy
-
-_import_lazy_calls = []
-
-def RegisterLazyImport(lazy):
-    global _import_lazy_calls
-    _import_lazy_calls += [lazy]
-
-
-def TriggerLazyImport():
-    global _import_lazy_calls
-    for lazy in _import_lazy_calls:
-        lazy()
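The removed module is nothing more than a callback registry: consumers append zero-argument callables, and any code path that needs operators fully registered calls TriggerLazyImport() before proceeding. A sketch of the round trip (the _load callable is illustrative):

    from caffe2.python import lazy

    def _load():
        # Deferred, potentially expensive registration work goes here.
        print("registering ops")

    lazy.RegisterLazyImport(_load)
    # ... later, at the first point that actually needs the ops:
    lazy.TriggerLazyImport()  # invokes every registered callable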
diff --git a/caffe2/python/lazy_dyndep.py b/caffe2/python/lazy_dyndep.py
deleted file mode 100644
index e53d4fda350b..000000000000
--- a/caffe2/python/lazy_dyndep.py
+++ /dev/null
@@ -1,84 +0,0 @@
-## @package lazy_dyndep
-# Module caffe2.python.lazy_dyndep
-
-import os
-from caffe2.python import dyndep, lazy
-
-
-def RegisterOpsLibrary(name):
-    """Registers a dynamic library that contains custom operators into Caffe2.
-
-    Since Caffe2 uses static variable registration, you can optionally load a
-    separate .so file that contains custom operators and registers them into
-    the caffe2 core binary. In C++, this is usually done by either declaring
-    a dependency at compilation time, or via dynload. This allows us to do
-    registration similarly on the Python side.
-
-    Unlike dyndep.InitOpsLibrary, this does not actually load the C++ file
-    and refresh operators until caffe2 is called in a fashion which requires
-    operators. In some large codebases this saves a large amount of time
-    during import.
-
-    It is safe to use within a program that also uses dyndep.InitOpsLibrary.
-
-    Args:
-        name: a name that ends in .so, such as "my_custom_op.so". Otherwise,
-            the command will simply be ignored.
-    Returns:
-        None
-    """
-    if not os.path.exists(name):
-        # Note(jiayq): if the name does not exist, instead of immediately
-        # failing we will simply print a warning, deferring failure to the
-        # time when an actual call is made.
-        print('Ignoring {} as it is not a valid file.'.format(name))
-        return
-    global _LAZY_IMPORTED_DYNDEPS
-    _LAZY_IMPORTED_DYNDEPS.add(name)
-
-
-_LAZY_IMPORTED_DYNDEPS = set()
-_error_handler = None
-
-
-def SetErrorHandler(handler):
-    """Registers an error handler for errors from registering operators.
-
-    Since the lazy registration may happen at a much later time, having a
-    dedicated error handler allows for custom error handling logic. It is
-    highly recommended to set this to prevent errors from bubbling up in
-    weird parts of the code.
-
-    Args:
-        handler: a function that takes an exception as its single argument.
-    Returns:
-        None
-    """
-
-    global _error_handler
-    _error_handler = handler
-
-
-def GetImportedOpsLibraries():
-    _import_lazy()
-    return dyndep.GetImportedOpsLibraries()
-
-
-def _import_lazy():
-    global _LAZY_IMPORTED_DYNDEPS
-    if not _LAZY_IMPORTED_DYNDEPS:
-        return
-    for name in list(_LAZY_IMPORTED_DYNDEPS):
-        try:
-            dyndep.InitOpsLibrary(name, trigger_lazy=False)
-        except BaseException as e:
-            if _error_handler:
-                _error_handler(e)
-        finally:
-            _LAZY_IMPORTED_DYNDEPS.remove(name)
-
-lazy.RegisterLazyImport(_import_lazy)
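Read together, the docstrings describe the intended call pattern: install an error handler, register the .so up front, and let the first operator-dependent call pay the load cost. A hedged sketch (the library path is a placeholder, not a real file):

    from caffe2.python import lazy_dyndep

    def handler(e):
        # Re-raise lazy-load failures somewhere predictable.
        raise RuntimeError("failed to load custom ops") from e

    lazy_dyndep.SetErrorHandler(handler)
    lazy_dyndep.RegisterOpsLibrary("/path/to/my_custom_op.so")  # placeholder path
    # Nothing is loaded yet; the actual import happens on first use, e.g.:
    libs = lazy_dyndep.GetImportedOpsLibraries()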
diff --git a/caffe2/python/lazy_dyndep_test.py b/caffe2/python/lazy_dyndep_test.py
deleted file mode 100644
index f75432550332..000000000000
--- a/caffe2/python/lazy_dyndep_test.py
+++ /dev/null
@@ -1,133 +0,0 @@
-#!/usr/bin/env python3
-
-from hypothesis import given, settings
-import hypothesis.strategies as st
-from multiprocessing import Process
-
-import numpy as np
-import tempfile
-import shutil
-
-import caffe2.python.hypothesis_test_util as hu
-import unittest
-
-op_engine = 'GLOO'
-
-class TemporaryDirectory:
-    def __enter__(self):
-        self.tmpdir = tempfile.mkdtemp()
-        return self.tmpdir
-
-    def __exit__(self, type, value, traceback):
-        shutil.rmtree(self.tmpdir)
-
-
-def allcompare_process(filestore_dir, process_id, data, num_procs):
-    from caffe2.python import core, data_parallel_model, workspace, lazy_dyndep
-    from caffe2.python.model_helper import ModelHelper
-    from caffe2.proto import caffe2_pb2
-    lazy_dyndep.RegisterOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
-
-    workspace.RunOperatorOnce(
-        core.CreateOperator(
-            "FileStoreHandlerCreate", [], ["store_handler"], path=filestore_dir
-        )
-    )
-    rendezvous = dict(
-        kv_handler="store_handler",
-        shard_id=process_id,
-        num_shards=num_procs,
-        engine=op_engine,
-        exit_nets=None
-    )
-
-    model = ModelHelper()
-    model._rendezvous = rendezvous
-
-    workspace.FeedBlob("test_data", data)
-
-    data_parallel_model._RunComparison(
-        model, "test_data", core.DeviceOption(caffe2_pb2.CPU, 0)
-    )
-
-
-class TestLazyDynDepAllCompare(hu.HypothesisTestCase):
-    @given(
-        d=st.integers(1, 5), n=st.integers(2, 11), num_procs=st.integers(1, 8)
-    )
-    @settings(deadline=None)
-    def test_allcompare(self, d, n, num_procs):
-        dims = []
-        for _ in range(d):
-            dims.append(np.random.randint(1, high=n))
-        test_data = np.random.ranf(size=tuple(dims)).astype(np.float32)
-
-        with TemporaryDirectory() as tempdir:
-            processes = []
-            for idx in range(num_procs):
-                process = Process(
-                    target=allcompare_process,
-                    args=(tempdir, idx, test_data, num_procs)
-                )
-                processes.append(process)
-                process.start()
-
-            while len(processes) > 0:
-                process = processes.pop()
-                process.join()
-
-class TestLazyDynDepError(unittest.TestCase):
-    def test_errorhandler(self):
-        from caffe2.python import core, lazy_dyndep
-        import tempfile
-
-        with tempfile.NamedTemporaryFile() as f:
-            lazy_dyndep.RegisterOpsLibrary(f.name)
-
-            def handler(e):
-                raise ValueError("test")
-            lazy_dyndep.SetErrorHandler(handler)
-            with self.assertRaises(ValueError, msg="test"):
-                core.RefreshRegisteredOperators()
-
-    def test_importaftererror(self):
-        from caffe2.python import core, lazy_dyndep
-        import tempfile
-
-        with tempfile.NamedTemporaryFile() as f:
-            lazy_dyndep.RegisterOpsLibrary(f.name)
-
-            def handler(e):
-                raise ValueError("test")
-            lazy_dyndep.SetErrorHandler(handler)
-            with self.assertRaises(ValueError):
-                core.RefreshRegisteredOperators()
-
-        def handlernoop(e):
-            raise
-        lazy_dyndep.SetErrorHandler(handlernoop)
-        lazy_dyndep.RegisterOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
-        core.RefreshRegisteredOperators()
-
-    def test_workspacecreatenet(self):
-        from caffe2.python import workspace, lazy_dyndep
-        import tempfile
-
-        with tempfile.NamedTemporaryFile() as f:
-            lazy_dyndep.RegisterOpsLibrary(f.name)
-            called = False
-
-            def handler(e):
-                raise ValueError("test")
-            lazy_dyndep.SetErrorHandler(handler)
-            with self.assertRaises(ValueError, msg="test"):
-                workspace.CreateNet("fake")
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py b/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py
deleted file mode 100644
index 718b7fb3a987..000000000000
--- a/caffe2/python/lengths_reducer_fused_8bit_rowwise_ops_test.py
+++ /dev/null
@@ -1,202 +0,0 @@
-
-import caffe2.python.hypothesis_test_util as hu
-import hypothesis.strategies as st
-import numpy as np
-from caffe2.python import core, workspace
-from hypothesis import given
-
-
-def compare_rowwise(emb_orig, emb_reconstructed, fp16):
-    # there is an absolute error introduced per row through int8 quantization
-    # and a relative error introduced when quantizing back from fp32 to fp16
-    assert emb_orig.shape == emb_reconstructed.shape
-    rtol = 1e-8
-    if fp16:
-        rtol = 1e-3
-    erange = np.amax(emb_orig, axis=1) - np.amin(emb_orig, axis=1)
-
-    threshold = erange / 255.0 / 1.9
-
-    for i in range(emb_orig.shape[0]):
-        r_orig = emb_orig[i, :]
-        r_reconstructed = emb_reconstructed[i, :]
-
-        isclose = np.isclose(r_orig, r_reconstructed, atol=threshold[i], rtol=rtol)
-        n_violated = isclose.size - isclose.sum()
-
-        if n_violated > 0:
-            print(isclose, threshold[i])
-            print(i, r_orig, r_reconstructed, threshold[i], r_orig - r_reconstructed)
-        assert n_violated == 0
-
-
-class TestLengthsReducerOpsFused8BitRowwise(hu.HypothesisTestCase):
-    @given(
-        num_rows=st.integers(1, 20),
-        blocksize=st.sampled_from([8, 16, 32, 64, 85, 96, 128, 163]),
-        weighted=st.booleans(),
-        seed=st.integers(0, 2 ** 32 - 1),
-        empty_indices=st.booleans(),
-        fp16=st.booleans(),
-    )
-    def test_sparse_lengths_sum(
-        self, num_rows, blocksize, weighted, seed, empty_indices, fp16
-    ):
-        net = core.Net("bench")
-
-        np.random.seed(seed)
-
-        if fp16:
-            input_data = np.random.rand(num_rows, blocksize).astype(np.float16)
-        else:
-            input_data = np.random.rand(num_rows, blocksize).astype(np.float32)
-        if empty_indices:
-            lengths = np.zeros(num_rows, dtype=np.int32)
-            num_indices = 0
-        else:
-            num_indices = np.random.randint(len(input_data))
-            # the number of indices per sample
-            lengths_split = np.clip(num_indices // 2, 1, 10)
-            lengths = (
-                np.ones([num_indices // lengths_split], dtype=np.int32)
-                * lengths_split
-            )
-            # readjust num_indices when lengths_split doesn't divide num_indices
-            num_indices = num_indices // lengths_split * lengths_split
-        indices = np.random.randint(
-            low=0, high=len(input_data), size=[num_indices], dtype=np.int32
-        )
-        weights = np.random.uniform(size=[len(indices)]).astype(np.float32)
-
-        if fp16:
-            quantized_data = net.HalfFloatToFused8BitRowwiseQuantized(
-                "input_data", "quantized_data"
-            )
-            dequantized_data = net.Fused8BitRowwiseQuantizedToHalfFloat(
-                quantized_data, "dequantized_data"
-            )
-        else:
-            quantized_data = net.FloatToFused8BitRowwiseQuantized(
-                "input_data", "quantized_data"
-            )
-            dequantized_data = net.Fused8BitRowwiseQuantizedToFloat(
-                quantized_data, "dequantized_data"
-            )
-
-        if weighted:
-            net.SparseLengthsWeightedSum(
-                [dequantized_data, "weights", "indices", "lengths"], "sum_reference"
-            )
-            net.SparseLengthsWeightedSumFused8BitRowwise(
-                [quantized_data, "weights", "indices", "lengths"], "sum_quantized"
-            )
-        else:
-            net.SparseLengthsSum(
-                [dequantized_data, "indices", "lengths"], "sum_reference"
-            )
-            net.SparseLengthsSumFused8BitRowwise(
-                [quantized_data, "indices", "lengths"], "sum_quantized"
-            )
-
-        workspace.FeedBlob("input_data", input_data)
-        workspace.FeedBlob("weights", weights)
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-
-        workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
-        workspace.CreateNet(net)
-        workspace.RunNetOnce(net)
-
-        dequantized_data = workspace.FetchBlob("dequantized_data")
-        np.testing.assert_array_almost_equal(
-            input_data, workspace.FetchBlob("input_data")
-        )
-        compare_rowwise(input_data, dequantized_data, fp16)
-
-        sum_reference = workspace.FetchBlob("sum_reference")
-        sum_quantized = workspace.FetchBlob("sum_quantized")
-        if fp16:
-            np.testing.assert_array_almost_equal(
-                sum_reference, sum_quantized, decimal=3
-            )
-        else:
-            np.testing.assert_array_almost_equal(sum_reference, sum_quantized)
-
-    @given(
-        num_rows=st.integers(1, 20),
-        blocksize=st.sampled_from([8, 16, 32, 64, 85, 96, 128, 163]),
-        seed=st.integers(0, 2 ** 32 - 1),
-        empty_indices=st.booleans(),
-        fp16=st.booleans(),
-    )
-    def test_sparse_lengths_mean(self, num_rows, blocksize, seed, empty_indices, fp16):
-        net = core.Net("bench")
-
-        np.random.seed(seed)
-
-        if fp16:
-            input_data = np.random.rand(num_rows, blocksize).astype(np.float16)
-        else:
-            input_data = np.random.rand(num_rows, blocksize).astype(np.float32)
-
-        if empty_indices:
-            lengths = np.zeros(num_rows, dtype=np.int32)
-            num_indices = 0
-        else:
-            num_indices = np.random.randint(len(input_data))
-            # the number of indices per sample
-            lengths_split = np.clip(num_indices // 2, 1, 10)
-            lengths = (
-                np.ones([num_indices // lengths_split], dtype=np.int32) * lengths_split
-            )
-            # readjust num_indices when lengths_split doesn't divide num_indices
-            num_indices = num_indices // lengths_split * lengths_split
-        indices = np.random.randint(
-            low=0, high=len(input_data), size=[num_indices], dtype=np.int32
-        )
-        print(indices, lengths)
-
-        if fp16:
-            quantized_data = net.HalfFloatToFused8BitRowwiseQuantized(
-                "input_data", "quantized_data"
-            )
-            dequantized_data = net.Fused8BitRowwiseQuantizedToHalfFloat(
-                quantized_data, "dequantized_data"
-            )
-        else:
-            quantized_data = net.FloatToFused8BitRowwiseQuantized(
-                "input_data", "quantized_data"
-            )
-            dequantized_data = net.Fused8BitRowwiseQuantizedToFloat(
-                quantized_data, "dequantized_data"
-            )
-
-        net.SparseLengthsMean(
-            [dequantized_data, "indices", "lengths"], "mean_reference"
-        )
-        net.SparseLengthsMeanFused8BitRowwise(
-            [quantized_data, "indices", "lengths"], "mean_quantized"
-        )
-
-        workspace.FeedBlob("input_data", input_data)
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-
-        workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"])
-        workspace.CreateNet(net)
-        workspace.RunNetOnce(net)
-
-        dequantized_data = workspace.FetchBlob("dequantized_data")
-        np.testing.assert_array_almost_equal(
-            input_data, workspace.FetchBlob("input_data")
-        )
-        compare_rowwise(input_data, dequantized_data, fp16)
-
-        mean_reference = workspace.FetchBlob("mean_reference")
-        mean_quantized = workspace.FetchBlob("mean_quantized")
-        if fp16:
-            np.testing.assert_array_almost_equal(
-                mean_reference, mean_quantized, decimal=3
-            )
-        else:
-            np.testing.assert_array_almost_equal(mean_reference, mean_quantized)
diff --git a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py b/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py
deleted file mode 100644
index a38d442dd952..000000000000
--- a/caffe2/python/lengths_reducer_rowwise_8bit_ops_test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-
-import numpy as np
-
-
-def FakeQuantization8BitsRowwise(data):
-    min_el = np.min(data, axis=1)
-    max_el = np.max(data, axis=1)
-    scale = (max_el - min_el) / 255.
-    bias = min_el
-    inv_scale = 1. / scale
-    data = data.T
-    data = np.round((data - bias) * inv_scale) * scale + bias
-    return data.T
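FakeQuantization8BitsRowwise is the whole scheme in four lines: per row, map [min, max] onto 255 integer steps, round, and map back. A worked example under exactly those definitions, showing the half-step error bound that compare_rowwise above relies on:

    import numpy as np

    row = np.array([[0.0, 1.0, 2.0, 10.0]], dtype=np.float32)
    scale = (row.max(axis=1) - row.min(axis=1)) / 255.  # one step ~= 0.0392 here
    bias = row.min(axis=1)
    codes = np.round((row - bias[:, None]) / scale[:, None])  # integer codes 0..255
    restored = codes * scale[:, None] + bias[:, None]
    # Round-trip error is at most half a quantization step per element.
    assert np.all(np.abs(restored - row) <= scale[:, None] / 2 + 1e-6)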
-
-
-class TestQuantize8bits(hu.HypothesisTestCase):
-
-    def test_quantize_op(self):
-        op = core.CreateOperator(
-            'FloatToRowwiseQuantized8Bits',
-            ['input_data'],
-            ['quantized_input', 'scale_bias'])
-        input_data = np.float32(np.asarray([[801., 786, 235.2, 2353.3434],
-                                            [5., 11., 9., -2.]]))
-        workspace.FeedBlob('input_data', input_data)
-        workspace.RunOperatorOnce(op)
-        op1 = core.CreateOperator(
-            'Rowwise8BitQuantizedToFloat',
-            ['quantized_input', 'scale_bias'],
-            ['dequantized_input'])
-        workspace.RunOperatorOnce(op1)
-        result = workspace.FetchBlob('dequantized_input')
-        ground_truth = FakeQuantization8BitsRowwise(input_data)
-        np.testing.assert_array_almost_equal(
-            result, ground_truth)
-
-    def test_quantize_tensor_with_const_row_op(self):
-        op = core.CreateOperator(
-            'FloatToRowwiseQuantized8Bits',
-            ['input_data'],
-            ['quantized_input', 'scale_bias'])
-        input_data = np.float32(np.asarray([[801., 786, 235.2, 2353.3434],
-                                            [9., 9., 9., 9.]]))
-        workspace.FeedBlob('input_data', input_data)
-        workspace.RunOperatorOnce(op)
-        op1 = core.CreateOperator(
-            'Rowwise8BitQuantizedToFloat',
-            ['quantized_input', 'scale_bias'],
-            ['dequantized_input'])
-        workspace.RunOperatorOnce(op1)
-        result = workspace.FetchBlob('dequantized_input')
-        ground_truth = FakeQuantization8BitsRowwise(input_data)
-        ground_truth[1, :] = 9.
-        np.testing.assert_array_almost_equal(
-            result, ground_truth)
-
-    def test_SparseSegmentUint8(self):
-
-        init_net = core.Net("init")
-        net = core.Net("bench")
-        size = 10**3
-        isize = 10**2
-
-        # input preparation
-        d = init_net.UniformFill([], shape=[size, 32])
-        w = init_net.UniformFill([], shape=[isize, ])
-        i = init_net.UniformIntFill([], shape=[isize], max=size - 1)
-        i = init_net.Cast([i], to=core.DataType.INT64)
-        l = init_net.ConstantFill(
-            [],
-            ['l'],
-            shape=[isize // 10],
-            value=10,
-            dtype=core.DataType.INT32,
-        )
-        net.FloatToRowwiseQuantized8Bits([d],
-                                         ['quantized_data', 'scale_bias'])
-        net.Rowwise8BitQuantizedToFloat(['quantized_data', 'scale_bias'],
-                                        ['dequantized_data'])
-
-        # SparseLengthsWeightedSum
-        net.SparseLengthsWeightedSum(['dequantized_data', w, i, l],
-                                     ['PositionWeighted_0'], engine='fp16')
-        net.SparseLengthsWeightedSum8BitsRowwise(
-            ['quantized_data', w, i, l, 'scale_bias'],
-            ['PositionWeighted_1'])
-
-        # SparseLengthsSum
-        net.SparseLengthsSum(['dequantized_data', i, l],
-                             ['Sum_0'], engine='fp16')
-
-        net.SparseLengthsSum8BitsRowwise(
-            ['quantized_data', i, l, 'scale_bias'],
-            ['Sum_1'])
-
-        # SparseLengthsWeightedMean
-        # net.SparseLengthsWeightedMean(['dequantized_data', w, i, l],
-        #                               ['WeightedMean_0'])
-        # net.SparseLengthsWeightedMean8BitsRowwise(
-        #     ['quantized_data', w, i, l, 'scale_bias'],
-        #     ['WeightedMean_1'])
-
-        # SparseLengthsMean
-        net.SparseLengthsMean(['dequantized_data', i, l],
-                              ['Mean_0'], engine='fp16')
-
-        net.SparseLengthsMean8BitsRowwise(
-            ['quantized_data', i, l, 'scale_bias'],
-            ['Mean_1'])
-
-        gathered_w = net.Gather(['quantized_data', i],
-                                engine='fp16')
-
-        gathered_scale_bias = net.Gather(['scale_bias', i],
-                                         engine='fp16')
-        net.Rowwise8BitQuantizedToFloat(
-            [gathered_w, gathered_scale_bias],
-            'Gathered_1')
-
-        net.Gather(['dequantized_data', i], 'Gathered_0')
-
-        workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
-        workspace.RunNetOnce(init_net)
-        workspace.CreateNet(net)
-        workspace.RunNetOnce(net)
-
-        PositionWeighted_1 = workspace.FetchBlob('PositionWeighted_1')
-        ground_truth_posw = workspace.FetchBlob('PositionWeighted_0')
-        np.testing.assert_array_almost_equal(PositionWeighted_1,
-                                             ground_truth_posw, decimal=5)
-        Sum_1 = workspace.FetchBlob('Sum_1')
-        ground_truth_sum = workspace.FetchBlob('Sum_0')
-        np.testing.assert_array_almost_equal(Sum_1,
-                                             ground_truth_sum, decimal=5)
-
-        Mean_1 = workspace.FetchBlob('Mean_1')
-        ground_truth_mean = workspace.FetchBlob('Mean_0')
-        np.testing.assert_array_almost_equal(Mean_1,
-                                             ground_truth_mean, decimal=5)
-
-        Gathered_1 = workspace.FetchBlob('Gathered_1')
-        ground_truth_gathered = workspace.FetchBlob('Gathered_0')
-        np.testing.assert_array_almost_equal(Gathered_1,
-                                             ground_truth_gathered, decimal=5)
diff --git a/caffe2/python/lstm_benchmark.py b/caffe2/python/lstm_benchmark.py
deleted file mode 100644
index 29f819ec622e..000000000000
--- a/caffe2/python/lstm_benchmark.py
+++ /dev/null
@@ -1,347 +0,0 @@
-## @package lstm_benchmark
-# Module caffe2.python.lstm_benchmark
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import workspace, core, utils, rnn_cell, model_helper
-from caffe2.python import recurrent
-
-import argparse
-import numpy as np
-import time
-
-import logging
-
-logging.basicConfig()
-log = logging.getLogger("lstm_bench")
-log.setLevel(logging.DEBUG)
-
-
-def generate_data(T, shape, num_labels, fixed_shape):
-    '''
-    Fill a queue with input data
-    '''
-    log.info("Generating T={} sequence batches".format(T))
-
-    generate_input_init_net = core.Net('generate_input_init')
-    queue = generate_input_init_net.CreateBlobsQueue(
-        [], "inputqueue", num_blobs=1, capacity=T,
-    )
-    label_queue = generate_input_init_net.CreateBlobsQueue(
-        [], "labelqueue", num_blobs=1, capacity=T,
-    )
-
-    workspace.RunNetOnce(generate_input_init_net)
-    generate_input_net = core.Net('generate_input')
-
-    generate_input_net.EnqueueBlobs([queue, "scratch"], ["scratch"])
-    generate_input_net.EnqueueBlobs([label_queue, "label_scr"], ["label_scr"])
-    np.random.seed(2603)
-
-    entry_counts = []
-    for t in range(T):
-        if (t % (max(10, T // 10)) == 0):
-            print("Generating data {}/{}".format(t, T))
-        # Randomize the seqlength
-        random_shape = (
-            [np.random.randint(1, shape[0])] + shape[1:]
-            if t > 0 and not fixed_shape else shape
-        )
-        X = np.random.rand(*random_shape).astype(np.float32)
-        batch_size = random_shape[1]
-        L = num_labels * batch_size
-        labels = (np.random.rand(random_shape[0]) * L).astype(np.int32)
-        workspace.FeedBlob("scratch", X)
-        workspace.FeedBlob("label_scr", labels)
-        workspace.RunNetOnce(generate_input_net.Proto())
-        entry_counts.append(random_shape[0] * random_shape[1])
-
-    log.info("Finished data generation")
-
-    return queue, label_queue, entry_counts
-
-
-def create_model(args, queue, label_queue, input_shape):
-    model = model_helper.ModelHelper(name="LSTM_bench")
-    seq_lengths, target = \
-        model.net.AddExternalInputs(
-            'seq_lengths',
-            'target',
-        )
-
-    input_blob = model.net.DequeueBlobs(queue, "input_data")
-    labels = model.net.DequeueBlobs(label_queue, "label")
-
-    init_blobs = []
-    if args.implementation in ["own", "static", "static_dag"]:
-        T = None
-        if "static" in args.implementation:
-            assert args.fixed_shape, \
-                "Random input length is not static RNN compatible"
-            T = args.seq_length
-            print("Using static RNN of size {}".format(T))
-
-        for i in range(args.num_layers):
-            hidden_init, cell_init = model.net.AddExternalInputs(
-                "hidden_init_{}".format(i),
-                "cell_init_{}".format(i)
-            )
-            init_blobs.extend([hidden_init, cell_init])
-
-        output, last_hidden, _, last_state = rnn_cell.LSTM(
-            model=model,
-            input_blob=input_blob,
-            seq_lengths=seq_lengths,
-            initial_states=init_blobs,
-            dim_in=args.input_dim,
-            dim_out=[args.hidden_dim] * args.num_layers,
-            scope="lstm1",
-            memory_optimization=args.memory_optimization,
-            forward_only=args.forward_only,
-            drop_states=True,
-            return_last_layer_only=True,
-            static_rnn_unroll_size=T,
-        )
-
-        if "dag" in args.implementation:
-            print("Using DAG net type")
-            model.net.Proto().type = 'dag'
-            model.net.Proto().num_workers = 4
-
-    elif args.implementation == "cudnn":
-        # We need to feed a placeholder input so that RecurrentInitOp
-        # can infer the dimensions.
-        init_blobs = model.net.AddExternalInputs("hidden_init", "cell_init")
-        model.param_init_net.ConstantFill([], input_blob, shape=input_shape)
-        output, last_hidden, _ = rnn_cell.cudnn_LSTM(
-            model=model,
-            input_blob=input_blob,
-            initial_states=init_blobs,
-            dim_in=args.input_dim,
-            dim_out=args.hidden_dim,
-            scope="cudnnlstm",
-            num_layers=args.num_layers,
-        )
-
-    else:
-        assert False, "Unknown implementation"
-
-    weights = model.net.UniformFill(labels, "weights")
-    softmax, loss = model.net.SoftmaxWithLoss(
-        [model.Flatten(output), labels, weights],
-        ['softmax', 'loss'],
-    )
-
-    if not args.forward_only:
-        model.AddGradientOperators([loss])
-
-    # carry states over
-    for init_blob in init_blobs:
-        model.net.Copy(last_hidden, init_blob)
-
-        sz = args.hidden_dim
-        if args.implementation == "cudnn":
-            sz *= args.num_layers
-        workspace.FeedBlob(init_blob, np.zeros(
-            [1, args.batch_size, sz], dtype=np.float32
-        ))
-
-    if args.rnn_executor:
-        for op in model.net.Proto().op:
-            if op.type.startswith('RecurrentNetwork'):
-                recurrent.set_rnn_executor_config(
-                    op,
-                    num_threads=args.rnn_executor_num_threads,
-                    max_cuda_streams=args.rnn_executor_max_cuda_streams,
-                )
-    return model, output
-
-
-def Caffe2LSTM(args):
-    T = args.data_size // args.batch_size
-
-    input_blob_shape = [args.seq_length, args.batch_size, args.input_dim]
-    queue, label_queue, entry_counts = generate_data(T // args.seq_length,
-                                                     input_blob_shape,
-                                                     args.hidden_dim,
-                                                     args.fixed_shape)
-
-    workspace.FeedBlob(
-        "seq_lengths",
-        np.array([args.seq_length] * args.batch_size, dtype=np.int32)
-    )
-
-    model, output = create_model(args, queue, label_queue, input_blob_shape)
-
-    workspace.RunNetOnce(model.param_init_net)
-    workspace.CreateNet(model.net)
-
-    start_time = time.time()
-    num_iters = T // args.seq_length
-    total_iters = 0
-
-    # Run the Benchmark
-    log.info("------ Warming up ------")
-    workspace.RunNet(model.net.Proto().name)
-
-    if (args.gpu):
-        log.info("Memory stats:")
-        stats = utils.GetGPUMemoryUsageStats()
-        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
-
-    log.info("------ Starting benchmark ------")
-    start_time = time.time()
-    last_time = time.time()
-    for iteration in range(1, num_iters, args.iters_to_report):
-        iters_once = min(args.iters_to_report, num_iters - iteration)
-        total_iters += iters_once
-        workspace.RunNet(model.net.Proto().name, iters_once)
-
-        new_time = time.time()
-        log.info(
-            "Iter: {} / {}. Entries Per Second: {}k.".format(
-                iteration,
-                num_iters,
-                np.sum(entry_counts[iteration:iteration + iters_once]) /
-                (new_time - last_time) // 100 / 10,
-            )
-        )
-        last_time = new_time
-
-    log.info("Done. Total EPS excluding 1st iteration: {}k {}".format(
-        np.sum(entry_counts[1:]) / (time.time() - start_time) // 100 / 10,
-        " (with RNN executor)" if args.rnn_executor else "",
-    ))
-
-    if (args.gpu):
-        log.info("Memory stats:")
-        stats = utils.GetGPUMemoryUsageStats()
-        log.info("GPU memory:\t{} MB".format(stats['max_total'] / 1024 / 1024))
-        if (stats['max_total'] != stats['total']):
-            log.warning(
-                "Max usage differs from current total usage: {} > {}".
-                format(stats['max_total'], stats['total'])
-            )
-            log.warning("This means that costly deallocations occurred.")
-
-    return time.time() - start_time
-
-
-@utils.debug
-def Benchmark(args):
-    return Caffe2LSTM(args)
-
-
-def GetArgumentParser():
-    parser = argparse.ArgumentParser(description="LSTM benchmark.")
-
-    parser.add_argument(
-        "--hidden_dim",
-        type=int,
-        default=800,
-        help="Hidden dimension",
-    )
-    parser.add_argument(
-        "--input_dim",
-        type=int,
-        default=40,
-        help="Input dimension",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=128,
-        help="The batch size."
-    )
-    parser.add_argument(
-        "--seq_length",
-        type=int,
-        default=20,
-        help="Max sequence length"
-    )
-    parser.add_argument(
-        "--data_size",
-        type=int,
-        default=1000000,
-        help="Number of data points to generate"
-    )
-    parser.add_argument(
-        "--iters_to_report",
-        type=int,
-        default=20,
-        help="Number of iterations between progress reports"
-    )
-    parser.add_argument(
-        "--gpu",
-        action="store_true",
-        help="Run all on GPU",
-    )
-    parser.add_argument(
-        "--implementation",
-        type=str,
-        default="own",
-        help="'cudnn', 'own', 'static' or 'static_dag'",
-    )
-    parser.add_argument(
-        "--fixed_shape",
-        action="store_true",
-        help=("Whether to use a fixed shape for input batches "
-              "(no randomized sequence lengths). Static RNN requires fixed shape"),
-    )
-    parser.add_argument(
-        "--memory_optimization",
-        action="store_true",
-        help="Whether to use memory optimized LSTM or not",
-    )
-    parser.add_argument(
-        "--forward_only",
-        action="store_true",
-        help="Whether to run only forward pass"
-    )
-    parser.add_argument(
-        "--num_layers",
-        type=int,
-        default=1,
-        help="Number of LSTM layers. All output dimensions are going to be "
-             "of hidden_dim size",
-    )
-    parser.add_argument(
-        "--rnn_executor",
-        action="store_true",
-        help="Whether to use RNN executor"
-    )
-    parser.add_argument(
-        "--rnn_executor_num_threads",
-        type=int,
-        default=None,
-        help="Number of threads used by CPU RNN Executor"
-    )
-    parser.add_argument(
-        "--rnn_executor_max_cuda_streams",
-        type=int,
-        default=None,
-        help="Maximum number of CUDA streams used by RNN executor on GPU"
-    )
-    return parser
-
-
-if __name__ == '__main__':
-    args, extra_args = GetArgumentParser().parse_known_args()
-
-    rnn_executor_opt = 1 if args.rnn_executor else 0
-
-    workspace.GlobalInit([
-        'caffe2',
-        '--caffe2_log_level=0',
-        '--caffe2_print_blob_sizes_at_exit=0',
-        '--caffe2_rnn_executor={}'.format(rnn_executor_opt),
-        '--caffe2_gpu_memory_tracking=1'] + extra_args)
-
-    device = core.DeviceOption(
-        workspace.GpuDeviceType if args.gpu else caffe2_pb2.CPU, 4)
-
-    with core.DeviceScope(device):
-        Benchmark(args)
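For completeness, the benchmark was usable programmatically as well as from the command line; a sketch of that route, assuming the module (removed here) is still importable and reusing its own parser for defaults:

    from caffe2.python import workspace, lstm_benchmark

    workspace.GlobalInit(['caffe2', '--caffe2_log_level=0'])
    args, _ = lstm_benchmark.GetArgumentParser().parse_known_args(
        ['--seq_length', '20', '--batch_size', '64', '--forward_only'])
    elapsed = lstm_benchmark.Benchmark(args)  # returns wall-clock seconds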
diff --git a/caffe2/python/memonger.py b/caffe2/python/memonger.py
deleted file mode 100644
index 11174a91a63f..000000000000
--- a/caffe2/python/memonger.py
+++ /dev/null
@@ -1,1001 +0,0 @@
-## @package memonger
-# Module caffe2.python.memonger
-
-import networkx as nx
-import collections
-import time
-import copy
-from caffe2.python import workspace, core
-from caffe2.proto import caffe2_pb2
-import enum
-import logging
-import caffe2.python._import_c_extension as C
-
-log = logging.getLogger("memonger")
-log.setLevel(logging.INFO)
-LiveRange = collections.namedtuple('LiveRange', ["defined", "used", "size"])
-
-
-def share_grad_blobs(
-    net,
-    losses,
-    param_grads,
-    namescope,
-    dont_share_blobs=None,
-    share_activations=False,
-    blob_shapes=None,
-):
-    '''
-    Implements similar optimization as Torch's shareGradInput():
-    for the gradients that are passed between layers, share blobs between
-    operators when possible. This yields significant memory savings with
-    deep networks.
-
-    Returns an optimized protobuf (assign to net._net)
-    '''
-    def is_grad_blob(b):
-        name = str(b)
-        # Note: need to look at _{namescope} pattern as it matches
-        # to handle the auto-split gradients
-        return name.endswith("_grad") and (name.startswith((namescope, "_" + namescope))) and name not in param_grads
-
-    def is_grad_op(op):
-        # TODO: something smarter
-        for b in list(op.input) + list(op.output):
-            if is_grad_blob(b):
-                return True
-        return False
-
-    log.warn("NOTE: Executing memonger to optimize gradient memory")
-
-    # Collect ops that have something to do with gradients
-    if namescope != "" and not namescope.endswith("/"):
-        namescope += "/"
-
-    netproto = copy.deepcopy(net.Proto())
-    activations = []
-    external_output = set(net.Proto().external_output)
-
-    # Hacky way to get activations, think of a better way
-    for op in net.Proto().op:
-        for b in op.output:
-            if b + "_w" in op.input and b not in external_output:
-                activations.append(b)
-
-    # Remove last activations, as they are usually accessed externally
-    activations = set(activations[:-2])
-
-    # Gradient ops
-    grad_op_indices = []
-    for idx, op in enumerate(netproto.op):
-        if (is_grad_op(op)):
-            grad_op_indices.append(idx)
-
-    shared_blobs = set()
-    for op in net.Proto().op:
-        for b in list(op.input) + list(op.output):
-            if is_grad_blob(b) or (share_activations and b in activations):
-                shared_blobs.add(b)
-    start_time = time.time()
-    optim_str = C.memonger_compute_blob_recycling_for_dag(
-        netproto.SerializeToString(),
-        [str(s).encode('utf-8') for s in losses],
-        grad_op_indices,
-        set(str(s).encode('utf-8') for s in shared_blobs),
-        namescope.encode('utf-8'),
-        set() if dont_share_blobs is None else dont_share_blobs,
-        {} if blob_shapes is None else blob_shapes
-    )
-
-    log.info("Memonger memory optimization took {} secs".format(
-        time.time() - start_time),
-    )
-
-    optim = caffe2_pb2.NetDef()
-    optim.ParseFromString(optim_str)
-    assert verify_graph_equality(net.Proto(), optim), \
-        "Memonger graph is not equal to original."
-    assert verify_inplace_blobs(net.Proto(), optim), \
-        "Inplace assignments differ in memonger net."
-    return optim
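As the docstring says, the optimized proto is returned rather than applied in place; a typical call site, sketched under the assumption of a ModelHelper whose gradients were added with AddGradientOperators under namescope "gpu_0/":

    # Sketch only: `model` and the "gpu_0/" scope are assumptions.
    optim_proto = memonger.share_grad_blobs(
        model.net,
        ["gpu_0/loss"],                      # losses
        set(model.param_to_grad.values()),   # parameter gradients stay untouched
        namescope="gpu_0/",
    )
    model.net._net = optim_proto  # adopt the blob-recycled graph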
- return optim - - -def optimize_inference_for_dag(net, input_blobs, namescope=""): - netproto = copy.deepcopy(net.Proto()) - external_input = set(net.Proto().external_input) - external_output = set(net.Proto().external_output) - - def is_activation_blob(b): - return b not in external_input and b not in external_output - - activation_blobs = set() - seen_as_output = set() - ops = list(net.Proto().op) - op_indices = [index for index, op in enumerate(net.Proto().op)] - - # Sanity check: check that all external inputs are properly accounted - # and that no gradient ops are included in 'net' - for op in ops: - for b in op.input: - if is_activation_blob(b): - activation_blobs.add(b) - if b not in seen_as_output: - raise AssertionError("{} not in external input".format(b)) - for b in op.output: - if is_activation_blob(b): - activation_blobs.add(b) - seen_as_output = seen_as_output.union(set(op.output)) - assert not op.is_gradient_op, \ - "You can only pass inference-only nets to optimize_inference_for_dag" - start_time = time.time() - optim_str = C.memonger_compute_blob_recycling_for_dag( - netproto.SerializeToString(), - [str(s).encode('utf-8') for s in input_blobs], - op_indices, - set(str(s).encode('utf-8') for s in activation_blobs), - namescope.encode('utf-8'), - set(), - {} - ) - - log.info("Memonger memory optimization took {} secs".format( - time.time() - start_time), - ) - - optim = caffe2_pb2.NetDef() - optim.ParseFromString(optim_str) - - assert verify_graph_equality(net.Proto(), optim), \ - "Memonger graph is not equal to original." - assert verify_inplace_blobs(net.Proto(), optim), \ - "Inplace assignments differ in memonger net." - return optim - - -def estimate_memory_usage(protos, shapes, types, devicescope): - import numpy as np - ''' - Estimate memory usage of a model. This is an estimate because - we assume a single threaded execution and miss some internal - memory usage of operators. Only estimates the memory for a given - device scope. - - Also, currently it does not handle correctly if blob sizes vary - during execution, as it uses only the final blob size. - - Returns (total, highwater, by op type) memory allocation in bytes. 
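    For illustration (hypothetical blob, not taken from the source): a single
    FLOAT blob of shape (4, 2) on the matching device contributes
    4 * 2 * sizeof(float) = 32 bytes to the totals.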
- ''' - sizeofs = { - caffe2_pb2.TensorProto.DOUBLE: 8, - caffe2_pb2.TensorProto.FLOAT: 4, - caffe2_pb2.TensorProto.FLOAT16: 2, - caffe2_pb2.TensorProto.INT32: 4, - caffe2_pb2.TensorProto.INT8: 1, - caffe2_pb2.TensorProto.UINT8: 1, - caffe2_pb2.TensorProto.UINT16: 2, - caffe2_pb2.TensorProto.INT16: 2, - caffe2_pb2.TensorProto.BOOL: 1, - caffe2_pb2.TensorProto.INT64: 8, - } - - def split_net(proto): - ops = [op for op in proto.op if - op.device_option == devicescope or op.type in {"Free", "Alias"}] - del proto.op[:] - proto.op.extend(ops) - return proto - - def num_bytes(blob): - if blob not in shapes or blob not in types: - log.warning("Unknown blob encountered: {}".format(blob)) - return 0 - sizeof = sizeofs[types[blob]] - return sizeof * np.prod(shapes[blob]) - - protos = [split_net(proto) for proto in protos] - allocs_by_ops = collections.defaultdict(lambda: 0) - - # Evaluate - current_allocated = 0 - max_allocated = 0 - total_allocated = 0 - allocated = set() - for proto in protos: - for op in proto.op: - if op.type == "Free" or op.type == "Alias": - for o in op.output: - if o in allocated: - current_allocated -= num_bytes(o) - allocated.remove(o) - else: - for output in op.output: - if output not in allocated: - nbytes = num_bytes(output) - total_allocated += nbytes - current_allocated += nbytes - max_allocated = max(max_allocated, current_allocated) - allocated.add(output) - allocs_by_ops[op.type] += nbytes - - return (total_allocated, max_allocated, allocs_by_ops) - - -def release_blobs_when_used(netproto, dont_free_blobs, selector_fun=None): - ''' - Insert Free-ops after a blob has been used the last time, so that its - memory can be reclaimed. Use this only with efficient caching memory - managers (such as CUB, --caffe2_cuda_memory_pool=cub). - - Blobs used with Alias op won't be freed. - - @dont_free_blobs: is a set of blobs that should not be freed - @selector_fun: optional lambda that return True if blob name - can be released. Use for easy special filtering, like - excluding blobs with "loss" in the name. - - Returns a new protobuffer. To use with a model, use: - model.net._net = memonger.release_blobs_when_used(..) - ''' - input_blobs = set() - can_release = set() - alias_blobs = set() - netproto = copy.deepcopy(netproto) - - for op in netproto.op: - if op.type == 'Alias': - alias_blobs.add(op.input[0]) - continue - for inp in op.input: - input_blobs.add(inp) - for outp in op.output: - if outp not in input_blobs: - if selector_fun is None or selector_fun(outp): - can_release.add(outp) - - # Remove such blobs that are not input at all and external outputs - can_release = can_release - set(netproto.external_output) - can_release = can_release.intersection(input_blobs) - can_release = can_release - dont_free_blobs - can_release = can_release - alias_blobs - - ops = list(netproto.op) - - # .. 
then find last use of each can-release blob, and insert a Free op - for j in reversed(range(0, len(netproto.op))): - op = netproto.op[j] - for inp in op.input: - if inp in can_release: - can_release.remove(inp) - ops.insert(j + 1, core.CreateOperator("Free", [inp], [inp])) - - del netproto.op[:] - netproto.op.extend(ops) - return netproto - - -def _find_source_nodes(g): - ''' Return nodes without predecessors ''' - ret = [] - for cn in g: - cur_pred = list(g.predecessors(cn)) - if not cur_pred: - ret.append(cn) - return ret - - -def _find_target_nodes(g): - ''' Return nodes without successors ''' - ret = [] - for cn in g: - cur_succ = list(g.successors(cn)) - if not cur_succ: - ret.append(cn) - return ret - - -def _add_single_target_ifneeded(g): - targets = _find_target_nodes(g) - assert len(targets) >= 1 - if len(targets) == 1: - return g - ret = copy.deepcopy(g) - - def _next_available_idx(g): - ret = -1 - for cn in g: - if cn > ret: - ret = cn - ret += 1 - return ret - - target_node_idx = _next_available_idx(g) - ret.add_node(target_node_idx) - for cn in targets: - ret.add_edge(cn, target_node_idx) - - return ret - - -def _get_path(pred_list, dist_list): - ''' Get the path from nx.bellman_ford()'s output ''' - - # distances are negative - assert all(dist_list[x] <= 0 for x in dist_list) - # node with longest distance to source is the target - target = min(dist_list, key=lambda x: dist_list[x]) - - ret = [] - cur = target - - while cur is not None: - ret.append(cur) - # Hack to get networkx 2.0 happy: it uses list in pred. - # TODO(tulloch): are there cases with multiple predecessors? - try: - cur = pred_list[cur][0] if pred_list[cur] else None - except TypeError: - cur = pred_list[cur] - - return list(reversed(ret)) - - -def _get_longest_paths(g, source_nodes): - ''' Get the longest path for nodes in 'source_nodes' - Find with bellman_ford() by setting weight = -1 - ''' - - ng = copy.deepcopy(g) - for u, v in ng.edges(): - ng[u][v]["weight"] = -1 - - ret = {} - for cn in source_nodes: - pred, dist = nx.bellman_ford_predecessor_and_distance(ng, cn, weight="weight") - path = _get_path(pred, dist) - assert path[0] == cn - assert len(path) - 1 == -dist[path[-1]] - ret[cn] = path - - return ret - - -def _build_tree(paths): - ''' Build a tree for given paths based on common elements. - Last elements of all paths are the same, which is the root of the tree. - ''' - assert all(cp[-1] == paths[0][-1] for cp in paths) - g = nx.DiGraph() - node_set = {y for x in paths for y in x} - g.add_nodes_from(node_set) - for cp in paths: - for ce in zip(cp[0:-1], cp[1:]): - g.add_edge(ce[1], ce[0]) - - root = paths[0][-1] - _compute_tree_height(g, root) - - return (g, root) - - -def _compute_tree_height(g, root): - ''' Compute the heights of the tree for all nodes - Height of leaves are 0 - ''' - def _get_height(root): - children = list(g.successors(root)) - height = 0 - if children: - child_heights = [_get_height(x) for x in children] - height = max(child_heights) + 1 - g.nodes[root]["height"] = height - return height - - _get_height(root) - - -def _sort_tree_leaves(g, root): - ''' For each node, sort its child nodes based on the height of the nodes. - Return the leaf nodes of the tree after sorting. 
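    For example (hypothetical tree): if the root has children a (height 0)
    and b (height 2), a's subtree is emitted before b's, because children
    are visited in increasing order of height.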
- ''' - def _get_height(root): - return g.nodes[root]["height"] - - def _get_sorted_leaves(root): - children = list(g.successors(root)) - if not children: - return [root] - child_heights = [_get_height(x) for x in children] - order = sorted(range(len(children)), key=lambda x: child_heights[x]) - ret = [] - for co in order: - cr = children[co] - ret += _get_sorted_leaves(cr) - - return ret - - return _get_sorted_leaves(root) - - -def topological_sort_traversal_longest_path(g): - ''' The graph 'g' may contain several source nodes (nodes without incoming - edge), which could be in any order and still be a valid - topological sorting result. We would like to arrange these source nodes - so that the average live spans of the computed blobs are shorter. - The idea is to sort the source nodes based on the length of their path to - the target node so that the one with longer path is used first. - This is done by: - - Add a single target node if there are multiple target nodes in 'g'. - - Find the longest path between each source and the target node. - - Convert the longest paths to a tree with the target node being the root - and source nodes being the leaves. - - Sort the nodes of the tree based on the height of the tree. - ''' - gt = _add_single_target_ifneeded(g) - source_nodes = _find_source_nodes(gt) - lpaths = _get_longest_paths(gt, source_nodes) - tree, root = _build_tree(list(lpaths.values())) - sorted_sources = _sort_tree_leaves(tree, root) - assert(sorted(sorted_sources) == sorted(source_nodes)) - - if nx.__version__ < '2.0': - ret = nx.topological_sort(g, sorted_sources) - else: - # Manually making a sorted descendent list - dependency_order = list(sorted_sources) - seen_nodes = set(sorted_sources) - for s in sorted_sources: - desc = nx.descendants(g, s) - for d in desc: - if d not in seen_nodes: - seen_nodes.add(d) - dependency_order.append(d) - sort_key = dict((v, len(dependency_order) - i) for i, v in enumerate(dependency_order)) - ret = nx.algorithms.dag.lexicographical_topological_sort( - g, key=lambda x: sort_key[x]) - ret = list(ret) - assert(len(ret) == len(g.nodes)) - return ret - - -def topological_sort_traversal(g): - return list(nx.topological_sort(g)) - - -def compute_ranges(linearized_ops, blob_sizes=None): - if not blob_sizes: - log.warning('Provide blob sizes to get more accurate assignments.') - - blobs = collections.defaultdict( - lambda: LiveRange(defined=None, used=None, size=None)) - for i, op in enumerate(linearized_ops): - for blob in op.input: - used = blobs[blob].used - if used is None: - used = i - else: - used = max(used, i) - blobs[blob] = blobs[blob]._replace(used=used) - blob_size = blob_sizes[blob] if blob_sizes else None - assert not blob_sizes or blob_size is not None - blobs[blob] = blobs[blob]._replace(size=blob_size) - for blob in op.output: - defined = blobs[blob].defined - if defined is None: - defined = i - else: - defined = min(defined, i) - blobs[blob] = blobs[blob]._replace(defined=defined) - blob_size = blob_sizes[blob] if blob_sizes else None - assert not blob_sizes or blob_size is not None - blobs[blob] = blobs[blob]._replace(size=blob_size) - - return blobs - - -def is_compatible(candidate_range, assignment, static_blobs): - (name, range_) = assignment[-1] - if name in static_blobs: - return False - if candidate_range.defined is None or range_.defined is None \ - or range_.used is None: - return False - return candidate_range.defined > range_.used - - -def compute_blob_assignments(assignments): - blob_assignments = {} - for assignment in 
assignments: - if len(assignment) == 1: - continue - last_blob, _ = assignment[-1] - for (blob, _) in assignment: - blob_assignments[blob] = last_blob - return blob_assignments - - -def _get_max_size(assignment): - if not assignment: - return 0 - ret = max([x[1].size for x in assignment]) - ret = 0 if ret is None else ret - return ret - - -def get_memory_usage(assignments): - ret = 0 - for cur in assignments: - ret += _get_max_size(cur) - return ret - - -def compute_assignments_greedy(ranges_sorted, init_assignments=None): - assignments = init_assignments or [] - visited = {y[0] for x in assignments for y in x} - - for (name, range_) in ranges_sorted: - if name in visited: - continue - assigned = False - best_assignment = 0 - min_dist = float("inf") - candidate_size = range_.size or 0 - for idx, assignment in enumerate(assignments): - if is_compatible(range_, assignment, []): - assigned = True - dist = abs(_get_max_size(assignment) - candidate_size) - if dist < min_dist: - min_dist = dist - best_assignment = idx - if assigned: - assignment = assignments[best_assignment] - assignment.append((name, range_)) - else: - assignments.append([(name, range_)]) - return assignments - - -def _get_count(assignments): - ''' Return number of blobs in assignments ''' - if assignments: - return sum([len(x) for x in assignments]) - return 0 - - -def compute_assignments_dp(ranges_sorted, init_assignment, counter=None): - ''' Compute assignment for blobs in 'ranges_sorted' on top of 'init_assignment' - using dynamic programming + recursion. - - ranges_sorted: blobs sorted by 'used' - init_assignment: assignment to start with, blobs in 'ranges_sorted' should - not be used in 'init_assignment' - - Using f(b, k, init) to represent the best assignment for blobs b[0:k] - given initial assignment 'init', we have - f(b, k, init) = f(b, j, init) + - find_best(b[j:k], f(b, j, init)) - where j is the index of the last best assignment that is independent of - blob b[k - 1] (b[k - 1] is compatible with all assignments in - f(b, j, init)), and find_best(b1, init1) gives the best assignment - for blobs in 'b1' based on the initial assignment 'init1', and blobs - b1[0:-1] should be incompatible with b1[-1]. f(b, len(b), []) gives - the best assignment for blobs 'b'. - - For find_best(b, init), since b[0:-1] are not compatible with b[-1], we - could reduce it to a smaller problem to find best assignment for b[0:-1] - as - find_best(b, init) = min { - f(b[0:-1], len(b) - 1, init - x) + [x, b[-1]] for x in init, or - f(b[0:-1], len(b) - 1, init) + [b[-1]] - } - where min{} gives the assignment with minimum memory usage. - ''' - - def _get_compatible_prev(candidate_range, best_assignments, cur_idx): - ''' Find the closest position k in best_assignments that is independent - of candidate_range, i.e. candidate_range is compatible with all - assignments in best_assignments[k]. - Return -1 if not found. - ''' - def is_compatible_all(candidate_range, assignments): - ''' return true if compatible for all assignments in assignments ''' - return all([is_compatible(candidate_range[1], x, []) for x in assignments]) - - ii = cur_idx - 1 - while ii >= 0: - cba = best_assignments[ii] - if is_compatible_all(candidate_range, cba): - return ii - ii -= 1 - return -1 - - def _find_best(ranges, init_assignment, prev_best_assignment, counter): - ''' Find the best assignment for blobs 'ranges' given an initialized - assignment 'init_assignment'. - - Blobs in ranges[0:-1] should be incompatible with blob ranges[-1].
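    ("Incompatible" here means that the two live ranges overlap, so the
    blobs can never share one buffer; this is what makes the reduction to a
    smaller subproblem valid.)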
- 'prev_best_assignment': best assignment for blobs in ranges[:-1] - - By assigning ranges[-1] to each assignment k in 'init_assignment' or - in a new assignment, the problem reduces to finding the best - assignment for ranges[0:-1] given the initial assignment - init_assignment[0:k, (k+1):-1]. - ''' - # Blob to check - find_range = ranges[-1] - # Blobs in ranges[0:-1] are incompatible with ranges[-1] so that we can - # reduce it to a smaller problem. - assert all(not is_compatible(x[1], [find_range], []) for x in ranges[0:-1]) - - sz = len(init_assignment) - best_candidates = [] - # Try to assign 'find_range' to each assignment in init_assignment - for ii in range(sz): - if not is_compatible(find_range[1], init_assignment[ii], []): - continue - cur_best = copy.deepcopy(init_assignment) - cur_best[ii].append(find_range) - if len(ranges) > 1: - cur_best_tmp = [x for i, x in enumerate(cur_best) if i != ii] - # reduce to a smaller dp problem - cur_best_tmp = compute_assignments_dp( - ranges[:-1], cur_best_tmp, counter) - cur_best = cur_best_tmp + [cur_best[ii]] - best_candidates.append(cur_best) - # Try to put 'find_range' in a new assignment - best_candidates.append(prev_best_assignment + [[find_range]]) - - ret = min(best_candidates, key=get_memory_usage) - return ret - - if not counter: - counter = [0] - counter[0] += 1 - - if counter and counter[0] % 5000 == 0: - rs = [ranges_sorted[0][1].defined, ranges_sorted[-1][1].used] - log.info('Finding assignments {} ({} -> {})...'.format( - counter[0], rs[0], rs[1])) - - init_assignment = init_assignment or [] - # best_assignments[k]: best assignments for first k blobs ranges_sorted[0:(k+1)] - best_assignments = [] - # Find best assignment for blobs ranges_sorted[0:ii] - for ii, cur_range in enumerate(ranges_sorted): - # closest best_assignment that is independent of ranges_sorted[ii] - prev_idx = _get_compatible_prev(cur_range, best_assignments, ii) - prev_best = copy.deepcopy(init_assignment) if prev_idx < 0 else \ - copy.deepcopy(best_assignments[prev_idx]) - # Need to find best assignment for blobs in 'ranges_part' - ranges_part = ranges_sorted[(prev_idx + 1):(ii + 1)] - cur_best = _find_best( - ranges_part, prev_best, - best_assignments[-1] if best_assignments else init_assignment, - counter) - assert _get_count(cur_best) == _get_count(prev_best) + len(ranges_part) - best_assignments.append(copy.deepcopy(cur_best)) - - assert len(best_assignments) == len(ranges_sorted) - - best = best_assignments[-1] - - return best - - -def get_updated_ranges(ranges, max_live=None): - ''' Set LiveRange.defined = -1 if it is None - Set LiveRange.used = max_live if it is None - Set LiveRange.size = 1 if it is None - ''' - - def _get_max_live(ranges): - max_live = max(x[1].used for x in ranges if x[1].used) + 1 - return max_live - - def _update_range(x, max_live, size): - cx = x - if x[1].defined is None: - cx = (cx[0], cx[1]._replace(defined=-1)) - if x[1].used is None: - cx = (cx[0], cx[1]._replace(used=max_live)) - if x[1].size is None: - cx = (cx[0], cx[1]._replace(size=size)) - return cx - - if max_live is None: - max_live = _get_max_live(ranges) - ranges = [_update_range(x, max_live, 1) for x in ranges] - - return ranges - - -def compute_assignments(ranges, static_blobs, algo): - ''' - algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or - AssignmentAlgorithm.DYNAMIC_PROGRAMMING). - AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the - cost of more computation.
- AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is - not provided. - ''' - - # Sort the ranges based on when they are last used. - # If LiveRange.used is None, then the blob is never used and could - # be consumed externally. Sort these to the end of the list as opposed - # to the beginning so that they can be shared as well. - ranges = sorted( - ranges.items(), - key=lambda p: (p[1].used is None, p[1].used), - ) - # Update None values - ranges = get_updated_ranges(ranges) - - # Sharable blobs - ranges_sharable = [x for x in ranges if x[0] not in static_blobs] - # Static blobs, not sharable - ranges_static = [x for x in ranges if x[0] in static_blobs] - - log.info("Total sharable blobs {}".format(len(ranges_sharable))) - - best_assignment = [] - if algo == AssignmentAlgorithm.DYNAMIC_PROGRAMMING: - best_assignment = compute_assignments_dp(ranges_sharable, []) - elif algo == AssignmentAlgorithm.GREEDY: - best_assignment = compute_assignments_greedy(ranges_sharable, []) - else: - raise ValueError("Invalid algo name {}".format(algo)) - best_assignment += [[x] for x in ranges_static] - - # verify_assignments(best_assignment) - - return best_assignment - - -def verify_assignments(assignments): - for cur in assignments: - for x, y in zip(cur[0:-1], cur[1:]): - assert x[1].used < y[1].defined - - -def compute_interference_graph(ops): - g = nx.DiGraph() - for i, op in enumerate(ops): - g.add_node(i, op=op) - for i, parent_op in enumerate(ops): - for j, child_op in enumerate(ops): - if i >= j: - continue - if any(output in child_op.input for output in parent_op.output): - deps = set(child_op.input).intersection(parent_op.output) - g.add_edge(i, j, deps=deps) - assert nx.is_directed_acyclic_graph(g), child_op - return g - - -Optimization = collections.namedtuple( - 'Optimization', ['net', 'assignments', 'blob_assignments']) - - -def apply_assignments(net, blob_assignments): - def canonical_name(blob): - if blob not in blob_assignments: - return blob - return blob_assignments[blob] - - for op in net.op: - # Descend into subnets of the recurrent network - if op.type.startswith('RecurrentNetwork'): - apply_recurrent_blob_assignments(op, blob_assignments, canonical_name) - - for i, input_ in enumerate(op.input): - op.input[i] = canonical_name(input_) - for i, output in enumerate(op.output): - op.output[i] = canonical_name(output) - - -def apply_recurrent_blob_assignments(op, blob_assignments, canonical_name): - log.debug("Applying assignments to recurrent op: {}".format(op.type)) - - # Apply on alias_dst - alias_dst_args = [a for a in op.arg if a.name.endswith("alias_dst")] - for alias_dst in alias_dst_args: - for i, blob in enumerate(alias_dst.strings): - alias_dst.strings[i] = canonical_name(blob.decode()).encode() - - # Apply on link_external - link_external_args = [a for a in op.arg if a.name.endswith("link_external")] - for link_external in link_external_args: - for i, blob in enumerate(link_external.strings): - link_external.strings[i] = canonical_name(blob.decode()).encode() - - # Recurse into step nets - step_args = [a for a in op.arg if a.name.endswith("step_net")] - for step_arg in step_args: - apply_assignments(step_arg.n, blob_assignments) - for i, einp in enumerate(step_arg.n.external_input): - if einp in blob_assignments: - step_arg.n.external_input[i] = canonical_name(einp) - - # Store renamings - for blob, renamed in blob_assignments.items(): - if blob in list(op.input) + list(op.output): - a = caffe2_pb2.Argument() - a.name = blob + ".rename" - a.s =
str(renamed).encode("ascii") - op.arg.extend([a]) - - -class AssignmentAlgorithm(enum.Enum): - GREEDY = 0 - DYNAMIC_PROGRAMMING = 1 - - -def optimize_inference_fast(net, static_blobs): - optim = caffe2_pb2.NetDef() - optim_str = C.memonger_optimize_inference_net( - net.SerializeToString(), - [str(s).encode('utf-8') for s in static_blobs] - ) - optim.ParseFromString(optim_str) - return optim - - -def optimize_interference(net, static_blobs, - ordering_function=topological_sort_traversal, - blob_sizes=None, - algo=AssignmentAlgorithm.GREEDY): - """ - ordering_function: topological_sort_traversal or - topological_sort_traversal_longest_path. - topological_sort_traversal_longest_path gives better - results but needs a bit more computation. - algo: Method used to find assignments (AssignmentAlgorithm.GREEDY or - AssignmentAlgorithm.DYNAMIC_PROGRAMMING). - AssignmentAlgorithm.DYNAMIC_PROGRAMMING gives optimal solution at the - cost of more computation. - AssignmentAlgorithm.GREEDY may be better in the case 'blob_sizes' is - not provided. - """ - - """ - 1) Use a BFS traversal of the execution graph to generate an - ordering of the node executions. - 2) Generate use-def ranges for each `blob` in the BFS traversal - order. - 3) Assign blobs to `canonical blobs` - 4) Rename blobs to canonical blobs - """ - - net = copy.deepcopy(net) - g = compute_interference_graph(net.op) - ordering = ordering_function(g) - linearized_ops = [net.op[i] for i in ordering] - - # Reorder ops in net based on the computed linearlized order. - # If the graph has multiple topological orderings and if the NetDef's - # ordering differs from the order used to compute ranges, then the - # runtime might end up overwriting blobs before they are used. - del net.op[:] - net.op.extend(linearized_ops) - - ranges = compute_ranges(linearized_ops, blob_sizes) - assignments = compute_assignments(ranges, static_blobs, algo) - blob_assignments = compute_blob_assignments(assignments) - apply_assignments(net, blob_assignments) - return Optimization( - net=net, - blob_assignments=blob_assignments, - assignments=assignments) - - -def verify_inplace_blobs(net_a, net_b): - """ - Verifies that net_a and net_b have the same in-place blob assignments. - Particularly, that memonger did not add an in-place assignment when that - did not exist before. - """ - def get_inplaces(op): - out = list(op.output) - inplaces = [] - for j, inp in enumerate(op.input): - if inp in out: - inplaces.append([j, out.index(inp)]) - return inplaces - - for op_a, op_b in zip(net_a.op, net_b.op): - if op_a.type != op_b.type: - return False - if get_inplaces(op_a) != get_inplaces(op_b): - return False - return True - - -def verify_graph_equality(net_a, net_b): - """ - Determines if the execution of two graphs are identical. - That is, all inputs blobs are mapped to the same output blobs - for each operator in their respective positions. - - This is meant to check the output of memonger with the original graph. - It assumes that the nets have same external input and output. 
- - O(E) runtime + O(1) amortized cost to hash for python dict - """ - - def parent_list(ops): - parent_list = [[] for _ in ops] - edge_owner = {} - for i, op in enumerate(ops): - for blob in op.input: - parent_id = edge_owner.get(blob) - if parent_id is not None: - parent_list[i].append(parent_id) - for blob in op.output: - edge_owner[blob] = i - - return parent_list - - # Operator wise equality checks - if (len(net_a.op) != len(net_b.op)): - return False - for op_a, op_b in zip(net_a.op, net_b.op): - if (op_a.type != op_b.type or - op_a.device_option != op_b.device_option or - op_a.engine != op_b.engine): - return False - - # Print debug info - parent_list_a = parent_list(net_a.op) - parent_list_b = parent_list(net_b.op) - if parent_list_a != parent_list_b: - j = 0 - for a, b in zip(parent_list_a, parent_list_b): - if a != b: - print("Difference {} vs {} \n {}".format( - j, net_a.op[j], net_b.op[j])) - print("Parents: {} vs {}".format(a, b)) - - j += 1 - - # Net wise equality check - return parent_list_a == parent_list_b - - -Statistics = collections.namedtuple( - 'Statistics', ['baseline_nbytes', 'optimized_nbytes']) - - -def blob_nbytes(blob): - sz = 0 - try: - sz = workspace.FetchBlob(blob).nbytes - except Exception: - log.warning('Error when fetching blob {}'.format(blob)) - return sz - - -def compute_statistics(assignments): - blob_bytes = { - blob: blob_nbytes(blob) for assignment in assignments - for (blob, _) in assignment} - baseline_nbytes = sum(blob_bytes.values()) - optimized_nbytes = sum( - max(blob_bytes[blob] for (blob, _) in assignment) - for assignment in assignments) - return Statistics( - baseline_nbytes=baseline_nbytes, - optimized_nbytes=optimized_nbytes) - - -def collect_blob_sizes(net): - blobs = {} - for op in net.op: - for blob in op.input: - blobs[blob] = blob_nbytes(blob) - for blob in op.output: - blobs[blob] = blob_nbytes(blob) - - return blobs diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py deleted file mode 100644 index b4f7a62a6893..000000000000 --- a/caffe2/python/memonger_test.py +++ /dev/null @@ -1,841 +0,0 @@ -import numpy as np - -from caffe2.python import workspace, memonger, core, model_helper, brew -from caffe2.proto import caffe2_pb2 -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -from hypothesis import given, settings -import unittest - - -def has_blob(proto, needle): - for op in proto.op: - for inp in op.input: - if inp == needle: - return True - for outp in op.output: - if outp == needle: - return True - return False - - -def count_blobs(proto): - blobs = set() - for op in proto.op: - blobs = blobs.union(set(op.input)).union(set(op.output)) - return len(blobs) - - -class MemongerTest(hu.HypothesisTestCase): - @given(input_dim=st.integers(min_value=1, max_value=10), - output_dim=st.integers(min_value=1, max_value=10), - batch_size=st.integers(min_value=1, max_value=10), - do=st.sampled_from(hu.device_options), - algo=st.sampled_from(memonger.AssignmentAlgorithm)) - @settings(max_examples=5, deadline=None) - def test_simple_memonger(self, input_dim, output_dim, batch_size, do, algo): - m = model_helper.ModelHelper() - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - - fc3.Relu([], fc3)\ - .Softmax([], "pred") \ - .LabelCrossEntropy(["label"], ["xent"]) \ - .AveragedLoss([], "loss") - input_to_grad = 
m.AddGradientOperators(["loss"]) - m.net.Proto().device_option.CopyFrom(do) - m.param_init_net.Proto().device_option.CopyFrom(do) - static_blobs = \ - [o for op in m.param_init_net.Proto().op for o in op.output] + \ - ["data", "label", "loss", input_to_grad["fc1_w"]] - - optimization = memonger.optimize_interference( - m.Proto(), static_blobs, algo=algo) - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("data", data, device_option=do) - workspace.FeedBlob("label", label, device_option=do) - workspace.RunNetOnce(m.net) - loss = workspace.FetchBlob("loss") - grad = workspace.FetchBlob(str(input_to_grad["fc1_w"])) - workspace.RunNetOnce(optimization.net) - optimized_loss = workspace.FetchBlob("loss") - optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"])) - np.testing.assert_almost_equal(loss, optimized_loss) - np.testing.assert_almost_equal(grad, optimized_grad) - stats = memonger.compute_statistics(optimization.assignments) - self.assertLess(stats.optimized_nbytes, stats.baseline_nbytes) - - # run with blob sizes - blob_sizes = memonger.collect_blob_sizes(m.Proto()) - optimization1 = memonger.optimize_interference( - m.Proto(), static_blobs, blob_sizes=blob_sizes, algo=algo) - workspace.RunNetOnce(optimization1.net) - optimized_loss = workspace.FetchBlob("loss") - optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"])) - np.testing.assert_almost_equal(loss, optimized_loss) - np.testing.assert_almost_equal(grad, optimized_grad) - stats = memonger.compute_statistics(optimization1.assignments) - self.assertLessEqual(stats.optimized_nbytes, stats.baseline_nbytes) - - @given(input_dim=st.integers(min_value=1, max_value=10), - output_dim=st.integers(min_value=1, max_value=10), - batch_size=st.integers(min_value=1, max_value=10), - do=st.sampled_from(hu.device_options)) - @settings(max_examples=5, deadline=None) - def test_fast_memonger(self, input_dim, output_dim, batch_size, do): - m = model_helper.ModelHelper() - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - - fc3.Relu([], fc3)\ - .Softmax([], "pred") \ - .LabelCrossEntropy(["label"], ["xent"]) \ - .AveragedLoss([], "loss") - input_to_grad = m.AddGradientOperators(["loss"]) - m.net.Proto().device_option.CopyFrom(do) - m.param_init_net.Proto().device_option.CopyFrom(do) - static_blobs = \ - [o for op in m.param_init_net.Proto().op for o in op.output] + \ - ["data", "label", "loss", input_to_grad["fc1_w"]] - - optimized_net = memonger.optimize_inference_fast( - m.Proto(), static_blobs) - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("data", data, device_option=do) - workspace.FeedBlob("label", label, device_option=do) - workspace.RunNetOnce(m.net) - loss = workspace.FetchBlob("loss") - grad = workspace.FetchBlob(str(input_to_grad["fc1_w"])) - workspace.RunNetOnce(optimized_net) - optimized_loss = workspace.FetchBlob("loss") - optimized_grad = workspace.FetchBlob(str(input_to_grad["fc1_w"])) - np.testing.assert_almost_equal(loss, optimized_loss) - np.testing.assert_almost_equal(grad, optimized_grad) - - 
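        # Numerical equivalence alone is not enough; the optimized net should
        # also use strictly fewer distinct blobs than the original: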
self.assertLess(count_blobs(optimized_net), count_blobs(m.Proto())) - - def test_fast_memonger_unique_outputs(self): - m = model_helper.ModelHelper() - fc = [] - for i in range(2): - z = brew.fc( - m, "data{}".format(i), "fc{}".format(i), dim_in=2, dim_out=2) - fc.append(z) - r = [] - # The trick here is to have the same input appear twice in the same Sum - for x in fc: - for y in fc: - r.append(brew.sum(m, [x, y], 1)) - concated = brew.concat(m, r, "concated") - brew.relu(m, concated, "merged") - - static_blobs = \ - [o for op in m.param_init_net.Proto().op for o in op.output] + \ - ["merged"] + ["data{}".format(i) for i in range(len(fc))] - - optimized_net = memonger.optimize_inference_fast( - m.Proto(), static_blobs) - for op in optimized_net.op: - self.assertEqual(len(op.output), len(set(op.output)), str(op)) - - @given(input_dim=st.integers(min_value=1, max_value=4), - output_dim=st.integers(min_value=1, max_value=4), - batch_size=st.integers(min_value=1, max_value=4)) - def test_gradient_optim(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim) - fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim) - fc5.Relu([], fc5)\ - .Softmax([], "pred") \ - .LabelCrossEntropy(["label"], ["xent"]) \ - .AveragedLoss([], "loss") - input_to_grad = m.AddGradientOperators(["name_x/loss"]) - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.share_grad_blobs( - m.net, - ["name_x/loss"], - set(m.param_to_grad.values()), - "name_x/", - share_activations=False, - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - optim_proto_wacts = memonger.share_grad_blobs( - m.net, - ["name_x/loss"], - set(m.param_to_grad.values()), - "name_x/", - share_activations=True, - dont_share_blobs=set([str(input_to_grad["name_x/fc1_w"])]), - ) - blobs_wact_optim = count_blobs(optim_proto_wacts) - self.assertLessEqual(blobs_wact_optim, blobs_after) - - # Check that the last activations are not shared - self.assertTrue(has_blob(optim_proto, "name_x/fc5")) - self.assertTrue( - has_blob(optim_proto_wacts, "name_x/fc5"), - "Don't remap final activation", - ) - - # Test networks produce exactly same gradients - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("name_x/data", data) - workspace.FeedBlob("name_x/label", label) - workspace.RunNetOnce(m.net) - loss = workspace.FetchBlob("name_x/loss") - grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"])) - workspace.RunNetOnce(optim_proto) - optimized_loss = workspace.FetchBlob("name_x/loss") - optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"])) - np.testing.assert_almost_equal(loss, optimized_loss) - np.testing.assert_almost_equal(grad, optimized_grad) - - workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0])) - - # Run with the forward optimization - workspace.RunNetOnce(optim_proto_wacts) - optimized_loss = workspace.FetchBlob("name_x/loss") - optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"])) - np.testing.assert_almost_equal(loss, optimized_loss) -
np.testing.assert_almost_equal(grad, optimized_grad) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") - def test_memonger_mix_cpu_gpu(self): - ''' - Check that memonger does not make blobs cross CPU/GPU boundary - ''' - m = model_helper.ModelHelper() - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2) - fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2) - fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2) - fc4 = brew.fc(m, fc3, "fc4", dim_in=2, dim_out=2) - fc4_cpu = m.net.CopyGPUToCPU(fc4, "fc4_cpu") - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - fc5_cpu = brew.fc(m, fc4_cpu, "fc5_cpu", dim_in=2, dim_out=2) - fc6_cpu = brew.fc(m, fc5_cpu, "fc6_cpu", dim_in=2, dim_out=2) - fc7_cpu = brew.fc(m, fc6_cpu, "fc7_cpu", dim_in=2, dim_out=2) - fc7_cpu.Relu([], fc7_cpu) \ - .Softmax([], "pred") \ - .LabelCrossEntropy(["label"], ["xent"]) \ - .AveragedLoss([], "loss") - m.AddGradientOperators(["loss"]) - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.share_grad_blobs( - m.net, - ["loss"], - set(m.param_to_grad.values()), - "", - share_activations=True, - dont_share_blobs=set(), - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - # Create set of blobs on CPU side and GPU side and check they don't - # overlap - device_blobs = {caffe2_pb2.CPU: set(), workspace.GpuDeviceType: set()} - for op in optim_proto.op: - if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]: - dev = op.device_option.device_type - for b in list(op.input) + list(op.output): - device_blobs[dev].add(b) - - device_crossers = device_blobs[caffe2_pb2.CPU].intersection( - device_blobs[workspace.GpuDeviceType] - ) - self.assertEqual(device_crossers, set()) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - @settings(deadline=1000) - def test_gradient_optim_tree(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim) - fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim) - fc5.Relu([], fc5) \ - .Softmax([], "pred1") \ - .LabelCrossEntropy(["label"], ["xent1"]) \ - .AveragedLoss([], "loss1") - fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim) - fc6.Relu([], fc6) \ - .Softmax([], "pred2") \ - .LabelCrossEntropy(["label"], ["xent2"]) \ - .AveragedLoss([], "loss2") - input_to_grad = m.AddGradientOperators(["name_x/loss1", "name_x/loss2"]) - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.share_grad_blobs( - m.net, - ["name_x/loss1", "name_x/loss2"], - set(m.param_to_grad.values()), - "name_x", # "name_x//shared_gradinp_0_shared" if using "name_x/" - share_activations=True, - dont_share_blobs=set(['name_x/fc6', 'name_x/fc5', - str(input_to_grad["name_x/fc1_w"])]), - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - self.assertTrue(has_blob(optim_proto, "name_x/fc6")) - - # Test networks produce exactly same gradients - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, 
size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("name_x/data", data) - workspace.FeedBlob("name_x/label", label) - workspace.RunNetOnce(m.net) - loss1 = workspace.FetchBlob("name_x/loss1") - loss2 = workspace.FetchBlob("name_x/loss2") - grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"])) - workspace.FeedBlob(str(input_to_grad["name_x/fc1_w"]), np.array([0.0])) - - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob("name_x/loss1") - optimized_loss2 = workspace.FetchBlob("name_x/loss2") - optimized_grad = workspace.FetchBlob(str(input_to_grad["name_x/fc1_w"])) - np.testing.assert_almost_equal(loss1, optimized_loss1) - np.testing.assert_almost_equal(loss2, optimized_loss2) - np.testing.assert_almost_equal(grad, optimized_grad) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - @settings(deadline=1000) - def test_forward_optim_tree_daggy(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim) - fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim) - - # Branch - fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim) - fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim) - fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim) - - fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum") - - fc5.Relu([], fc5sum) \ - .Softmax([], "pred1") \ - .LabelCrossEntropy(["label"], ["xent1"]) \ - .AveragedLoss([], "loss1") - fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim) - fc6.Relu([], fc6) \ - .Softmax([], "pred2") \ - .LabelCrossEntropy(["label"], ["xent2"]) \ - .AveragedLoss([], "loss2") - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.optimize_inference_for_dag( - m.net, ["name_x/data"], "name_x" - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - # Test networks produce exactly same results - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("name_x/data", data) - workspace.FeedBlob("name_x/label", label) - workspace.RunNetOnce(m.net) - loss1 = workspace.FetchBlob("name_x/loss1") - loss2 = workspace.FetchBlob("name_x/loss2") - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob("name_x/loss1") - optimized_loss2 = workspace.FetchBlob("name_x/loss2") - np.testing.assert_almost_equal(loss1, optimized_loss1) - np.testing.assert_almost_equal(loss2, optimized_loss2) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - @settings(deadline=10000) - def test_forward_optim_tree_harder(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.net.Proto().type = "dag" - m.net.Proto().num_workers = 4 - m.net.AddExternalInput("label") - m.net.AddExternalInput("data") - - with 
core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim) - fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim) - - # Branch - fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim) - fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim) - fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim) - - fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum") - fc5sum.Relu([], "relu1") \ - .Softmax([], "pred1") \ - .LabelCrossEntropy(["label"], ["xent1"]) \ - .AveragedLoss([], "loss1") - fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim) - fc6.Relu([], fc6) \ - .Softmax([], "pred2") \ - .LabelCrossEntropy(["label"], ["xent2"]) \ - .AveragedLoss([], "loss2") - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.optimize_inference_for_dag( - m.net, ["name_x/data"], "name_x/" - ) - - blobs_after = count_blobs(optim_proto) - - # Extra test with when one of the parameters is also an input. - # This caused a bug before. - optim_proto_extra_input = memonger.optimize_inference_for_dag( - m.net, ["name_x/data", "name_x/fc1_w"], "name_x/" - ) - blobs_after_extra_input = count_blobs(optim_proto_extra_input) - self.assertEqual(blobs_after, blobs_after_extra_input) - ### - - print(str(optim_proto)) - self.assertLess(blobs_after, blobs_before) - - # Test networks produce exactly same results - data = np.random.randn(batch_size, input_dim).astype(np.float32) - label = np.random.randint( - low=0, high=output_dim, size=(batch_size,)).astype(np.int32) - workspace.RunNetOnce(m.param_init_net) - workspace.FeedBlob("name_x/data", data) - workspace.FeedBlob("name_x/label", label) - workspace.RunNetOnce(m.net) - loss1 = workspace.FetchBlob("name_x/loss1") - loss2 = workspace.FetchBlob("name_x/loss2") - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob("name_x/loss1") - optimized_loss2 = workspace.FetchBlob("name_x/loss2") - np.testing.assert_almost_equal(loss1, optimized_loss1) - np.testing.assert_almost_equal(loss2, optimized_loss2) - - # This test reproduces scenario where dag traversal for finding - # shared blobs was not always starting from ops with in degree of 0 - @settings(deadline=10000) - def test_forward_optim_tree_dag_traversal(self): - input_dim = 4 - output_dim = 4 - batch_size = 4 - - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - - fc3 = brew.fc(m, fc2, "fc3", dim_in=output_dim, dim_out=output_dim) - fc4 = brew.fc(m, fc3, "fc4", dim_in=output_dim, dim_out=output_dim) - fc5 = brew.fc(m, fc4, "fc5", dim_in=output_dim, dim_out=output_dim) - - # Branch - fc3b = brew.fc(m, fc2, "fc3b", dim_in=output_dim, dim_out=output_dim) - fc4b = brew.fc(m, fc3b, "fc4b", dim_in=output_dim, dim_out=output_dim) - fc5b = brew.fc(m, fc4b, "fc5b", dim_in=output_dim, dim_out=output_dim) - - fc5sum = brew.sum(m, [fc5, fc5b], "fc5sum") - - fc5.Relu([], fc5sum) \ - .Softmax([], "pred1") \ - .LabelCrossEntropy(["label"], ["xent1"]) \ - .AveragedLoss([], "loss1") - fc6 = brew.fc(m, fc5, "fc6", dim_in=output_dim, dim_out=output_dim) - fc6.Relu([], fc6) \ - 
.Softmax([], "pred2") \ - .LabelCrossEntropy(["label"], ["xent2"]) \ - .AveragedLoss([], "loss2") - - blobs_before = count_blobs(m.net.Proto()) - # adding name_x/fc5_w as heads (which belongs to non-root op) - # to make sure that dag traversal always starts from root ops - optim_proto = memonger.optimize_inference_for_dag( - m.net, ["name_x/fc5_w", "name_x/data"], "name_x" - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - # This is specifically to verify the op schema check being done in memonger - def test_forward_optim_tree_enforce_inplace_op_invalid(self): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - - net = m.net - net.IndexFreeze("A", "B") # enforce inplace op - net.Sum(["B", "B"], "C") - net.Relu("C", "D") - net.Sum(["D", "D"], "E") - - with self.assertRaises(RuntimeError): - memonger.optimize_inference_for_dag(net, ["A"], "") - - # Here inplace op is specifically a root op to repro the scenario where dag - # memonger could treat all the output blobs as shareable blobs and fails - # assertion of input blob with the same name not allowed to share - def test_forward_optim_tree_enforce_inplace_op_valid_and_as_head(self): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - - net = m.net - net.IndexFreeze("A", "A") # enforce inplace op - net.Sum(["A", "A"], "B") - net.Relu("B", "C") - net.Relu("C", "D") - net.Sum(["D", "D"], "E") - - blobs_before = count_blobs(m.net.Proto()) - optim_proto = memonger.optimize_inference_for_dag( - net, ["A"], "" - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - def test_rnn(self): - from caffe2.python import rnn_cell - T = 5 - model = model_helper.ModelHelper() - seq_lengths, labels = \ - model.net.AddExternalInputs( - 'seq_lengths', 'labels', - ) - init_blobs = [] - for i in range(2): - hidden_init, cell_init = model.net.AddExternalInputs( - "hidden_init_{}".format(i), - "cell_init_{}".format(i) - ) - init_blobs.extend([hidden_init, cell_init]) - model.param_init_net.ConstantFill([], ["input"], shape=[T, 4, 10]) - output, last_hidden, _, last_state = rnn_cell.LSTM( - model=model, - input_blob="input", - seq_lengths=seq_lengths, - initial_states=init_blobs, - dim_in=10, - dim_out=[10, 10], - scope="lstm1", - forward_only=False, - drop_states=True, - return_last_layer_only=True, - ) - softmax, loss = model.net.SoftmaxWithLoss( - [model.Flatten(output), "labels"], - ['softmax', 'loss'], - ) - - model.AddGradientOperators([loss]) - blobs_before = count_blobs(model.net.Proto()) - optim_proto = memonger.share_grad_blobs( - model.net, - ["loss"], - set(model.param_to_grad.values()), - "", - share_activations=True, - dont_share_blobs=set(), - ) - blobs_after = count_blobs(optim_proto) - self.assertLess(blobs_after, blobs_before) - - # Run once to see all blobs are set up correctly - for init_blob in init_blobs: - workspace.FeedBlob(init_blob, np.zeros( - [1, 4, 10], dtype=np.float32 - )) - workspace.FeedBlob("seq_lengths", np.array([T] * 4, dtype=np.int32)) - workspace.FeedBlob("labels", np.random.rand(T).astype(np.int32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - def test_compute_interference_graph_inplace_ops(self): - m = model_helper.ModelHelper() - m.Copy("b1", "b1") - m.Copy("b1", "b1") - m.Copy("b1", "b1") - g = memonger.compute_interference_graph(m.net.Proto().op) - self.assertEqual(list(g.edges()), [(0, 1), (0, 2), (1, 2)]) - - def 
test_topological_sort_longest_path(self): - m = model_helper.ModelHelper() - # 0 - m.Copy("conv0_w_comp", "conv0_w") - # 1 - conv0 = brew.conv(m, "data", "conv0", 32, 32, 4) - # 2 - m.Copy("conv2_w", "conv2_w") - # 3 - brew.conv(m, conv0, "conv2", 16, 32, 4) - - g = memonger.compute_interference_graph(m.net.Proto().op) - - orders_org = memonger.topological_sort_traversal(g) - orders_gt_org = [2, 0, 1, 3] - self.assertEqual(orders_gt_org, list(orders_org)) - - orders = memonger.topological_sort_traversal_longest_path(g) - # longer path is in front of the shorter one - orders_gt = [0, 1, 2, 3] - self.assertEqual(orders_gt, list(orders)) - - def test_topological_sort_longest_path_multi_target(self): - # two outputs: conv2 and data4 - m = model_helper.ModelHelper() - # 0 - m.Copy("conv0_w_comp", "conv0_w") - # 1 - conv0 = brew.conv(m, "data", "conv0", 32, 32, 4) - # 2 - m.Copy("conv2_w", "conv2_w") - # 3 - brew.conv(m, conv0, "conv2", 16, 32, 4) - # 4 - m.Copy("data1", "data2") - # 5 - m.Copy("data2", "data3") - - g = memonger.compute_interference_graph(m.net.Proto().op) - - orders_org = memonger.topological_sort_traversal(g) - orders_gt_org = [4, 5, 2, 0, 1, 3] - self.assertEqual(orders_gt_org, list(orders_org)) - - orders = memonger.topological_sort_traversal_longest_path(g) - # longer path is in front of the shorter one - orders_gt = [0, 1, 2, 3, 4, 5] - self.assertEqual(orders_gt, list(orders)) - - def test_topological_sort_longest_path_single_node(self): - # single node - m = model_helper.ModelHelper() - # 0 - m.Copy("conv0_w_comp", "conv0_w") - - g = memonger.compute_interference_graph(m.net.Proto().op) - - orders_org = memonger.topological_sort_traversal(g) - orders_gt_org = [0] - self.assertEqual(orders_gt_org, list(orders_org)) - - orders = memonger.topological_sort_traversal_longest_path(g) - # longer path is in front of the shorter one - orders_gt = [0] - self.assertEqual(orders_gt, list(orders)) - - def test_compute_assignments_greedy(self): - LiveRange = memonger.LiveRange - ranges_sorted = [ - ('b1', LiveRange(1, 3, 10)), - ('b2', LiveRange(3, 4, 1)), - ('b3', LiveRange(5, 6, 1)), - ('b4', LiveRange(5, 7, 10)), - ] - assignment_gt = [ - [ranges_sorted[0], ranges_sorted[3]], - [ranges_sorted[1], ranges_sorted[2]], - ] - - best = memonger.compute_assignments_greedy(ranges_sorted, None) - self.assertEqual(memonger.get_memory_usage(best), 11) - self.assertEqual(best, assignment_gt) - - def test_compute_assignments_dp(self): - LiveRange = memonger.LiveRange - ranges_sorted = [ - ('b1', LiveRange(1, 3, 10)), - ('b2', LiveRange(3, 4, 1)), - ('b3', LiveRange(5, 6, 1)), - ('b4', LiveRange(5, 7, 10)), - ] - - best = memonger.compute_assignments_dp(ranges_sorted, None) - self.assertEqual(memonger.get_memory_usage(best), 11) - - def test_compute_assignments_dp1(self): - LiveRange = memonger.LiveRange - ranges_sorted = [ - ('b1', LiveRange(1, 2, 10)), - ('b2', LiveRange(4, 6, 1)), - ('b3', LiveRange(5, 6, 10)), - ] - - best = memonger.compute_assignments_dp(ranges_sorted, []) - self.assertEqual(memonger.get_memory_usage(best), 11) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - def test_verify_graph_equality(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "y", 
dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim) - brew.sum(m, [fc2, fc3], "out") - - m2 = model_helper.ModelHelper() - m2.Proto().type = "dag" - m2.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m2, "data", "other_x", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m2, fc1, "other_y", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m2, fc1, "other_z", dim_in=output_dim, dim_out=output_dim) - brew.sum(m2, [fc2, fc3], "out") - - self.assertTrue(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto())) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - def test_verify_graph_equality_harder(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2a = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc2b = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim) - fc3a = brew.fc(m, fc2a, "u", dim_in=output_dim, dim_out=output_dim) - fc3b = brew.fc(m, fc2b, "v", dim_in=output_dim, dim_out=output_dim) - brew.sum(m, [fc3a, fc3b], "out") - - m2 = model_helper.ModelHelper() - m2.Proto().type = "dag" - m2.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2a = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc2b = brew.fc(m2, fc1, "z", dim_in=output_dim, dim_out=output_dim) - fc3a = brew.fc(m2, fc2a, "y", dim_in=output_dim, dim_out=output_dim) - fc3b = brew.fc(m2, fc2b, "z", dim_in=output_dim, dim_out=output_dim) - brew.sum(m2, [fc3a, fc3b], "out") - - self.assertTrue(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto())) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - def test_verify_graph_inequality(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m, fc1, "z", dim_in=output_dim, dim_out=output_dim) - brew.sum(m, [fc2, fc3], "out") - - m2 = model_helper.ModelHelper() - m2.Proto().type = "dag" - m2.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc3 = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim) - brew.sum(m2, [fc2, fc3], "out") - - self.assertFalse(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto())) - - @given(input_dim=st.integers(min_value=4, max_value=4), - output_dim=st.integers(min_value=4, max_value=4), - batch_size=st.integers(min_value=4, max_value=4)) - def test_verify_graph_inequality_harder(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - m.Proto().type = "dag" - m.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2a = brew.fc(m, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc2b = brew.fc(m, fc1, "z", 
dim_in=output_dim, dim_out=output_dim) - fc3a = brew.fc(m, fc2a, "u", dim_in=output_dim, dim_out=output_dim) - fc3b = brew.fc(m, fc2b, "v", dim_in=output_dim, dim_out=output_dim) - brew.sum(m, [fc3a, fc3b], "out") - - m2 = model_helper.ModelHelper() - m2.Proto().type = "dag" - m2.Proto().num_workers = 4 - with core.NameScope("name_x"): - fc1 = brew.fc(m2, "data", "x", dim_in=input_dim, dim_out=output_dim) - fc2a = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc2b = brew.fc(m2, fc1, "y", dim_in=output_dim, dim_out=output_dim) - fc3a = brew.fc(m2, fc2a, "u", dim_in=output_dim, dim_out=output_dim) - fc3b = brew.fc(m2, fc2b, "v", dim_in=output_dim, dim_out=output_dim) - brew.sum(m2, [fc3a, fc3b], "out") - - self.assertFalse(memonger.verify_graph_equality(m.net.Proto(), m2.net.Proto())) - - def test_release_blobs_when_used(self): - m = model_helper.ModelHelper() - fc1 = brew.fc(m, "data", "x", dim_in=2, dim_out=2) - fc2 = brew.fc(m, fc1, "y", dim_in=2, dim_out=2) - fc3 = brew.fc(m, fc1, "z", dim_in=2, dim_out=2) - fc4 = brew.fc(m, fc2, "u", dim_in=2, dim_out=2) - m.net.Alias(["u"], ["u_alias"]) - - brew.sum(m, [fc3, fc4], "out") - - with_frees = memonger.release_blobs_when_used(m.net.Proto(), set("data")) - - expect_frees = {"x", "y", "z"} # out is external output - # and u is aliased so cannot be freed - found_frees = set() - for op in with_frees.op: - if op.type == "Free": - self.assertFalse(op.input[0] in found_frees) # no double frees - found_frees.add(op.input[0]) - else: - # Check a freed blob is not used anymore - for inp in op.input: - self.assertFalse(inp in found_frees) - for outp in op.output: - self.assertFalse(outp in found_frees) - - self.assertEqual(expect_frees, found_frees) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mint/__init__.py b/caffe2/python/mint/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/mint/app.py b/caffe2/python/mint/app.py deleted file mode 100644 index 868fcc6769a3..000000000000 --- a/caffe2/python/mint/app.py +++ /dev/null @@ -1,189 +0,0 @@ -## @package app -# Module caffe2.python.mint.app -import argparse -import flask -import glob -import numpy as np -import nvd3 -import os -import sys -# pyre-fixme[21]: Could not find module `tornado.httpserver`. -import tornado.httpserver -# pyre-fixme[21]: Could not find a module corresponding to import `tornado.wsgi` -import tornado.wsgi - -__folder__ = os.path.abspath(os.path.dirname(__file__)) - -app = flask.Flask( - __name__, - template_folder=os.path.join(__folder__, "templates"), - static_folder=os.path.join(__folder__, "static") -) -args = None - - -def jsonify_nvd3(chart): - chart.buildcontent() - # Note(Yangqing): python-nvd3 does not seem to separate the built HTML part - # and the script part. Luckily, it seems to be the case that the HTML part is - # only a
<div>, which can be accessed by chart.container, - # while the script part occupies the rest of the html content, which we can - # then find by chart.htmlcontent.find('<script>'). - script_start = chart.htmlcontent.find('<script>') + len('<script>') - script_end = chart.htmlcontent.find('</script>') - return flask.jsonify( - result=chart.container, - script=chart.htmlcontent[script_start:script_end].strip() - ) - - - def visualize_summary(filename): - try: - data = np.loadtxt(filename) - except Exception as e: - return 'Cannot load file {}: {}'.format(filename, str(e)) - chart_name = os.path.splitext(os.path.basename(filename))[0] - chart = nvd3.lineChart( - name=chart_name + '_summary_chart', - height=args.chart_height, - y_axis_format='.03g' - ) - if args.sample < 0: - step = max(data.shape[0] / -args.sample, 1) - else: - step = args.sample - xdata = np.arange(0, data.shape[0], step) - # data should have 4 dimensions. - chart.add_serie(x=xdata, y=data[xdata, 0], name='min') - chart.add_serie(x=xdata, y=data[xdata, 1], name='max') - chart.add_serie(x=xdata, y=data[xdata, 2], name='mean') - chart.add_serie(x=xdata, y=data[xdata, 2] + data[xdata, 3], name='m+std') - chart.add_serie(x=xdata, y=data[xdata, 2] - data[xdata, 3], name='m-std') - return jsonify_nvd3(chart) - - - def visualize_print_log(filename): - try: - data = np.loadtxt(filename) - if data.ndim == 1: - data = data[:, np.newaxis] - except Exception as e: - return 'Cannot load file {}: {}'.format(filename, str(e)) - chart_name = os.path.splitext(os.path.basename(filename))[0] - chart = nvd3.lineChart( - name=chart_name + '_log_chart', - height=args.chart_height, - y_axis_format='.03g' - ) - if args.sample < 0: - step = max(data.shape[0] / -args.sample, 1) - else: - step = args.sample - xdata = np.arange(0, data.shape[0], step) - # if there is only one curve, we also show the running min and max - if data.shape[1] == 1: - # We also print the running min and max for the steps. - trunc_size = data.shape[0] / step - running_mat = data[:trunc_size * step].reshape((trunc_size, step)) - chart.add_serie( - x=xdata[:trunc_size], - y=running_mat.min(axis=1), - name='running_min' - ) - chart.add_serie( - x=xdata[:trunc_size], - y=running_mat.max(axis=1), - name='running_max' - ) - chart.add_serie(x=xdata, y=data[xdata, 0], name=chart_name) - else: - for i in range(0, min(data.shape[1], args.max_curves)): - # data should have 4 dimensions. - chart.add_serie( - x=xdata, - y=data[xdata, i], - name='{}[{}]'.format(chart_name, i) - ) - - return jsonify_nvd3(chart) - - - def visualize_file(filename): - fullname = os.path.join(args.root, filename) - if filename.endswith('summary'): - return visualize_summary(fullname) - elif filename.endswith('log'): - return visualize_print_log(fullname) - else: - return flask.jsonify( - result='Unsupport file: {}'.format(filename), - script='' - ) - - - @app.route('/') - def index(): - files = glob.glob(os.path.join(args.root, "*.*")) - files.sort() - names = [os.path.basename(f) for f in files] - return flask.render_template( - 'index.html', - root=args.root, - names=names, - debug_messages=names - ) - - - @app.route('/visualization/<string:name>') - def visualization(name): - ret = visualize_file(name) - return ret - - - def main(argv): - parser = argparse.ArgumentParser("The mint visualizer.") - parser.add_argument( - '-p', - '--port', - type=int, - default=5000, - help="The flask port to use." - ) - parser.add_argument( - '-r', - '--root', - type=str, - default='.', - help="The root folder to read files for visualization."
- ) - parser.add_argument( - '--max_curves', - type=int, - default=5, - help="The max number of curves to show in a dump tensor." - ) - parser.add_argument( - '--chart_height', - type=int, - default=300, - help="The chart height for nvd3." - ) - parser.add_argument( - '-s', - '--sample', - type=int, - default=-200, - help="Sample every given number of data points. A negative " - "number means the total points we will sample on the " - "whole curve. Default 100 points." - ) - global args - args = parser.parse_args(argv) - server = tornado.httpserver.HTTPServer(tornado.wsgi.WSGIContainer(app)) - server.listen(args.port) - print("Tornado server starting on port {}.".format(args.port)) - tornado.ioloop.IOLoop.instance().start() - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/caffe2/python/mint/static/css/simple-sidebar.css b/caffe2/python/mint/static/css/simple-sidebar.css deleted file mode 100644 index 5d06c6d12156..000000000000 --- a/caffe2/python/mint/static/css/simple-sidebar.css +++ /dev/null @@ -1,125 +0,0 @@ -/*! - * Start Bootstrap - Simple Sidebar HTML Template (http://startbootstrap.com) - * Code licensed under the Apache License v2.0. - * For details, see http://www.apache.org/licenses/LICENSE-2.0. - */ - -/* Toggle Styles */ - -#wrapper { - padding-left: 0; - -webkit-transition: all 0.5s ease; - -moz-transition: all 0.5s ease; - -o-transition: all 0.5s ease; - transition: all 0.5s ease; -} - -#wrapper.toggled { - padding-left: 250px; -} - -#sidebar-wrapper { - z-index: 1000; - position: fixed; - left: 250px; - width: 0; - height: 100%; - margin-left: -250px; - overflow-y: auto; - background: rgb(193,237,201); - -webkit-transition: all 0.5s ease; - -moz-transition: all 0.5s ease; - -o-transition: all 0.5s ease; - transition: all 0.5s ease; -} - -#wrapper.toggled #sidebar-wrapper { - width: 250px; -} - -#page-content-wrapper { - width: 100%; - position: absolute; - padding: 15px; -} - -#wrapper.toggled #page-content-wrapper { - position: absolute; - margin-right: -250px; -} - -/* Sidebar Styles */ - -.sidebar-nav { - position: absolute; - top: 0; - width: 250px; - margin-bottom: 40px; - padding: 0; - list-style: none; -} - -.sidebar-nav li { - text-indent: 20px; - line-height: 30px; -} - -.sidebar-nav li a { - display: block; - text-decoration: none; - color: #999999; -} - -.sidebar-nav li a:hover { - text-decoration: none; - color: #000; - background: rgba(255,255,255,0.8); -} - -.sidebar-nav li a:active, -.sidebar-nav li a:focus { - text-decoration: none; -} - -.sidebar-nav > .sidebar-brand { - height: 65px; - font-size: 18px; - line-height: 60px; -} - -.sidebar-nav > .sidebar-brand a { - color: #999999; -} - -.sidebar-nav > .sidebar-brand a:hover { - color: #fff; - background: none; -} - -@media(min-width:768px) { - #wrapper { - padding-left: 250px; - } - - #wrapper.toggled { - padding-left: 0; - } - - #sidebar-wrapper { - width: 250px; - } - - #wrapper.toggled #sidebar-wrapper { - width: 0; - } - - #page-content-wrapper { - padding: 20px; - position: relative; - } - - #wrapper.toggled #page-content-wrapper { - position: relative; - margin-right: 0; - } -} diff --git a/caffe2/python/mint/templates/index.html b/caffe2/python/mint/templates/index.html deleted file mode 100644 index f3a1a7dde0eb..000000000000 --- a/caffe2/python/mint/templates/index.html +++ /dev/null @@ -1,134 +0,0 @@ - - - - - - - Mint - - - - - - - - - - - - - -
[The rest of this 134-line Jinja/HTML template is omitted: extraction stripped its markup (the stray "Mint" above is the page title). The surviving fragments are "Visualizing folder: {{ root }}", the "Toggle sidebar" and "Refresh all" controls, a {% for name in names %} loop of per-file chart panels ("Loading...", "Last updated: NA", "Top"), and a {% for message in debug_messages %} sidebar list.]

diff --git a/caffe2/python/mkl/__init__.py b/caffe2/python/mkl/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/mkl/mkl_LRN_op_test.py b/caffe2/python/mkl/mkl_LRN_op_test.py deleted file mode 100644 index fddb20e6bb14..000000000000 --- a/caffe2/python/mkl/mkl_LRN_op_test.py +++ /dev/null @@ -1,47 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") - - -class MKLLRNTest(hu.HypothesisTestCase): - @given(input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - im_size=st.integers(1, 10), - order=st.sampled_from(["NCHW"]), - **mu.gcs) - - def test_mkl_LRN(self, input_channels, - batch_size, im_size, order, - gc, dc): - op = core.CreateOperator( - "LRN", - ["X"], - ["Y", "Y_scale"], - size=5, - alpha=0.001, - beta=0.75, - bias=2.0, - order=order, - ) - X = np.random.rand( - batch_size, input_channels, im_size, im_size).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_LRN_speed_test.py b/caffe2/python/mkl/mkl_LRN_speed_test.py deleted file mode 100644 index c192137dc28c..000000000000 --- a/caffe2/python/mkl/mkl_LRN_speed_test.py +++ /dev/null @@ -1,79 +0,0 @@ - - - - -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util - - -@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") -class TestMKLBasic(test_util.TestCase): - def testLRNSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 2, 224, 224).astype(np.float32) - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.LRN("X", ["Y", "Y_Scale"], size=5, alpha=0.001, beta=0.75, bias=2.0, order="NCHW") - net.LRN("X_mkl", ["Y_mkl", "Y_Scale_mkl"], size=5, alpha=0.001, beta=0.75, bias=2.0, order="NCHW", device_option=mkl_do) - workspace.CreateNet(net) - workspace.RunNet(net) - - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("LRN CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - def testConvReluLRNSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 3, 224, 224).astype(np.float32) - 0.5 - W = np.random.rand(64, 3, 11, 11).astype(np.float32) - 0.5 - b = np.random.rand(64).astype(np.float32) - 0.5 - - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works.
- workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("W_mkl", W, device_option=mkl_do) - workspace.FeedBlob("b_mkl", b, device_option=mkl_do) - - net = core.Net("test") - - net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=11) - net.Conv(["X_mkl", "W_mkl", "b_mkl"], "C_mkl", - pad=1, stride=1, kernel=11, device_option=mkl_do) - net.Relu("C", "R") - net.Relu("C_mkl", "R_mkl", device_option=mkl_do) - net.LRN("R", ["Y", "Y_Scale"], size=5, alpha=0.001, beta=0.75, bias=2.0, order="NCHW") - net.LRN("R_mkl", ["Y_mkl", "Y_Scale_mkl"],size=5, alpha=0.001, beta=0.75, bias=2.0, order="NCHW", device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mkl/mkl_concat_op_test.py b/caffe2/python/mkl/mkl_concat_op_test.py deleted file mode 100644 index 8b01f8885b1c..000000000000 --- a/caffe2/python/mkl/mkl_concat_op_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf( - not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn." -) -class MKLConcatTest(hu.HypothesisTestCase): - @given( - batch_size=st.integers(1, 10), - channel_splits=st.lists(st.integers(1, 10), min_size=1, max_size=3), - height=st.integers(1, 10), - width=st.integers(1, 10), - **mu.gcs - ) - def test_mkl_concat( - self, batch_size, channel_splits, height, width, gc, dc - ): - Xs = [ - np.random.rand(batch_size, channel, - height, width).astype(np.float32) - for channel in channel_splits - ] - op = core.CreateOperator( - "Concat", - ["X_{}".format(i) for i in range(len(Xs))], - ["concat_result", "split_info"], - order="NCHW", - ) - self.assertDeviceChecks(dc, op, Xs, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_conv_op_test.py b/caffe2/python/mkl/mkl_conv_op_test.py deleted file mode 100644 index 74c4f2c6cde9..000000000000 --- a/caffe2/python/mkl/mkl_conv_op_test.py +++ /dev/null @@ -1,53 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLConvTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(8, 20), - input_channels=st.integers(1, 16), - output_channels=st.integers(1, 16), - batch_size=st.integers(1, 3), - use_bias=st.booleans(), - group=st.integers(1, 8), - **mu.gcs) - def test_mkl_convolution(self, stride, pad, kernel, size, - input_channels, output_channels, - batch_size, use_bias, group, gc, dc): - op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - group=group - ) - X = np.random.rand( - batch_size, input_channels * group, 
size, size).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels * group, input_channels, kernel, kernel) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels * group).astype(np.float32) - 0.5 - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_copy_op_test.py b/caffe2/python/mkl/mkl_copy_op_test.py deleted file mode 100644 index b2baeb9ef1af..000000000000 --- a/caffe2/python/mkl/mkl_copy_op_test.py +++ /dev/null @@ -1,68 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu -import caffe2.proto.caffe2_pb2 as pb2 - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKCopyTest(hu.HypothesisTestCase): - @given(width=st.integers(7, 9), - height=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - **mu.gcs) - def test_mkl_copy(self, - width, - height, - input_channels, - batch_size, - gc, dc): - X = np.random.rand( - batch_size, input_channels, width, height).astype(np.float32) - self.ws.create_blob("X").feed(X, pb2.DeviceOption()) - self.ws.run(core.CreateOperator( - "CopyCPUToMKL", - ["X"], - ["X_MKL"], - device_option=pb2.DeviceOption(device_type=pb2.MKLDNN) - )) - self.ws.run(core.CreateOperator( - "CopyMKLToCPU", - ["X_MKL"], - ["X_copy"], - device_option=pb2.DeviceOption(device_type=pb2.MKLDNN) - )) - np.testing.assert_array_equal(X, self.ws.blobs["X_copy"].fetch()) - - @given(n=st.sampled_from([0, 10])) - def test_mkl_zero_copy(self, n): - shape = (0, n) - X = np.zeros(shape=shape).astype(np.float32) - self.ws.create_blob("X").feed(X, pb2.DeviceOption()) - self.ws.run(core.CreateOperator( - "CopyCPUToMKL", - ["X"], - ["X_MKL"], - device_option=pb2.DeviceOption(device_type=pb2.MKLDNN) - )) - self.ws.run(core.CreateOperator( - "CopyMKLToCPU", - ["X_MKL"], - ["X_copy"], - device_option=pb2.DeviceOption(device_type=pb2.MKLDNN) - )) - np.testing.assert_equal(shape, self.ws.blobs["X_copy"].fetch().shape) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_elementwise_add_op_test.py b/caffe2/python/mkl/mkl_elementwise_add_op_test.py deleted file mode 100644 index 0709b5afd9f6..000000000000 --- a/caffe2/python/mkl/mkl_elementwise_add_op_test.py +++ /dev/null @@ -1,41 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLElementwiseAddTest(hu.HypothesisTestCase): - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inplace=st.booleans(), - **mu.gcs) - def test_mkl_elementwise_add(self, - size, - input_channels, - batch_size, - inplace, - gc, - dc): - op = core.CreateOperator( - "Add", - ["X0", "X1"], - ["X0" if inplace else "Y"], - ) - Xs = [np.random.rand(batch_size, input_channels, size, size).astype( - np.float32) for _ in range(2)] - self.assertDeviceChecks(dc, op, Xs, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git 
a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py b/caffe2/python/mkl/mkl_elementwise_sum_op_test.py deleted file mode 100644 index 3adec4848e50..000000000000 --- a/caffe2/python/mkl/mkl_elementwise_sum_op_test.py +++ /dev/null @@ -1,44 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLElementwiseSumTest(hu.HypothesisTestCase): - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inputs=st.integers(1, 3), - inplace=st.booleans(), - **mu.gcs) - def test_mkl_elementwise_sum(self, - size, - input_channels, - batch_size, - inputs, - inplace, - gc, - dc): - op = core.CreateOperator( - "Sum", - ["X_{}".format(i) for i in range(inputs)], - ["X_0" if inplace else "Y"], - ) - Xs = [np.random.rand(batch_size, input_channels, size, size).astype( - np.float32) for _ in range(inputs)] - self.assertDeviceChecks(dc, op, Xs, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_fc_op_test.py b/caffe2/python/mkl/mkl_fc_op_test.py deleted file mode 100644 index 180d93f26570..000000000000 --- a/caffe2/python/mkl/mkl_fc_op_test.py +++ /dev/null @@ -1,37 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLFcTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 5), m=st.integers(1, 5), - k=st.integers(1, 5), **mu.gcs) - - def test_mkl_fc(self,n, m, k, gc, dc): - X = np.random.rand(m, k).astype(np.float32) - 0.5 - W = np.random.rand(n, k).astype(np.float32) - 0.5 - b = np.random.rand(n).astype(np.float32) - 0.5 - - op = core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ["Y"] - ) - - self.assertDeviceChecks(dc, op, [X, W, b], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_fc_speed_test.py b/caffe2/python/mkl/mkl_fc_speed_test.py deleted file mode 100644 index 243e49c2f8f8..000000000000 --- a/caffe2/python/mkl/mkl_fc_speed_test.py +++ /dev/null @@ -1,96 +0,0 @@ - - - - -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util - - -@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") -class TestMKLBasic(test_util.TestCase): - def testFCSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 256, 6, 6).astype(np.float32) - 0.5 - #X = np.random.rand(32, 256*6*6).astype(np.float32) - 0.5 - W = np.random.rand(4096, 9216).astype(np.float32) - 0.5 - b = np.random.rand(4096).astype(np.float32) - 0.5 - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. 
- workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("W_mkl", W, device_option=mkl_do) - workspace.FeedBlob("b_mkl", b, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.FC(["X", "W", "b"], "Y") - net.FC(["X_mkl", "W_mkl", "b_mkl"], "Y_mkl", device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("FC CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - def testConvReluMaxPoolFcSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 256, 13, 13).astype(np.float32) - 0.5 - W = np.random.rand(256, 256, 3, 3).astype(np.float32) - 0.5 - b = np.random.rand(256).astype(np.float32) - 0.5 - - w_fc = np.random.rand(4096, 9216).astype(np.float32) - 0.5 - b_fc = np.random.rand(4096).astype(np.float32) - 0.5 - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.FeedBlob("w_fc", w_fc) - workspace.FeedBlob("b_fc", b_fc) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("W_mkl", W, device_option=mkl_do) - workspace.FeedBlob("b_mkl", b, device_option=mkl_do) - workspace.FeedBlob("w_fc_mkl", w_fc, device_option=mkl_do) - workspace.FeedBlob("b_fc_mkl", b_fc, device_option=mkl_do) - - net = core.Net("test") - - net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=3) - net.Relu("C", "R") - net.MaxPool("R", "P", stride=2, kernel=3) - net.FC(["P","w_fc", "b_fc"], "Y") - - net.Conv(["X_mkl", "W_mkl", "b_mkl"], "C_mkl", - pad=1, stride=1, kernel=3, device_option=mkl_do) - net.Relu("C_mkl", "R_mkl", device_option=mkl_do) - net.MaxPool("R_mkl", "P_mkl", - stride=2, kernel=3, device_option=mkl_do) - net.FC(["P_mkl","w_fc_mkl", "b_fc_mkl"], "Y_mkl", device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. 
- np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mkl/mkl_fill_op_test.py b/caffe2/python/mkl/mkl_fill_op_test.py deleted file mode 100644 index f233275786f7..000000000000 --- a/caffe2/python/mkl/mkl_fill_op_test.py +++ /dev/null @@ -1,37 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLFillTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 4), c=st.integers(1, 4), - h=st.integers(1, 4), w=st.integers(1, 4), - filler=st.sampled_from( - ["XavierFill", "ConstantFill", "GaussianFill", "MSRAFill"] - ), - seed=st.integers(5, 10), - **mu.gcs_cpu_mkl) - def test_mkl_fill(self, n, c, h, w, filler, seed, gc, dc): - op = core.CreateOperator( - filler, - [], - ["Y"], - shape=[n, c, h, w], - ) - for d in dc: - d.random_seed = seed - self.assertDeviceChecks(dc, op, [], [0]) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_pool_op_test.py b/caffe2/python/mkl/mkl_pool_op_test.py deleted file mode 100644 index a56e9448317a..000000000000 --- a/caffe2/python/mkl/mkl_pool_op_test.py +++ /dev/null @@ -1,47 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given, settings, assume -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLPoolTest(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - method=st.sampled_from(["MaxPool", "AveragePool"]), - **mu.gcs) - @settings(max_examples=2, deadline=100) - def test_mkl_pooling(self, stride, pad, kernel, size, - input_channels, batch_size, - method, gc, dc): - assume(pad < kernel) - op = core.CreateOperator( - method, - ["X"], - ["Y"], - stride=stride, - pad=pad, - kernel=kernel, - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_pool_speed_test.py b/caffe2/python/mkl/mkl_pool_speed_test.py deleted file mode 100644 index aa43aed97a09..000000000000 --- a/caffe2/python/mkl/mkl_pool_speed_test.py +++ /dev/null @@ -1,106 +0,0 @@ - - - - -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util - - -@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") -class TestMKLBasic(test_util.TestCase): - def testMaxPoolingSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 64, 224, 224).astype(np.float32) - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. 
- workspace.FeedBlob("X", X) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.MaxPool("X", "Y", stride=2, kernel=3) - net.MaxPool("X_mkl", "Y_mkl", - stride=2, kernel=3, device_option=mkl_do) - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("Maxpooling CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - def testAveragePoolingSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 64, 224, 224).astype(np.float32) - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.AveragePool("X", "Y", stride=2, kernel=3) - net.AveragePool("X_mkl", "Y_mkl", - stride=2, kernel=3, device_option=mkl_do) - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("Averagepooling CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - def testConvReluMaxPoolSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 3, 224, 224).astype(np.float32) - 0.5 - W = np.random.rand(64, 3, 11, 11).astype(np.float32) - 0.5 - b = np.random.rand(64).astype(np.float32) - 0.5 - - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("W_mkl", W, device_option=mkl_do) - workspace.FeedBlob("b_mkl", b, device_option=mkl_do) - - net = core.Net("test") - - net.Conv(["X", "W", "b"], "C", pad=1, stride=1, kernel=11) - net.Conv(["X_mkl", "W_mkl", "b_mkl"], "C_mkl", - pad=1, stride=1, kernel=11, device_option=mkl_do) - net.Relu("C", "R") - net.Relu("C_mkl", "R_mkl", device_option=mkl_do) - net.AveragePool("R", "Y", stride=2, kernel=3) - net.AveragePool("R_mkl", "Y_mkl", - stride=2, kernel=3, device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. 
- np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mkl/mkl_relu_op_test.py b/caffe2/python/mkl/mkl_relu_op_test.py deleted file mode 100644 index 76ec33bcbe91..000000000000 --- a/caffe2/python/mkl/mkl_relu_op_test.py +++ /dev/null @@ -1,36 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLReluTest(hu.HypothesisTestCase): - @given(size=st.integers(8, 20), - input_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - inplace=st.booleans(), - **mu.gcs) - def test_mkl_relu(self, size, input_channels, batch_size, inplace, gc, dc): - op = core.CreateOperator( - "Relu", - ["X"], - ["Y"] if not inplace else ["X"], - ) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_sbn_op_test.py b/caffe2/python/mkl/mkl_sbn_op_test.py deleted file mode 100644 index 86856b130d63..000000000000 --- a/caffe2/python/mkl/mkl_sbn_op_test.py +++ /dev/null @@ -1,82 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLSpatialBNTest(hu.HypothesisTestCase): - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(1, 3), - seed=st.integers(0, 65535), - #order=st.sampled_from(["NCHW", "NHWC"]), - order=st.sampled_from(["NCHW"]), - epsilon=st.floats(1e-5, 1e-2), - **mu.gcs) - def test_spatialbn_test_mode(self, size, input_channels, - batch_size, seed, order, epsilon, gc, dc): - np.random.seed(seed) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["Y"], - order=order, - is_test=True, - epsilon=epsilon, - ) - - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(1, 3), - seed=st.integers(0, 65535), - #order=st.sampled_from(["NCHW", "NHWC"]), - order=st.sampled_from(["NCHW"]), - epsilon=st.floats(1e-5, 1e-2), - **mu.gcs) - def test_spatialbn_train_mode( - self, size, input_channels, batch_size, seed, order, epsilon, - gc, dc): - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "running_mean", "running_var"], - ["Y", "running_mean", "running_var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - ) - np.random.seed(seed) - scale = 
np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - # Note: it seems that the running mean and var do not pass the device - # test, suggesting that the semantics are a bit different. Only - # checking the output and saved mean and var at this stage. - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], - [0, 3, 4]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_sbn_speed_test.py b/caffe2/python/mkl/mkl_sbn_speed_test.py deleted file mode 100644 index 05885ceca575..000000000000 --- a/caffe2/python/mkl/mkl_sbn_speed_test.py +++ /dev/null @@ -1,120 +0,0 @@ - - - - -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util - - -@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") -class TestMKLBasic(test_util.TestCase): - def testSpatialBNTestingSpeed(self): - - input_channel = 10 - X = np.random.rand(1, input_channel, 100, 100).astype(np.float32) - 0.5 - scale = np.random.rand(input_channel).astype(np.float32) + 0.5 - bias = np.random.rand(input_channel).astype(np.float32) - 0.5 - mean = np.random.randn(input_channel).astype(np.float32) - var = np.random.rand(input_channel).astype(np.float32) + 0.5 - - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("scale", scale) - workspace.FeedBlob("bias", bias) - workspace.FeedBlob("mean", mean) - workspace.FeedBlob("var", var) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("scale_mkl", scale, device_option=mkl_do) - workspace.FeedBlob("bias_mkl", bias, device_option=mkl_do) - workspace.FeedBlob("mean_mkl", mean, device_option=mkl_do) - workspace.FeedBlob("var_mkl", var, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.SpatialBN(["X", "scale", "bias","mean","var"], "Y", order="NCHW", - is_test=True, - epsilon=1e-5) - net.SpatialBN(["X_mkl", "scale_mkl", "bias_mkl","mean_mkl","var_mkl"], "Y_mkl", order="NCHW", - is_test=True, - epsilon=1e-5, device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("FC CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - def testSpatialBNTrainingSpeed(self): - input_channel = 10 - X = np.random.rand(1, input_channel, 100, 100).astype(np.float32) - 0.5 - scale = np.random.rand(input_channel).astype(np.float32) + 0.5 - bias = np.random.rand(input_channel).astype(np.float32) - 0.5 - mean = np.random.randn(input_channel).astype(np.float32) - var = np.random.rand(input_channel).astype(np.float32) + 0.5 - - #mean = np.zeros(input_channel) - #var = np.zeros(input_channel) - - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. 
- workspace.FeedBlob("X", X) - workspace.FeedBlob("scale", scale) - workspace.FeedBlob("bias", bias) - workspace.FeedBlob("mean", mean) - workspace.FeedBlob("var", var) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("scale_mkl", scale, device_option=mkl_do) - workspace.FeedBlob("bias_mkl", bias, device_option=mkl_do) - workspace.FeedBlob("mean_mkl", mean, device_option=mkl_do) - workspace.FeedBlob("var_mkl", var, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.SpatialBN(["X", "scale", "bias","mean", "var"], - ["Y", "mean", "var", "saved_mean", "saved_var"], - order="NCHW", - is_test=False, - epsilon=1e-5) - net.SpatialBN(["X_mkl", "scale_mkl", "bias_mkl","mean_mkl","var_mkl"], - ["Y_mkl", "mean_mkl", "var_mkl", "saved_mean_mkl", "saved_var_mkl"], - order="NCHW", - is_test=False, - epsilon=1e-5, - device_option=mkl_do) - - workspace.CreateNet(net) - workspace.RunNet(net) - - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - np.testing.assert_allclose( - workspace.FetchBlob("mean"), - workspace.FetchBlob("mean_mkl"), - atol=1e-2, - rtol=1e-2) - np.testing.assert_allclose( - workspace.FetchBlob("var"), - workspace.FetchBlob("var_mkl"), - atol=1e-2, - rtol=1e-2) - - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("FC CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mkl/mkl_sigmoid_op_test.py b/caffe2/python/mkl/mkl_sigmoid_op_test.py deleted file mode 100644 index abdb0983778d..000000000000 --- a/caffe2/python/mkl/mkl_sigmoid_op_test.py +++ /dev/null @@ -1,32 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf(not workspace.C.has_mkldnn, - "Skipping as we do not have mkldnn.") -class MKLSigmoidTest(hu.HypothesisTestCase): - @given(n=st.integers(1, 5), m=st.integers(1, 5), inplace=st.booleans(), - **mu.gcs) - def test_mkl_sigmoid(self, n, m, inplace, gc, dc): - X = np.random.rand(m, n).astype(np.float32) - op = core.CreateOperator( - "Sigmoid", - ["X"], - ["Y" if not inplace else "X"] - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl/mkl_speed_test.py b/caffe2/python/mkl/mkl_speed_test.py deleted file mode 100644 index ab2e4428519a..000000000000 --- a/caffe2/python/mkl/mkl_speed_test.py +++ /dev/null @@ -1,80 +0,0 @@ - - - - -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util - - -@unittest.skipIf(not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn.") -class TestMKLBasic(test_util.TestCase): - def testReLUSpeed(self): - X = np.random.randn(128, 4096).astype(np.float32) - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.Relu("X", "Y") - net.Relu("X_mkl", "Y_mkl", device_option=mkl_do) - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. 
- np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-10, - rtol=1e-10) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - # The returned runtime is the time of - # [whole_net, cpu_op, mkl_op] - # so we will assume that the MKL one runs faster than the CPU one. - - # Note(Yangqing): in fact, it seems that in optimized mode, this is - # not always guaranteed - MKL runs slower than the Eigen vectorized - # version, so I am turning this assertion off. - #self.assertTrue(runtime[1] >= runtime[2]) - - print("Relu CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - - def testConvSpeed(self): - # We randomly select a shape to test the speed. Intentionally we - # test a batch size of 1 since this may be the most frequent use - # case for MKL during deployment time. - X = np.random.rand(1, 256, 27, 27).astype(np.float32) - 0.5 - W = np.random.rand(192, 256, 3, 3).astype(np.float32) - 0.5 - b = np.random.rand(192).astype(np.float32) - 0.5 - mkl_do = core.DeviceOption(caffe2_pb2.MKLDNN) - # Makes sure that feed works. - workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.FeedBlob("X_mkl", X, device_option=mkl_do) - workspace.FeedBlob("W_mkl", W, device_option=mkl_do) - workspace.FeedBlob("b_mkl", b, device_option=mkl_do) - net = core.Net("test") - # Makes sure that we can run relu. - net.Conv(["X", "W", "b"], "Y", pad=1, stride=1, kernel=3) - net.Conv(["X_mkl", "W_mkl", "b_mkl"], "Y_mkl", - pad=1, stride=1, kernel=3, device_option=mkl_do) - workspace.CreateNet(net) - workspace.RunNet(net) - # makes sure that the results are good. - np.testing.assert_allclose( - workspace.FetchBlob("Y"), - workspace.FetchBlob("Y_mkl"), - atol=1e-2, - rtol=1e-2) - runtime = workspace.BenchmarkNet(net.Proto().name, 1, 100, True) - - print("Conv CPU runtime {}, MKL runtime {}.".format(runtime[1], runtime[2])) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/mkl/mkl_squeeze_op_test.py b/caffe2/python/mkl/mkl_squeeze_op_test.py deleted file mode 100644 index 8af090f60d88..000000000000 --- a/caffe2/python/mkl/mkl_squeeze_op_test.py +++ /dev/null @@ -1,37 +0,0 @@ - - - - - -import unittest -import hypothesis.strategies as st -from hypothesis import given -import numpy as np -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu - - -@unittest.skipIf( - not workspace.C.has_mkldnn, "Skipping as we do not have mkldnn." 
-) -class MKLSqueezeTest(hu.HypothesisTestCase): - @given( - squeeze_dims=st.lists(st.integers(0, 3), min_size=1, max_size=3), - inplace=st.booleans(), - **mu.gcs - ) - def test_mkl_squeeze(self, squeeze_dims, inplace, gc, dc): - shape = [ - 1 if dim in squeeze_dims else np.random.randint(1, 5) - for dim in range(4) - ] - X = np.random.rand(*shape).astype(np.float32) - op = core.CreateOperator( - "Squeeze", "X", "X" if inplace else "Y", dims=squeeze_dims - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/mkl/rewrite_graph.py b/caffe2/python/mkl/rewrite_graph.py deleted file mode 100644 index b52501584064..000000000000 --- a/caffe2/python/mkl/rewrite_graph.py +++ /dev/null @@ -1,215 +0,0 @@ - - - - - -import copy -from caffe2.proto import caffe2_pb2 -from caffe2.python import core - - -def rewrite_init_net_simple(net): - for op in net.op: - op.device_option.device_type = caffe2_pb2.IDEEP - -def last_producer(ops, blob): - for (i, op) in reversed(list(enumerate(ops))): - if blob in op.output: - return i - raise ValueError("Failed to find last producer of blob, %s", blob) - - -def fix_BoxWithNMSLimit(net): - outputs = set() - for op in net.op: - if op.type == 'BoxWithNMSLimit': - outputs.add(op.output[0]) - outputs.add(op.output[1]) - outputs.add(op.output[2]) - for op in net.op: - if op.type == 'CopyIDEEPToCPU': - if op.input[0] in outputs: - print("Chaning CopyIDEEPToCPU to Copy for {}".format(op.input[0])) - op.type = 'Copy' - op.device_option.device_type = caffe2_pb2.CPU - - -def rewrite_run_net_simple(net): - # Simple rewrite for now - assume entire graph can be executed - # with MKL, so just insert copy ops for external_input[0] and - # external_output[0] - def mkl_tmp(name): - return "{}__MKL__".format(name) - - input_blob = net.external_input[0] - if input_blob != net.op[0].input[0]: - raise Exception( - "Input blob: {} is not consumed by first op: {}".format( - input_blob, net.op[0])) - # Modify input/outputs to point to copied MKL blobs. - from_cpu = "CopyCPUToIDEEP" - to_cpu = "CopyIDEEPToCPU" - copy_input_op = core.CreateOperator( - from_cpu, input_blob, mkl_tmp(input_blob)) - net.op[0].input[0] = mkl_tmp(input_blob) - - copy_output_ops = [ - core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) - for output_blob in net.external_output] - - for output_blob in net.external_output: - last_producer_idx = last_producer(net.op, output_blob) - renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) - for blob in net.op[last_producer_idx].output] - net.op[last_producer_idx].output[:] = renamed_outputs - # Rename any subsequent consumers of an output blob. 
- for op in net.op[last_producer_idx + 1:]: - renamed_input = [blob if blob != output_blob else mkl_tmp(blob) - for blob in op.input] - op.input[:] = renamed_input - - ops = [copy_input_op] + net.op[:] + copy_output_ops - del net.op[:] - net.op.extend(ops) - device = caffe2_pb2.IDEEP - for op in net.op: - op.device_option.MergeFrom( - core.DeviceOption(device_type=device)) - op.engine = "" - - # Temporarily disable conv+relu fusion until we verify further - # net.ParseFromString( - # C.transform_optimizeForMKLDNN(net.SerializeToString())) - fix_BoxWithNMSLimit(net) - - -def rewrite_run_net_simple_xrayocr_lstm(net): - # For xrayocr model with lstm, only rewrite the non-lstm part of the net to - # enable mkl, then copy the temporary output blob at the break point - # and all external inputs for lstm part to cpu, and execuate rest of the net - # (two lstm) on cpu - # This only works for the xrayocr lstm model which uses the first 'Shape' op - # to decide the break point, and after two lstm it's external_output - # directly so there's no need to copy back to ideep/mkl - - def mkl_tmp(name): - return "{}__MKL__".format(name) - - def cpu_tmp(name): - return "{}__CPU__".format(name) - - input_blob = net.external_input[0] - if input_blob != net.op[0].input[0]: - raise Exception( - "Input blob: {} is not consumed by first op: {}".format( - input_blob, net.op[0])) - # Modify input/outputs to point to copied MKL blobs. - from_cpu = "CopyCPUToIDEEP" - to_cpu = "CopyIDEEPToCPU" - copy_input_op = core.CreateOperator( - from_cpu, input_blob, mkl_tmp(input_blob)) - net.op[0].input[0] = mkl_tmp(input_blob) - - # the net may contain some external_inputs falsely added during ONNX->Caffe2 - # This should be taken care of in early steps during pytorch_to_caffe2, - # but if not it can cause issue in follow up steps, so check here to confirm - for input_blob in net.external_input: - for op in net.op: - # look for if the external_input blob is output of any op in the net - assert input_blob not in op.output - - external_output = None - external_inputs_to_cpu = set() - find_first_shape_op = False - cpu_op_start_idx = -1 - for op_idx, op in enumerate(net.op): - # the first Shape op mark the starting point of LSTM chunk of the net - if not find_first_shape_op: - if op.type == 'Shape': - external_output = op.input - find_first_shape_op = True - cpu_op_start_idx = op_idx - else: - # any external input in the LSTM part need to be copied to CPU - for in_blob in op.input: - if in_blob in net.external_input: - external_inputs_to_cpu.add(in_blob) - - # make sure we found the expected break point of the net - assert external_output is not None - - # create op to copy external input blobs used in LSTM part from IDEEP to CPU - copy_extra_input_ops = [] - for in_blob in external_inputs_to_cpu: - copy_extra_input_ops.append(core.CreateOperator(to_cpu, in_blob, - cpu_tmp(in_blob))) - # rename input blobs in LSTM part to use the CPU copy - for op in net.op[cpu_op_start_idx:]: - renamed_input = [blob if blob != in_blob else cpu_tmp(in_blob) - for blob in op.input] - op.input[:] = renamed_input - - copy_output_ops = [ - core.CreateOperator(to_cpu, mkl_tmp(output_blob), output_blob) - for output_blob in external_output] - - for output_blob in external_output: - last_producer_idx = last_producer(net.op, output_blob) - renamed_outputs = [blob if blob != output_blob else mkl_tmp(blob) - for blob in net.op[last_producer_idx].output] - net.op[last_producer_idx].output[:] = renamed_outputs - - # rearrange all ops in correct order - ops = 
[copy_input_op] + net.op[:cpu_op_start_idx] \ - + copy_output_ops + copy_extra_input_ops + net.op[cpu_op_start_idx:] - del net.op[:] - net.op.extend(ops) - - device = caffe2_pb2.IDEEP - for op in net.op: - # the first Shape op mark the starting point of LSTM chunk of the net - if op.type == 'Shape': - # all LSTM ops should run on CPU - device = caffe2_pb2.CPU - op.device_option.MergeFrom( - core.DeviceOption(device_type=device)) - op.engine = "" - - # RecurrentNetwork has a nested step_net that needs special treatment - if op.type == 'RecurrentNetwork': - for arg in op.arg: - if arg.name == 'step_net': - for nested_op in arg.n.op: - # set device to CPU - nested_op.device_option.MergeFrom( - core.DeviceOption(device_type=device)) - nested_op.engine = "" - - # rename inputs in op of nested net - renamed_input = [] - for blob in nested_op.input: - renamed_input.append(blob - if blob not in external_inputs_to_cpu - else cpu_tmp(blob)) - nested_op.input[:] = renamed_input - - # rename external inputs of nested net - new_external_input = [] - for blob in arg.n.external_input: - new_external_input.append(blob - if blob not in external_inputs_to_cpu - else cpu_tmp(blob)) - arg.n.external_input[:] = new_external_input - - # Temporarily disable conv+relu fusion until we verify further - # net.ParseFromString( - # C.transform_optimizeForMKLDNN(net.SerializeToString())) - fix_BoxWithNMSLimit(net) - - -def rewrite_model_helper_simple(model): - model = copy.deepcopy(model) - # All parameter initialization should run on MKL - rewrite_init_net_simple(model.param_init_net.Proto()) - rewrite_run_net_simple(model.net.Proto()) - return model diff --git a/caffe2/python/mkl/rewrite_graph_test.py b/caffe2/python/mkl/rewrite_graph_test.py deleted file mode 100644 index 1ad209cdbdfd..000000000000 --- a/caffe2/python/mkl/rewrite_graph_test.py +++ /dev/null @@ -1,255 +0,0 @@ - - - - - -import unittest -import numpy as np -import copy -from hypothesis import given -import hypothesis.strategies as st - -from caffe2.python.model_helper import ModelHelper -from caffe2.python.models import resnet -from caffe2.python import workspace, brew -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl.rewrite_graph as rewrite_graph - - -def deterministic_io(model): - model = copy.deepcopy(model) - for i, op in enumerate(model.InitProto().op): - op.device_option.random_seed = i + 1 - if not model.Proto().external_output: - model.Proto().external_output.extend([model.Proto().op[-1].output[0]]) - return model - -def simple_fc(): - model = ModelHelper(name="r") - brew.fc(model, "data", "fc", 10, 10) - return model, [(1, 10)] - -def double_matmul(): - model = ModelHelper(name="r") - fc0 = brew.fc(model, "data", "fc0", 10, 10) - fc1 = brew.fc(model, fc0, "fc1", 10, 10) - model.Proto().external_output[:] = [str(fc0), str(fc1)] - return model, [(1, 10)] - -def simple_relu(): - model = ModelHelper(name="r") - brew.relu(model, "data", "fc") - return model, [(1, 10)] - - -def simple_mlp(): - model = ModelHelper(name="r") - brew.relu( - model, - brew.fc( - model, - brew.relu( - model, - brew.fc( - model, - "data", - "fc1", - 10, - 10), - "rl1"), - "fc2", - 10, - 10), - "rl2") - return model, [(1, 10)] - - -def simple_cnn(): - model = ModelHelper(name="r", arg_scope={"order": "NCHW", "is_test": True}) - brew.conv( - model, "data", 'conv1', 3, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3 - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - return model, [(1, 3, 32, 32)] - - 
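Each of these builders feeds the same test pattern, exercised by MKLRewriteTest below: build a CPU model, pin its init-net seeds, deep-copy and rewrite it for IDEEP/MKL-DNN via rewrite_model_helper_simple from rewrite_graph.py above, then run both nets on one input and compare outputs. A minimal sketch of that flow, assuming an MKL-DNN-enabled Caffe2 build; the names "demo", "data", and "fc" are illustrative, not taken from the test file:

import numpy as np
from caffe2.python import brew, workspace
from caffe2.python.model_helper import ModelHelper
import caffe2.python.mkl.rewrite_graph as rewrite_graph

# Requires a build where workspace.C.use_mkldnn is True.
# Build a one-layer CPU model, mirroring simple_fc() above.
model = ModelHelper(name="demo")
fc = brew.fc(model, "data", "fc", dim_in=10, dim_out=10)
model.net.AddExternalOutput(fc)

# Pin per-op seeds (as deterministic_io does above) so both init nets
# draw identical random weights; without this the comparison fails.
for i, op in enumerate(model.param_init_net.Proto().op):
    op.device_option.random_seed = i + 1

# Deep-copy the model and retarget its nets at IDEEP/MKL-DNN.
mkl_model = rewrite_graph.rewrite_model_helper_simple(model)

X = np.random.randn(1, 10).astype(np.float32)
results = []
for m in (model, mkl_model):
    workspace.ResetWorkspace()
    workspace.RunNetOnce(m.param_init_net)
    # The rewritten net consumes "data" via an inserted CopyCPUToIDEEP op.
    workspace.FeedBlob("data", X)
    workspace.RunNetOnce(m.net)
    results.append(workspace.FetchBlob("fc"))

np.testing.assert_allclose(results[0], results[1], atol=1e-4, rtol=1e-4)

Because the rewrite only retargets device options and splices CopyCPUToIDEEP/CopyIDEEPToCPU ops at the net boundary, a plain output comparison is a sufficient correctness check.
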
-def alexnet(): - model = ModelHelper(name="r", arg_scope={"order": "NCHW", "is_test": True}) - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 64, - 11, ('XavierFill', {}), ('ConstantFill', {}), - stride=4, - pad=2 - ) - relu1 = brew.relu(model, conv1, "conv1") - pool1 = brew.max_pool(model, relu1, "pool1", kernel=3, stride=2, pad=0, - legacy_pad=3) - lrn1 = brew.lrn( - model, pool1, "pool1_lrn", size=5, alpha=1.0e-4, beta=0.75, bias=1.0) - conv2 = brew.conv( - model, - lrn1, - "conv2", - 64, - 192, - 5, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=2 - ) - relu2 = brew.relu(model, conv2, "conv2") - pool2 = brew.max_pool(model, relu2, "pool2", kernel=3, stride=2) - lrn2 = brew.lrn( - model, pool2, "pool2_lrn", size=5, alpha=1.0e-4, beta=0.75, bias=1.0) - conv3 = brew.conv( - model, - lrn2, - "conv3", - 192, - 384, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu3 = brew.relu(model, conv3, "conv3") - conv4 = brew.conv( - model, - relu3, - "conv4", - 384, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu4 = brew.relu(model, conv4, "conv4") - conv5 = brew.conv( - model, - relu4, - "conv5", - 256, - 256, - 3, - ('XavierFill', {}), - ('ConstantFill', {}), - pad=1 - ) - relu5 = brew.relu(model, conv5, "conv5") - pool5 = brew.max_pool(model, relu5, "pool5", kernel=3, stride=2) - fc6 = brew.fc( - model, - pool5, "fc6", 256 * 6 * 6, 4096, ('XavierFill', {}), - ('ConstantFill', {}) - ) - relu6 = brew.relu(model, fc6, "fc6") - fc7 = brew.fc( - model, relu6, "fc7", 4096, 4096, ('XavierFill', {}), ('ConstantFill', {}) - ) - relu7 = brew.relu(model, fc7, "fc7") - drop7 = brew.dropout(model, relu7, "fc7_dropout", is_test=1, ratio=0.5) - fc8 = brew.fc( - model, drop7, "fc8", 4096, 1000, ('XavierFill', {}), ('ConstantFill', {}) - ) - relu8 = brew.relu(model, fc8, "fc8") - brew.dropout(model, relu8, "fc8_dropout", is_test=1, ratio=0.5) - return model, [(1, 3, 224, 224)] - - -def simple_resnet(): - model = ModelHelper(name="r", arg_scope={"order": "NCHW", "is_test": True}) - resnet.create_resnet_32x32( - model, "data", num_input_channels=1, num_groups=1, num_labels=5, - is_test=True) - return model, [(1, 1, 32, 32)] - - -def complex_resnet(): - model = ModelHelper(name="r", arg_scope={"order": "NCHW", "is_test": True}) - resnet.create_resnet50( - model, "data", num_input_channels=1, num_labels=5, is_test=True, - no_loss=True) - return model, [(1, 1, 224, 224)] - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class MKLRewriteTest(hu.HypothesisTestCase): - @given(gen=st.sampled_from([simple_relu, simple_fc, - simple_mlp, simple_cnn])) - def test_mkl_simple_rewrite(self, gen): - cpu_model, (shape,) = gen() - cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) - X = np.random.randn(*shape).astype(np.float32) - - def run(model): - self.ws.run(model.InitProto()) - self.ws.create_blob(model.Proto().external_input[0]).feed(X) - self.ws.run(model.Proto()) - return self.ws.blobs[model.Proto().external_output[0]].fetch() - - np.testing.assert_allclose(run(cpu_model), run(mkl_model), - atol=1e-4, rtol=1e-4) - - def test_mkl_resnet_rewrite(self): - cpu_model, (shape,) = complex_resnet() - cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) - np.random.seed(1701) - X = np.random.randn(*shape).astype(np.float32) - - def run(model): - self.ws.run(model.InitProto()) - self.ws.create_blob(model.Proto().external_input[0]).feed(X) - 
self.ws.run(model.Proto()) - return self.ws.blobs[model.Proto().external_output[0]].fetch() - np.testing.assert_allclose(run(cpu_model), run(mkl_model), - atol=1e-4, rtol=1e-4) - - def test_mkl_multi_output_rewrite(self): - cpu_model, shapes = double_matmul() - cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) - np.random.seed(1701) - Xs = [np.random.randn(*shape).astype(np.float32) for shape in shapes] - - def run(model): - self.ws.run(model.InitProto()) - for (name, X) in zip(model.Proto().external_input, Xs): - self.ws.create_blob(name).feed(X) - print(model.Proto()) - self.ws.run(model.Proto()) - return [self.ws.blobs[name].fetch() - for name in model.Proto().external_output] - - run(mkl_model) - - np.testing.assert_allclose(run(cpu_model), run(mkl_model), - atol=1e-4, rtol=1e-4) - - def test_mkl_alexnet_rewrite(self): - cpu_model, (shape,) = alexnet() - cpu_model = deterministic_io(cpu_model) - mkl_model = rewrite_graph.rewrite_model_helper_simple(cpu_model) - np.random.seed(1701) - X = np.random.randn(*shape).astype(np.float32) - - def run(model): - self.ws.run(model.InitProto()) - self.ws.create_blob(model.Proto().external_input[0]).feed(X) - self.ws.run(model.Proto()) - return self.ws.blobs[model.Proto().external_output[0]].fetch() - np.testing.assert_allclose(run(cpu_model), run(mkl_model), - atol=1e-4, rtol=1e-4) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/mkl_test_util.py b/caffe2/python/mkl_test_util.py deleted file mode 100644 index 88fb3cc800ec..000000000000 --- a/caffe2/python/mkl_test_util.py +++ /dev/null @@ -1,44 +0,0 @@ -## @package mkl_test_util -# Module caffe2.python.mkl_test_util -""" -The MKL test utils are a small addition on top of the hypothesis test utils -under caffe2/python, which allow one to more easily test MKL-related -operators. -""" - - - - - - -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace -from caffe2.python import hypothesis_test_util as hu - -cpu_do = hu.cpu_do -gpu_do = hu.gpu_do -mkl_do = caffe2_pb2.DeviceOption(device_type=caffe2_pb2.MKLDNN) -device_options = hu.device_options + ( - [mkl_do] if workspace.C.has_mkldnn else []) - - -def device_checker_device_options(): - return st.just(device_options) - - -def gradient_checker_device_option(): - return st.sampled_from(device_options) - - -gcs = dict( - gc=gradient_checker_device_option(), - dc=device_checker_device_options() -) - -gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) -gcs_mkl_only = dict(gc=st.sampled_from([mkl_do]), dc=st.just([mkl_do])) - -gcs_cpu_mkl = dict(gc=st.sampled_from([cpu_do, mkl_do]), dc=st.just([cpu_do, mkl_do])) diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py deleted file mode 100644 index 33f594dd302f..000000000000 --- a/caffe2/python/model_device_test.py +++ /dev/null @@ -1,152 +0,0 @@ -import numpy as np -import unittest - -from caffe2.proto import caffe2_pb2 -from caffe2.python import ( - workspace, - device_checker, - test_util, - model_helper, - brew, -) - - -class TestMiniAlexNet(test_util.TestCase): - - def _MiniAlexNetNoDropout(self, order): - # First, AlexNet using the cnn wrapper.
- model = model_helper.ModelHelper(name="alexnet") - conv1 = brew.conv( - model, - "data", - "conv1", - 3, - 16, - 11, - ("XavierFill", {}), - ("ConstantFill", {}), - stride=4, - pad=0 - ) - relu1 = brew.relu(model, conv1, "relu1") - norm1 = brew.lrn(model, relu1, "norm1", size=5, alpha=0.0001, beta=0.75) - pool1 = brew.max_pool(model, norm1, "pool1", kernel=3, stride=2) - conv2 = brew.group_conv( - model, - pool1, - "conv2", - 16, - 32, - 5, - ("XavierFill", {}), - ("ConstantFill", {"value": 0.1}), - group=2, - stride=1, - pad=2 - ) - relu2 = brew.relu(model, conv2, "relu2") - norm2 = brew.lrn(model, relu2, "norm2", size=5, alpha=0.0001, beta=0.75) - pool2 = brew.max_pool(model, norm2, "pool2", kernel=3, stride=2) - conv3 = brew.conv( - model, - pool2, - "conv3", - 32, - 64, - 3, - ("XavierFill", {'std': 0.01}), - ("ConstantFill", {}), - pad=1 - ) - relu3 = brew.relu(model, conv3, "relu3") - conv4 = brew.group_conv( - model, - relu3, - "conv4", - 64, - 64, - 3, - ("XavierFill", {}), - ("ConstantFill", {"value": 0.1}), - group=2, - pad=1 - ) - relu4 = brew.relu(model, conv4, "relu4") - conv5 = brew.group_conv( - model, - relu4, - "conv5", - 64, - 32, - 3, - ("XavierFill", {}), - ("ConstantFill", {"value": 0.1}), - group=2, - pad=1 - ) - relu5 = brew.relu(model, conv5, "relu5") - pool5 = brew.max_pool(model, relu5, "pool5", kernel=3, stride=2) - fc6 = brew.fc( - model, pool5, "fc6", 1152, 1024, ("XavierFill", {}), - ("ConstantFill", {"value": 0.1}) - ) - relu6 = brew.relu(model, fc6, "relu6") - fc7 = brew.fc( - model, relu6, "fc7", 1024, 1024, ("XavierFill", {}), - ("ConstantFill", {"value": 0.1}) - ) - relu7 = brew.relu(model, fc7, "relu7") - fc8 = brew.fc( - model, relu7, "fc8", 1024, 5, ("XavierFill", {}), - ("ConstantFill", {"value": 0.0}) - ) - pred = brew.softmax(model, fc8, "pred") - xent = model.LabelCrossEntropy([pred, "label"], "xent") - loss = model.AveragedLoss([xent], ["loss"]) - model.AddGradientOperators([loss]) - return model - - def _testMiniAlexNet(self, order): - # First, we get all the random initialization of parameters. - model = self._MiniAlexNetNoDropout(order) - workspace.ResetWorkspace() - workspace.RunNetOnce(model.param_init_net) - inputs = dict( - [(str(name), workspace.FetchBlob(str(name))) for name in - model.params] - ) - if order == "NCHW": - inputs["data"] = np.random.rand(4, 3, 227, 227).astype(np.float32) - else: - inputs["data"] = np.random.rand(4, 227, 227, 3).astype(np.float32) - inputs["label"] = np.array([1, 2, 3, 4]).astype(np.int32) - - cpu_device = caffe2_pb2.DeviceOption() - cpu_device.device_type = caffe2_pb2.CPU - gpu_device = caffe2_pb2.DeviceOption() - gpu_device.device_type = workspace.GpuDeviceType - - checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device]) - ret = checker.CheckNet( - model.net.Proto(), - inputs, - # The indices sometimes may be sensitive to small numerical - # differences in the input, so we ignore checking them. - ignore=['_pool1_idx', '_pool2_idx', '_pool5_idx'] - ) - self.assertEqual(ret, True) - - @unittest.skipIf(not workspace.has_gpu_support, - "No GPU support. Skipping test.") - def testMiniAlexNetNCHW(self): - self._testMiniAlexNet("NCHW") - - # No Group convolution support for NHWC right now - #@unittest.skipIf(not workspace.has_gpu_support, - # "No GPU support. 
Skipping test.") - #def testMiniAlexNetNHWC(self): - # self._testMiniAlexNet("NHWC") - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py deleted file mode 100644 index 34466620cb27..000000000000 --- a/caffe2/python/model_helper.py +++ /dev/null @@ -1,646 +0,0 @@ -## @package model_helper -# Module caffe2.python.model_helper - - - - - -from caffe2.python import core, scope, workspace -from caffe2.python.helpers.db_input import db_input -from caffe2.python.modeling import parameter_info -from caffe2.python.modeling.parameter_sharing import ( - parameter_sharing_context, -) -from caffe2.python.optimizer_context import ( - OptimizerContext, - DEFAULT_OPTIM, -) -from caffe2.python.regularizer_context import RegularizerContext - -from itertools import chain - -import logging - - -# _known_working_ops are operators that do not need special care. -_known_working_ops = [ - "Accuracy", - "Adam", - "Add", - "Adagrad", - "SparseAdagrad", - "Adadelta", - "SparseAdadelta", - "AveragedLoss", - "Cast", - "Checkpoint", - "ConstantFill", - "Copy", - "CopyGPUToCPU", - "CopyCPUToGPU", - "DequeueBlobs", - "EnsureCPUOutput", - "ExpandDims", - "Flatten", - "FlattenToVec", - "LabelCrossEntropy", - "LearningRate", - "MakeTwoClass", - "MatMul", - "NCCLAllreduce", - "NHWC2NCHW", - "PackSegments", - "Print", - "PRelu", - "ReduceFrontSum", - "Scale", - "ScatterWeightedSum", - "Sigmoid", - "SortedSegmentSum", - "Snapshot", # Note: snapshot is deprecated, use Checkpoint - "Softmax", - "SoftmaxWithLoss", - "SquaredL2Distance", - "Squeeze", - "StopGradient", - "Summarize", - "Tanh", - "Transpose", - "UnpackSegments", - "WeightedSum", - "YellowFin" -] - - -class ModelHelper: - """A helper model so we can manange models more easily. It contains net def - and parameter storages. You can add an Operator yourself, e.g. - - model = model_helper.ModelHelper(name="train_net") - # init your weight and bias as w and b - w = model.param_init_net.XavierFill(...) - b = model.param_init_net.ConstantFill(...) - fc1 = model.FC([input, w, b], output, **kwargs) - - or you can use helper functions in brew module without manually - defining parameter initializations and operators. - - model = model_helper.ModelHelper(name="train_net") - fc1 = brew.fc(model, input, output, dim_in, dim_out, **kwargs) - - """ - - def __init__(self, name=None, init_params=True, allow_not_known_ops=True, - skip_sparse_optim=False, param_model=None, arg_scope=None): - self.name = name or "model" - self.net = core.Net(self.name) - - if param_model is not None: - self.param_init_net = param_model.param_init_net - self.param_to_grad = param_model.param_to_grad - self.params = param_model.params - self._parameters_info = param_model._parameters_info - self._computed_params = param_model._computed_params - else: - self.param_init_net = core.Net(self.name + '_init') - self.param_to_grad = {} - self.params = [] - self._parameters_info = {} - self._computed_params = [] - - self._param_info_deprecated = [] - self._devices = [] - self.gradient_ops_added = False - self.init_params = init_params - self.allow_not_known_ops = allow_not_known_ops - self.skip_sparse_optim = skip_sparse_optim - self.weights = [] - self.biases = [] - self._arg_scope = { - 'order': "NCHW", - 'use_cudnn': True, - 'cudnn_exhaustive_search': False, - } - if arg_scope is not None: - # Please notice value as None is not acceptable. We are not checking it - # here because we already have check in MakeArgument. 
- self._arg_scope.update(arg_scope) - - @property - def arg_scope(self): - return self._arg_scope - - def get_name(self): - return self.name - - def _infer_param_shape(self, param): - for op in self.param_init_net.Proto().op: - if str(param) in op.output: - for arg in op.arg: - if arg.name == "shape": - return list(arg.ints) - return None - - def _update_param_info_deprecated(self): - assert len(self._param_info_deprecated) <= len(self.params) - for param in self.params[len(self._param_info_deprecated):]: - if not isinstance(param, core.BlobReference): - raise ValueError( - "Param %s must be a BlobReference!" % str(param)) - self._param_info_deprecated.append(parameter_info.ParameterInfo( - param_id=len(self._param_info_deprecated), - param=param, - shape=self._infer_param_shape(param))) - for info in self._param_info_deprecated: - info.grad = self.param_to_grad.get(info.name) - - def _normalize_tags(self, tags): - tags = tags or [] - return set(tags) if isinstance(tags, list) else set([tags]) - - def create_param(self, param_name, shape, initializer, tags=None): - """ - Creates a parameter with a given name and initializer. - - If param_name is an instance of BlobReference - then this blob will be used - to store the parameter (no sharing logic will affect its location). - - If param_name is an instance of a string type, then the final blob will - be created in the CurrentNameScope with respect to all parameter - sharing logic, i.e. 'resolved_name_scope/param_name'. - - Parameter sharing logic overrides CurrentNameScope according - to the rules specified through ParameterSharing contexts; - all ParameterSharing contexts are applied recursively until there are no - extra overrides present, and on each step the best match is - applied first. - - The following examples should clarify the way ParameterSharing logic - works: - - As an example, if this function is called with parameter 'w': - a. Call from some scope 'global_scope' with no parameter sharing: - 'global_scope/w' - b. Call from scope 'scope_b', with override {'scope_b': 'scope_a'}: - 'scope_a/w' - c. Call from scope 'scope_a', with override {'scope_a': ''}: - 'w' - d. Call from scope 'scope_b/shared', with overrides - {'scope_b/shared': 'scope_b', 'scope_b': 'scope_a'}: - 'scope_a/w' - e. Call from scope 'scope_b/unshared', with overrides - {'scope_b/shared': 'scope_b', 'scope_b': 'scope_a'}: - 'scope_a/unshared/w' - """ - # ParameterSharing works only for the case when param_name is an instance of - # a string type. If param_name is a BlobReference - no attempt at - # ParameterSharing will be applied. - if isinstance(param_name, core.BlobReference): - param_name = str(param_name) - elif isinstance(param_name, str): - # The parameter name will be equal to the current NameScope, - # resolved with respect to the parameter sharing of the scopes.
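The scope-resolution scenarios (a)-(e) above are easiest to see in a concrete call. Below is a minimal sketch of scenario (b), assuming the `ParameterSharing` context manager from `caffe2.python.modeling.parameter_sharing` and an `Initializer("XavierFill")` of the kind brew wires up internally; the model name is ours:

```python
from caffe2.python import model_helper, scope
from caffe2.python.modeling.initializers import Initializer
from caffe2.python.modeling.parameter_sharing import ParameterSharing

model = model_helper.ModelHelper(name="sharing_demo")
with scope.NameScope('scope_b'):
    with ParameterSharing({'scope_b': 'scope_a'}):
        # Scenario (b): the override redirects 'scope_b' to 'scope_a', so
        # create_param resolves 'w' to 'scope_a/w' rather than 'scope_b/w'.
        w = model.create_param('w', shape=[10, 10],
                               initializer=Initializer("XavierFill"))
print(w)  # expected: scope_a/w
```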
- param_name = parameter_sharing_context.get_parameter_name( - param_name) - else: - raise TypeError("Unsupported type for param_name") - - if param_name in self._parameters_info: - assert self._parameters_info[param_name].shape == shape - return self._parameters_info[param_name].blob - - param_info = initializer.create_param( - param_name=core.BlobReference(param_name), - init_net=self.param_init_net, - shape=shape, - ) - optim_context = OptimizerContext.current() - for tag in self._normalize_tags(tags): - if optim_context.has_optimizer(tag): - # param_info will check optimizer has not been set - param_info.optimizer = optim_context.get_optimizer(tag) - if not param_info.optimizer and optim_context.has_optimizer(DEFAULT_OPTIM): - param_info.optimizer = optim_context.get_optimizer(DEFAULT_OPTIM) - - reg_context = RegularizerContext.current() - param_info.regularizer = reg_context - - self._parameters_info[param_name] = param_info - # Add param to legacy structs as well, so all other functions for - # parameters are still working. - self.AddParameter(param_info.blob, tags) - return param_info.blob - - def get_param_info(self, param): - assert isinstance(param, core.BlobReference), \ - "Param {} is not a BlobReference".format(param) - return self._parameters_info.get(param, None) - - # This method is deprecated, use create_param method which - # also does parameter initialization when needed - def add_param_DEPRECATED(self, param, key=None, shape=None, length=None): - logging.warning("add_param method is DEPRECATED") - self._update_param_info_deprecated() - self.AddParameter(param) - if key is not None and self.net.input_record() is not None: - idx = self.net.input_record().field_blobs().index(key) - key = self.net.input_record().field_names()[idx] - shape = shape if shape is not None else self._infer_param_shape(param) - if not isinstance(param, core.BlobReference): - raise ValueError("Param %s must be a BlobReference!" 
% str(param)) - self._param_info_deprecated.append(parameter_info.ParameterInfo( - param_id=len(self._param_info_deprecated), - param=param, - shape=shape, - key=key, - length=length, - )) - return self._param_info_deprecated[-1] - - def AddParameter(self, param, tags=None): - assert isinstance(param, core.BlobReference) - tags = self._normalize_tags(tags) - if parameter_info.ParameterTags.COMPUTED_PARAM in tags: - self._computed_params.append(param) - else: - self.params.append(param) - - if parameter_info.ParameterTags.WEIGHT in tags: - self.weights.append(param) - if parameter_info.ParameterTags.BIAS in tags: - self.biases.append(param) - - @staticmethod - def _NormalizeNamescope(namescope): - if namescope is None: - return scope.CurrentNameScope() - elif namescope == '' or namescope.endswith(scope._NAMESCOPE_SEPARATOR): - return namescope - else: - return namescope + scope._NAMESCOPE_SEPARATOR - - def GetParams(self, namescope=None, top_scope=False): - ''' - Returns the params in current namescope - ''' - namescope = ModelHelper._NormalizeNamescope(namescope) - - if namescope == '': - return self.params[:] - else: - return [p for p in self.params if - p.GetNameScope().startswith(namescope)] - - def Proto(self): - return self.net.Proto() - - def InitProto(self): - return self.param_init_net.Proto() - - def RunAllOnGPU(self, *args, **kwargs): - self.param_init_net.RunAllOnGPU(*args, **kwargs) - self.net.RunAllOnGPU(*args, **kwargs) - - def CreateDB(self, blob_out, db, db_type, **kwargs): - dbreader = self.param_init_net.CreateDB( - [], blob_out, db=db, db_type=db_type, **kwargs) - return dbreader - - def AddGradientOperators(self, *args, **kwargs): - if self.gradient_ops_added: - raise RuntimeError("You cannot run AddGradientOperators twice.") - self.Validate() - - self.gradient_ops_added = True - self.grad_map = self.net.AddGradientOperators(*args, **kwargs) - self.param_to_grad = self.get_param_to_grad(self.params) - - # Populate ParameterInfo for all parameters if missing - # and add gradient blob information. So optimizers can use it - for param, grad in self.param_to_grad.items(): - param_info = self.get_param_info(param) - if param_info: - param_info.grad = grad - else: - self._parameters_info[param] = parameter_info.ParameterInfo( - param_id=None, - param=param, - grad=grad, - ) - - return self.grad_map - - def get_param_to_grad(self, params): - ''' - Given a list of parameters returns a dict from a parameter - to a corresponding gradient - ''' - - param_to_grad = {} - if not self.gradient_ops_added: - raise RuntimeError("You need to run AddGradientOperators first.") - # We need to use empty namescope when creating the gradients - # to prevent duplicating the namescope prefix for gradient blobs. - for p in params: - if str(p) in self.grad_map: - param_to_grad[p] = self.grad_map[str(p)] - return param_to_grad - - def GetOptimizationParamInfo(self, params=None): - ''' - Returns a map for param => grad. - If params is not specified, all parameters will be considered. 
- ''' - if not self.gradient_ops_added: - raise RuntimeError("Need to call AddGradientOperators first") - - param_to_grad = self.param_to_grad - if params: - param_to_grad = self.get_param_to_grad(params) - - return [ - self.get_param_info(param) for param, grad in param_to_grad.items() - if ( - not self.skip_sparse_optim or - not isinstance(grad, core.GradientSlice) - ) - ] - - def _Validate(self): - ''' - Check for duplicate params - ''' - params_list = [str(p) for p in self.params] - params_set = set(params_list) - - dupes = [] - if len(params_set) != len(params_list): - params_list = sorted(params_list) - for j, p in enumerate(params_list): - if j > 0 and params_list[j - 1] == p: - if p not in dupes: - dupes.append(p) - - return dupes - - def Validate(self): - dupes = self._Validate() - assert dupes == [], "Duplicate params: {}".format(dupes) - - def GetComputedParams(self, namescope=None): - ''' - Returns the computed params in current namescope. 'Computed params' - are such parameters that are not optimized via gradient descent but are - directly computed from data, such as the running mean and variance - of Spatial Batch Normalization. - ''' - namescope = ModelHelper._NormalizeNamescope(namescope) - - if namescope == '': - return self._computed_params[:] - else: - return [p for p in self._computed_params - if p.GetNameScope().startswith(namescope)] - - def GetAllParams(self, namescope=None): - return self.GetParams(namescope) + self.GetComputedParams(namescope) - - def TensorProtosDBInput( - self, unused_blob_in, blob_out, batch_size, db, db_type, **kwargs - ): - """TensorProtosDBInput.""" - assert len(unused_blob_in) == 0, \ - """You cannot pass reader to model_helper.TensorProtosDBInput. - Use model.net.TensorProtosDBInput instead to create the op.""" - - return db_input( - self, blob_out, batch_size, db, db_type, **kwargs) - - def GetDevices(self): - assert len(self._devices) > 0, \ - "Use data_parallel_model to run model on multiple GPUs." - return self._devices - - def __getattr__(self, op_type): - """Catch-all for all other operators, mostly those without params.""" - if op_type.startswith('__'): - raise AttributeError(op_type) - - if not core.IsOperator(op_type): - raise AttributeError( - 'Method ' + op_type + ' is not a registered operator.' + - ' Did you mean: [' + - ','.join(workspace.C.nearby_opnames(op_type)) + ']' - ) - if op_type not in _known_working_ops: - if not self.allow_not_known_ops: - raise AttributeError( - "Operator {} is not known to be safe".format(op_type)) - - logging.warning("You are creating an op that the ModelHelper " - "does not recognize: {}.".format(op_type)) - return self.net.__getattr__(op_type) - - def __dir__(self): - return sorted(set(chain( - dir(type(self)), - self.__dict__.keys(), - _known_working_ops - ))) - - def GetCompleteNet(self): - r""" Return param_init_net + net Net. 
- Returns: - 'core.Net' containing param_init_net and net - """ - new_net = self.param_init_net.Clone( - self.name + "_complete_net", keep_schema=True) - # add init net info to debug info - for op in new_net.Proto().op: - op.debug_info = op.debug_info + "/param_init_net" - new_net.AppendNet(self.net) - # keep the execution optimization - if self.net.Proto().HasField("type"): - new_net.Proto().type = self.net.Proto().type - return new_net - - def ConstructInitTrainNetfromNet(self, net): - r""" Construct the init net and train net from a complete_net - Inputs: - net: 'core.Net' containing param_init_net and train net - """ - param_op_mask = [] - train_op_mask = [] - for idx, op in enumerate(net.Proto().op): - if op.debug_info.endswith("/param_init_net"): - param_op_mask.append(idx) - else: - train_op_mask.append(idx) - - self.param_init_net = net.Clone( - net.Name() + "/generated_param_init_net", - keep_schema=True, - op_id_mask=param_op_mask, - update_external_list=True, - ) - self.net = net.Clone( - net.Name() + "/generated_net", - keep_schema=True, - op_id_mask=train_op_mask, - update_external_list=True, - ) - - -def ExtractPredictorNet( - net_proto, - input_blobs, - output_blobs, - device=None, - renames=None, - disabled_inputs=None, -): - ''' - Takes a model net for training and returns a net which can be - used for prediction. In particular, all gradient operators and - input operators are removed. - @param net_proto protobuf of the net you want to process (net.Proto()) - @param input_blobs list/set of blob names that are the inputs of predictor - @param output_blobs list/set of blob names that are outputs of predictor - @param device optional device option that is assigned to the ops - @param renames dictionary of blob name to a new name (optional) - @param disabled_inputs optional set of blobs that are 'switched off'. This - will cause branches with those blobs as inputs to be removed - ''' - predict_net = core.Net(net_proto.name + "_predict") - predict_proto = predict_net.Proto() - - orig_external_inputs = set(net_proto.external_input) - orig_external_outputs = set(net_proto.external_output) - input_blobs = {str(b) for b in input_blobs} - known_blobs = set(orig_external_inputs).union(input_blobs) - output_blobs = {str(b) for b in output_blobs} - external_inputs = set(input_blobs) - external_outputs = set(output_blobs) - - if renames is None: - renames = {} - - if disabled_inputs is not None: - known_blobs = known_blobs - set(disabled_inputs) - - ops = list(net_proto.op) - - # Find the range of ops that we should include - try: - first_op_with_input = min( - [ - j for j in range(len(ops)) - if input_blobs.intersection(ops[j].input) and ops[j].type != - 'StopGradient' - ] - ) - except ValueError as e: - raise Exception("No ops with input={}".format(input_blobs)) from e - try: - last_op_with_output = max( - [ - j for j in range(len(ops)) - if output_blobs.intersection(ops[j].output) - ] - ) - except ValueError as e: - raise Exception("No ops with output={}".format(output_blobs)) from e - - def validate_op(op): - # Check that the op does not have is_test = 0 set. This is a common - # pitfall with the SpatialBN op, at least. - for arg in op.arg: - if arg.name == "is_test" and arg.i == 0: - raise Exception( - "An operator had is_test=0, did you try to extract a " + - "predictor from a train model (instead of test model)?"
+ - " Op was: {}".format(str(op)) - ) - - def rename_list(proto_list): - # proto lists don't support assignments - new_list = proto_list[:] - for j, b in enumerate(new_list): - if b in renames: - new_list[j] = renames[b] - - del proto_list[:] - proto_list.extend(new_list) - - # Iterate through the ops and only include those whose inputs - # we can satisfy. - for op in ops[first_op_with_input:(last_op_with_output + 1)]: - if known_blobs.issuperset(op.input): - - # Special handling for recurrent nets - # TODO: when standard argument type for "nets" is introduced, - # this can be more general - if op.type == 'RecurrentNetwork': - for arg in op.arg: - if arg.name == 'backward_step_net': - arg.ClearField(str('n')) - elif arg.name == 'step_net': - for step_op in arg.n.op: - rename_list(step_op.input) - rename_list(step_op.output) - if device is not None: - step_op.device_option.device_type = device.device_type - step_op.device_option.device_id = device.device_id - - rename_list(arg.n.external_input) - rename_list(arg.n.external_output) - - # Add additional external inputs - external_inputs.update( - set(arg.n.external_input).intersection( - orig_external_inputs - ) - ) - - if device is not None: - op.device_option.device_type = device.device_type - op.device_option.device_id = device.device_id - validate_op(op) - predict_proto.op.extend([op]) - known_blobs.update(op.output) - external_inputs.update( - set(op.input).intersection(orig_external_inputs) - ) - external_outputs.update( - set(op.output).intersection(orig_external_outputs) - ) - - else: - logging.debug( - "Op {} had unknown inputs: {}".format( - op.type, set(op.input).difference(known_blobs) - ) - ) - - # Predictor net's external inputs and outputs include only those - # that are part of this net. 
- predict_proto.external_input.extend(external_inputs) - predict_proto.external_output.extend(external_outputs) - - rename_list(predict_proto.external_input) - rename_list(predict_proto.external_output) - - renamed_input_blobs = [] - for b in input_blobs: - if b in renames: - renamed_input_blobs.append(renames[b]) - else: - renamed_input_blobs.append(b) - - for op in predict_proto.op: - rename_list(op.input) - rename_list(op.output) - - return predict_net, list( - set(predict_proto.external_input) - set(renamed_input_blobs) - ) diff --git a/caffe2/python/model_helper_test.py b/caffe2/python/model_helper_test.py deleted file mode 100644 index 1423e4a97733..000000000000 --- a/caffe2/python/model_helper_test.py +++ /dev/null @@ -1,69 +0,0 @@ -"""unittest for ModelHelper class""" - - - -import unittest - -from caffe2.python import brew, model_helper - - -class ModelHelperTest(unittest.TestCase): - def test_get_complete_net_type(self): - model = model_helper.ModelHelper("test_orig") - brew.conv( - model, - "input", - "conv", - dim_in=3, - dim_out=16, - weight_init=("MSRAFill", {}), - kernel=3, - stride=1, - pad=0, - ) - model.net.Proto().type = "async_scheduling" - net = model.GetCompleteNet() - model2 = model_helper.ModelHelper("test_new") - model2.ConstructInitTrainNetfromNet(net) - self.assertTrue(model2.net.Proto().type, "async_scheduling") - self.assertTrue(model2.param_init_net.Proto().type, "async_scheduling") - - def test_get_complete_net(self): - model = model_helper.ModelHelper("test_orig") - conv = brew.conv( - model, - "input", - "conv", - dim_in=3, - dim_out=16, - weight_init=("MSRAFill", {}), - kernel=3, - stride=1, - pad=0, - ) - conv = brew.spatial_bn(model, conv, "conv_bn", 16, epsilon=1e-3, is_test=False) - conv = brew.relu(model, conv, "conv_relu") - pred = brew.fc(model, conv, "pred", dim_in=16 * 3 * 3, dim_out=10) - brew.softmax(model, pred, "softmax") - net = model.GetCompleteNet() - model2 = model_helper.ModelHelper("test_new") - model2.ConstructInitTrainNetfromNet(net) - - net = model.param_init_net - net2 = model2.param_init_net - for op1, op2 in zip(net.Proto().op, net2.Proto().op): - op1.debug_info = op1.debug_info + "/param_init_net" - self.assertEqual( - op1, op2, "op mismatch between {}\n and {}\n".format(op1, op2) - ) - net = model.net - net2 = model2.net - for op1, op2 in zip(net.Proto().op, net2.Proto().op): - self.assertEqual( - op1, op2, "op mismatch between {}\n and {}\n".format(op1, op2) - ) - # this is not guaranteed in other situations where user define own net - self.assertEqual( - sorted(map(str, net.external_inputs)), - sorted(map(str, net2.external_inputs)), - ) diff --git a/caffe2/python/modeling/__init__.py b/caffe2/python/modeling/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/modeling/compute_histogram_for_blobs.py b/caffe2/python/modeling/compute_histogram_for_blobs.py deleted file mode 100644 index ea83f96f7019..000000000000 --- a/caffe2/python/modeling/compute_histogram_for_blobs.py +++ /dev/null @@ -1,92 +0,0 @@ - - - - - -from caffe2.python import core, schema -from caffe2.python.modeling.net_modifier import NetModifier - -import numpy as np - - -class ComputeHistogramForBlobs(NetModifier): - """ - This class modifies the net passed in by adding ops to compute histogram for - certain blobs. 
- - Args: - blobs: list of blobs to compute histogram for - logging_frequency: frequency for printing histograms to logs - lower_bound: left boundary of histogram values - upper_bound: right boundary of histogram values - num_buckets: number of buckets to use in [lower_bound, upper_bound) - accumulate: boolean to output the accumulated or the per-batch histogram - """ - - def __init__(self, blobs, logging_frequency, num_buckets=30, - lower_bound=0.0, upper_bound=1.0, accumulate=False): - self._blobs = blobs - self._logging_frequency = logging_frequency - self._accumulate = accumulate - if self._accumulate: - self._field_name_suffix = '_acc_normalized_hist' - else: - self._field_name_suffix = '_curr_normalized_hist' - - self._num_buckets = int(num_buckets) - assert self._num_buckets > 0, ( - "num_buckets needs to be greater than 0, got {}".format(num_buckets)) - self._lower_bound = float(lower_bound) - self._upper_bound = float(upper_bound) - - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - for blob_name in self._blobs: - blob = core.BlobReference(blob_name) - assert net.BlobIsDefined(blob), 'blob {} is not defined in net {} whose proto is {}'.format(blob, net.Name(), net.Proto()) - - blob_float = net.Cast(blob, net.NextScopedBlob(prefix=blob + - '_float'), to=core.DataType.FLOAT) - curr_hist, acc_hist = net.AccumulateHistogram( - [blob_float], - [net.NextScopedBlob(prefix=blob + '_curr_hist'), - net.NextScopedBlob(prefix=blob + '_acc_hist')], - num_buckets=self._num_buckets, - lower_bound=self._lower_bound, - upper_bound=self._upper_bound) - - if self._accumulate: - hist = net.Cast( - acc_hist, - net.NextScopedBlob(prefix=blob + '_cast_hist'), - to=core.DataType.FLOAT) - else: - hist = net.Cast( - curr_hist, - net.NextScopedBlob(prefix=blob + '_cast_hist'), - to=core.DataType.FLOAT) - - normalized_hist = net.NormalizeL1( - hist, - net.NextScopedBlob(prefix=blob + self._field_name_suffix) - ) - - if self._logging_frequency >= 1: - net.Print(normalized_hist, [], every_n=self._logging_frequency) - - if modify_output_record: - output_field_name = str(blob) + self._field_name_suffix - output_scalar = schema.Scalar((np.float32, (self._num_buckets + 2,)), - normalized_hist) - - if net.output_record() is None: - net.set_output_record( - schema.Struct((output_field_name, output_scalar)) - ) - else: - net.AppendOutputRecordField( - output_field_name, - output_scalar) - - def field_name_suffix(self): - return self._field_name_suffix diff --git a/caffe2/python/modeling/compute_histogram_for_blobs_test.py b/caffe2/python/modeling/compute_histogram_for_blobs_test.py deleted file mode 100644 index 4ce6bf11487a..000000000000 --- a/caffe2/python/modeling/compute_histogram_for_blobs_test.py +++ /dev/null @@ -1,119 +0,0 @@ - - - - - -import unittest -from caffe2.python import workspace, brew, model_helper -from caffe2.python.modeling.compute_histogram_for_blobs import ( - ComputeHistogramForBlobs -) - -import numpy as np - - -class ComputeHistogramForBlobsTest(unittest.TestCase): - - def histogram(self, X, lower_bound=0.0, upper_bound=1.0, num_buckets=20): - assert X.ndim == 2, ('this test assumes a 2d array, but X.ndim is {0}'.
- format(X.ndim)) - N, M = X.shape - hist = np.zeros((num_buckets + 2, ), dtype=np.int32) - segment = (upper_bound - lower_bound) / num_buckets - Y = np.zeros((N, M), dtype=np.int32) - Y[X < lower_bound] = 0 - Y[X >= upper_bound] = num_buckets + 1 - Y[(X >= lower_bound) & (X < upper_bound)] = \ - ((X[(X >= lower_bound) & (X < upper_bound)] - lower_bound) / - segment + 1).astype(np.int32) - - for i in range(Y.shape[0]): - for j in range(Y.shape[1]): - hist[Y[i][j]] += 1 - - cur_hist = hist.astype(np.float32) / (N * M) - acc_hist = cur_hist - return [cur_hist, acc_hist] - - def test_compute_histogram_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - num_buckets = 20 - lower_bound = 0.2 - upper_bound = 0.8 - accumulate = False - net_modifier = ComputeHistogramForBlobs(blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - num_buckets=num_buckets, - lower_bound=lower_bound, - upper_bound=upper_bound, - accumulate=accumulate) - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_curr_normalized_hist = workspace.FetchBlob('fc1_w_curr_normalized_hist') - cur_hist, acc_hist = self.histogram(fc1_w, - lower_bound=lower_bound, - upper_bound=upper_bound, - num_buckets=num_buckets) - - self.assertEqual(fc1_w_curr_normalized_hist.size, num_buckets + 2) - self.assertAlmostEqual(np.linalg.norm( - fc1_w_curr_normalized_hist - cur_hist), 0.0, delta=1e-5) - self.assertEqual(len(model.net.Proto().op), 12) - - assert model.net.output_record() is None - - def test_compute_histogram_for_blobs_modify_output_record(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - num_buckets = 20 - lower_bound = 0.2 - upper_bound = 0.8 - accumulate = False - net_modifier = ComputeHistogramForBlobs(blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - num_buckets=num_buckets, - lower_bound=lower_bound, - upper_bound=upper_bound, - accumulate=accumulate) - net_modifier(model.net, modify_output_record=True) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_curr_normalized_hist = workspace.FetchBlob('fc1_w_curr_normalized_hist') - cur_hist, acc_hist = self.histogram(fc1_w, - lower_bound=lower_bound, - upper_bound=upper_bound, - num_buckets=num_buckets) - - self.assertEqual(fc1_w_curr_normalized_hist.size, num_buckets + 2) - self.assertAlmostEqual(np.linalg.norm( - fc1_w_curr_normalized_hist - cur_hist), 0.0, delta=1e-5) - self.assertEqual(len(model.net.Proto().op), 12) - - assert 'fc1_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() - assert 'fc2_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() diff --git a/caffe2/python/modeling/compute_norm_for_blobs.py b/caffe2/python/modeling/compute_norm_for_blobs.py deleted file mode 100644 index 
010e01dffcf2..000000000000 --- a/caffe2/python/modeling/compute_norm_for_blobs.py +++ /dev/null @@ -1,96 +0,0 @@ - - - - - -from caffe2.python import core, schema, muji -from caffe2.python.modeling.net_modifier import NetModifier - - -import numpy as np - - -class ComputeNormForBlobs(NetModifier): - """ - This class modifies the net passed in by adding ops to compute norms for - certain blobs. - - Args: - blobs: list of blobs to compute norm for - logging_frequency: frequency for printing norms to logs - p: type of norm. Currently it supports p=1 or p=2 - compute_averaged_norm: norm or averaged_norm (averaged_norm = norm/size) - row_index: to plot the entire blob or simply one row at the row_index - """ - - def __init__(self, blobs, logging_frequency, p=2, compute_averaged_norm=False, row_index=None): - self._blobs = blobs - self._logging_frequency = logging_frequency - self._p = p - self._compute_averaged_norm = compute_averaged_norm - self._field_name_suffix = '_l{}_norm'.format(p) - if compute_averaged_norm: - self._field_name_suffix = '_averaged' + self._field_name_suffix - - if row_index is not None and row_index < 0: - raise Exception('{0} is not a valid row index, row_index should be >= 0'.format( - row_index)) - self.row_index = row_index - - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - - p = self._p - compute_averaged_norm = self._compute_averaged_norm - row_index = self.row_index - - CPU = muji.OnCPU() - # if given, blob_to_device is a map from blob to device_option - blob_to_device = blob_to_device or {} - for blob_name in self._blobs: - blob = core.BlobReference(blob_name) - assert net.BlobIsDefined(blob), 'blob {} is not defined in net {} whose proto is {}'.format(blob, net.Name(), net.Proto()) - if blob in blob_to_device: - device = blob_to_device[blob] - else: - device = CPU - - with core.DeviceScope(device): - if row_index is not None and row_index >= 0: - blob = net.Slice( - [blob], - net.NextScopedBlob(prefix=blob + '_row_{0}'.format(row_index)), - starts=[row_index, 0], - ends=[row_index + 1, -1] - ) - - cast_blob = net.Cast( - blob, - net.NextScopedBlob(prefix=blob + '_float'), - to=core.DataType.FLOAT - ) - - norm_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) - norm = net.LpNorm( - cast_blob, norm_name, p=p, average=compute_averaged_norm - ) - norm_stop_gradient = net.StopGradient(norm, net.NextScopedBlob(norm_name + "_stop_gradient")) - - if self._logging_frequency >= 1: - net.Print(norm, [], every_n=self._logging_frequency) - - if modify_output_record: - output_field_name = str(blob) + self._field_name_suffix - output_scalar = schema.Scalar((np.float64, (1,)), norm) - - if net.output_record() is None: - net.set_output_record( - schema.Struct((output_field_name, output_scalar)) - ) - else: - net.AppendOutputRecordField( - output_field_name, - output_scalar) - - def field_name_suffix(self): - return self._field_name_suffix diff --git a/caffe2/python/modeling/compute_norm_for_blobs_test.py b/caffe2/python/modeling/compute_norm_for_blobs_test.py deleted file mode 100644 index 1bf3dae0353f..000000000000 --- a/caffe2/python/modeling/compute_norm_for_blobs_test.py +++ /dev/null @@ -1,231 +0,0 @@ - - - - - -import unittest -from caffe2.python import workspace, brew, model_helper -from caffe2.python.modeling.compute_norm_for_blobs import ComputeNormForBlobs - -import numpy as np - - -class ComputeNormForBlobsTest(unittest.TestCase): - def test_compute_norm_for_blobs(self): - model =
model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_l2_norm = workspace.FetchBlob('fc1_w_l2_norm') - - self.assertEqual(fc1_w_l2_norm.size, 1) - self.assertAlmostEqual(fc1_w_l2_norm[0], - np.linalg.norm(fc1_w)**2, - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 10) - - assert model.net.output_record() is None - - def test_compute_norm_for_blobs_modify_output_record(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - ) - - net_modifier(model.net, modify_output_record=True) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_l2_norm = workspace.FetchBlob('fc1_w_l2_norm') - - self.assertEqual(fc1_w_l2_norm.size, 1) - self.assertAlmostEqual(fc1_w_l2_norm[0], - np.linalg.norm(fc1_w)**2, - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 10) - assert 'fc1_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() - assert 'fc2_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() - - def test_compute_averaged_norm_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - compute_averaged_norm=True, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_averaged_l2_norm = workspace.FetchBlob('fc1_w_averaged_l2_norm') - - self.assertEqual(fc1_w_averaged_l2_norm.size, 1) - self.assertAlmostEqual(fc1_w_averaged_l2_norm[0], - np.linalg.norm(fc1_w)**2 / fc1_w.size, - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 10) - - def test_compute_norm_for_blobs_no_print(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=-1, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - 
fc1_w_l2_norm = workspace.FetchBlob('fc1_w_l2_norm') - - self.assertEqual(fc1_w_l2_norm.size, 1) - self.assertAlmostEqual(fc1_w_l2_norm[0], - np.linalg.norm(fc1_w)**2, - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 8) - - def test_compute_l1_norm_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - p=1, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_l1_norm = workspace.FetchBlob('fc1_w_l1_norm') - - self.assertEqual(fc1_w_l1_norm.size, 1) - self.assertAlmostEqual(fc1_w_l1_norm[0], - np.sum(np.abs(fc1_w)), - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 10) - - def test_compute_l1_averaged_norm_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - p=1, - compute_averaged_norm=True, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_averaged_l1_norm = workspace.FetchBlob('fc1_w_averaged_l1_norm') - - self.assertEqual(fc1_w_averaged_l1_norm.size, 1) - self.assertAlmostEqual(fc1_w_averaged_l1_norm[0], - np.sum(np.abs(fc1_w)) / fc1_w.size, - delta=1e-5) - - self.assertEqual(len(model.net.Proto().op), 10) - - def test_compute_norm_row_index_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - net_modifier = ComputeNormForBlobs( - blobs=['fc1_w'], - logging_frequency=10, - compute_averaged_norm=True, - row_index=1 - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_row_1_averaged_l2_norm = workspace.FetchBlob('fc1_w_row_1_averaged_l2_norm') - - self.assertEqual(fc1_w_row_1_averaged_l2_norm.size, 1) - self.assertAlmostEqual(fc1_w_row_1_averaged_l2_norm[0], - np.linalg.norm(fc1_w[1])**2 / fc1_w[1].size, - delta=1e-5) diff --git a/caffe2/python/modeling/compute_statistics_for_blobs.py b/caffe2/python/modeling/compute_statistics_for_blobs.py deleted file mode 100644 index 590b050469f0..000000000000 --- a/caffe2/python/modeling/compute_statistics_for_blobs.py +++ /dev/null @@ -1,54 +0,0 @@ - - - - - -from caffe2.python import core, schema -from caffe2.python.modeling.net_modifier import NetModifier - -import numpy as np - - -class ComputeStatisticsForBlobs(NetModifier): - """ - This class modifies the net passed in by adding ops to compute statistics - for certain blobs. For each blob in the list, its min, max, mean and standard - deviation will be computed. 
- - Args: - blobs: list of blobs to compute statistics for - logging_frequency: frequency for printing statistics to logs - """ - - def __init__(self, blobs, logging_frequency): - self._blobs = blobs - self._logging_frequency = logging_frequency - self._field_name_suffix = '_summary' - - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - - for blob_name in self._blobs: - blob = core.BlobReference(blob_name) - assert net.BlobIsDefined(blob), 'blob {} is not defined in net {} whose proto is {}'.format(blob, net.Name(), net.Proto()) - - cast_blob = net.Cast(blob, to=core.DataType.FLOAT) - stats_name = net.NextScopedBlob(prefix=blob + self._field_name_suffix) - stats = net.Summarize(cast_blob, stats_name, to_file=0) - net.Print(stats, [], every_n=self._logging_frequency) - - if modify_output_record: - output_field_name = str(blob) + self._field_name_suffix - output_scalar = schema.Scalar((np.float64, (1,)), stats) - - if net.output_record() is None: - net.set_output_record( - schema.Struct((output_field_name, output_scalar)) - ) - else: - net.AppendOutputRecordField( - output_field_name, - output_scalar) - - def field_name_suffix(self): - return self._field_name_suffix diff --git a/caffe2/python/modeling/compute_statistics_for_blobs_test.py b/caffe2/python/modeling/compute_statistics_for_blobs_test.py deleted file mode 100644 index bf75a1f7d149..000000000000 --- a/caffe2/python/modeling/compute_statistics_for_blobs_test.py +++ /dev/null @@ -1,84 +0,0 @@ - - - - - -import unittest -from caffe2.python import workspace, brew, model_helper -from caffe2.python.modeling.compute_statistics_for_blobs import ( - ComputeStatisticsForBlobs -) - -import numpy as np - - -class ComputeStatisticsForBlobsTest(unittest.TestCase): - def test_compute_statistics_for_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeStatisticsForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - ) - - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_summary = workspace.FetchBlob('fc1_w_summary') - - # std is unbiased here - stats_ref = np.array([fc1_w.flatten().min(), fc1_w.flatten().max(), - fc1_w.flatten().mean(), fc1_w.flatten().std(ddof=1)]) - - self.assertAlmostEqual(np.linalg.norm(stats_ref - fc1_w_summary), 0, - delta=1e-5) - self.assertEqual(fc1_w_summary.size, 4) - - assert model.net.output_record() is None - - def test_compute_statistics_for_blobs_modify_output_record(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - net_modifier = ComputeStatisticsForBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - ) - - net_modifier(model.net, modify_output_record=True) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_summary = workspace.FetchBlob('fc1_w_summary') - - # std is unbiased here - stats_ref =
np.array([fc1_w.flatten().min(), fc1_w.flatten().max(), - fc1_w.flatten().mean(), fc1_w.flatten().std(ddof=1)]) - - self.assertAlmostEqual(np.linalg.norm(stats_ref - fc1_w_summary), 0, - delta=1e-5) - self.assertEqual(fc1_w_summary.size, 4) - - self.assertEqual(len(model.net.Proto().op), 8) - assert 'fc1_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs() - assert 'fc2_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs() diff --git a/caffe2/python/modeling/get_entry_from_blobs.py b/caffe2/python/modeling/get_entry_from_blobs.py deleted file mode 100644 index ff59bc6974dc..000000000000 --- a/caffe2/python/modeling/get_entry_from_blobs.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - - -from caffe2.python import core, schema -from caffe2.python.modeling.net_modifier import NetModifier - -import numpy as np - - -class GetEntryFromBlobs(NetModifier): - """ - This class modifies the net passed in by adding ops to get a certain entry - from certain blobs. - - Args: - blobs: list of blobs to get entry from - logging_frequency: frequency for printing entry values to logs - i1, i2: the first, second dimension of the blob. (currently, we assume - the blobs to be 2-dimensional blobs). 
When i2 = -1, print all entries - in blob[i1] - """ - - def __init__(self, blobs, logging_frequency, i1=0, i2=0): - self._blobs = blobs - self._logging_frequency = logging_frequency - self._i1 = i1 - self._i2 = i2 - self._field_name_suffix = '_{0}_{1}'.format(i1, i2) if i2 >= 0 \ - else '_{0}_all'.format(i1) - - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - - i1, i2 = [self._i1, self._i2] - if i1 < 0: - raise ValueError('index is out of range') - - for blob_name in self._blobs: - blob = core.BlobReference(blob_name) - assert net.BlobIsDefined(blob), 'blob {} is not defined in net {} whose proto is {}'.format(blob, net.Name(), net.Proto()) - - blob_i1 = net.Slice([blob], starts=[i1, 0], ends=[i1 + 1, -1]) - if self._i2 == -1: - blob_i1_i2 = net.Copy([blob_i1], - [net.NextScopedBlob(prefix=blob + '_{0}_all'.format(i1))]) - else: - blob_i1_i2 = net.Slice([blob_i1], - net.NextScopedBlob(prefix=blob + '_{0}_{1}'.format(i1, i2)), - starts=[0, i2], ends=[-1, i2 + 1]) - - if self._logging_frequency >= 1: - net.Print(blob_i1_i2, [], every_n=self._logging_frequency) - - if modify_output_record: - output_field_name = str(blob) + self._field_name_suffix - output_scalar = schema.Scalar((np.float64), blob_i1_i2) - - if net.output_record() is None: - net.set_output_record( - schema.Struct((output_field_name, output_scalar)) - ) - else: - net.AppendOutputRecordField(output_field_name, output_scalar) - - def field_name_suffix(self): - return self._field_name_suffix diff --git a/caffe2/python/modeling/get_entry_from_blobs_test.py b/caffe2/python/modeling/get_entry_from_blobs_test.py deleted file mode 100644 index 3ec146766f30..000000000000 --- a/caffe2/python/modeling/get_entry_from_blobs_test.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################## - - - - - - -import unittest -from caffe2.python import workspace, brew, model_helper -from caffe2.python.modeling.get_entry_from_blobs import GetEntryFromBlobs - -import numpy as np - - -class GetEntryFromBlobsTest(unittest.TestCase): - def test_get_entry_from_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=10, dim_out=8) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=8, dim_out=4) - i1, i2 = np.random.randint(4, size=2) - net_modifier = GetEntryFromBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - i1=i1, - i2=i2, - ) - net_modifier(model.net) - - workspace.FeedBlob('data', np.random.rand(10, 10).astype(np.float32)) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_{1}'.format(i1, i2)) - - self.assertEqual(fc1_w_entry.size, 1) - self.assertEqual(fc1_w_entry[0], fc1_w[i1][i2]) - assert model.net.output_record() is None - - def test_get_entry_from_blobs_modify_output_record(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=4) - - # no operator name set, will use default - brew.fc(model, fc1, "fc2", dim_in=4, dim_out=4) - i1, i2 = np.random.randint(4), np.random.randint(5) - 1 - net_modifier = GetEntryFromBlobs( - blobs=['fc1_w', 'fc2_w'], - logging_frequency=10, - i1=i1, - i2=i2, - ) - net_modifier(model.net, modify_output_record=True) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - fc1_w = workspace.FetchBlob('fc1_w') - if i2 < 0: - fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_all'.format(i1)) - else: - fc1_w_entry = workspace.FetchBlob('fc1_w_{0}_{1}'.format(i1, i2)) - - if i2 < 0: - self.assertEqual(fc1_w_entry.size, 4) - for j in range(4): - self.assertEqual(fc1_w_entry[0][j], fc1_w[i1][j]) - else: - self.assertEqual(fc1_w_entry.size, 1) - self.assertEqual(fc1_w_entry[0], fc1_w[i1][i2]) - - assert 'fc1_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() - assert 'fc2_w' + net_modifier.field_name_suffix() in\ - model.net.output_record().field_blobs(),\ - model.net.output_record().field_blobs() diff --git a/caffe2/python/modeling/gradient_clipping.py b/caffe2/python/modeling/gradient_clipping.py deleted file mode 100644 index b01bc2ba301f..000000000000 --- a/caffe2/python/modeling/gradient_clipping.py +++ /dev/null @@ -1,153 +0,0 @@ - - - - - -from caffe2.python import core -from caffe2.proto import caffe2_pb2 -from caffe2.python.optimizer import get_param_device -from caffe2.python.modeling.net_modifier import NetModifier - -import logging - -logger = logging.getLogger(__name__) - - -class GradientClipping(NetModifier): - - L1_NORM = 'l1_norm' - L2_NORM = 'l2_norm' - - BY_NORM = 'by_norm' - BY_VALUE = 'by_value' - - GRAD_CLIP_METHODS = [BY_NORM, BY_VALUE] - CLIP_GRADIENT_NORM_TYPES = [L2_NORM, L1_NORM] - - def __init__(self, grad_clip_method, clip_norm_type='l2_norm', - clip_threshold=0.1, use_parameter_norm=False, - compute_norm_ratio=False, clip_max=1, clip_min=-1, - blobs_to_include=None, blobs_to_exclude=None): - """ - Clips gradient to avoid gradient magnitude 
explosion or vanishing gradient. - - Args: - grad_clip_method: ways to clip the gradients - clip_norm_type: type of norm used in the necessary computation - clip_threshold: threshold used to determine whether to clip - use_parameter_norm: a boolean to indicate whether to incorporate - the norm of the parameter - compute_norm_ratio: a boolean to compute the ratio between gradient norm - and parameter norm explicitly for debugging purposes - clip_max: when clipping by_value, any value that is greater than - clip_max will be clipped to clip_max - clip_min: when clipping by_value, any value that is smaller than - clip_min will be clipped to clip_min - blobs_to_include: names of blobs whose gradient is to be clipped. If it is set - to None, the gradients of all params in grad_map will be clipped. - blobs_to_exclude: names of blobs whose gradient is not to be clipped. - """ - - assert grad_clip_method in self.GRAD_CLIP_METHODS, ( - "This method of clipping, {}, has not been implemented.".format( - grad_clip_method)) - if clip_norm_type is not None: - assert clip_norm_type in self.CLIP_GRADIENT_NORM_TYPES, ( - "This type of norm, {}, has not been implemented.".format( - clip_norm_type)) - - self.grad_clip_method = grad_clip_method - self.clip_norm_type = clip_norm_type - self.clip_threshold = float(clip_threshold) - self.use_parameter_norm = use_parameter_norm - self.compute_norm_ratio = compute_norm_ratio - self.clip_max = float(clip_max) - self.clip_min = float(clip_min) - self.blobs_to_include = blobs_to_include - self.blobs_to_exclude = blobs_to_exclude - - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - - assert grad_map is not None - - CPU = core.DeviceOption(caffe2_pb2.CPU) - - final_param_map = {} - if self.blobs_to_include is None: - final_param_map = grad_map - else: - for blob in self.blobs_to_include: - param = core.BlobReference(blob) - if not net.BlobIsDefined(param): - raise Exception('param {0} is not defined in net {1}'.format( - param, net.Name())) - final_param_map[param] = grad_map[param] - - if self.blobs_to_exclude is not None: - for blob in self.blobs_to_exclude: - final_param_map.pop(blob, None) - - for param, grad in final_param_map.items(): - # currently sparse gradients won't be clipped - # further implementation is needed to enable it - if isinstance(grad, core.GradientSlice): - continue - - device = get_param_device( - param, - grad_map[str(param)], - param_to_device=blob_to_device, - default_device=CPU, - ) - - with core.DeviceScope(device): - if self.grad_clip_method == self.BY_NORM: - if self.clip_norm_type == self.L2_NORM: - p = 2 - elif self.clip_norm_type == self.L1_NORM: - p = 1 - - grad_norm = net.LpNorm( - [grad], - net.NextScopedBlob(prefix=str(grad) + '_l{}_norm'.format(p)), - p=p, - ) - - if p == 2: - grad_norm = net.Pow([grad_norm], exponent=0.5) - - op_inputs = [grad, grad_norm] - - if self.use_parameter_norm: - param_norm = net.LpNorm( - [param], - net.NextScopedBlob( - prefix=str(param) + '_l{}_norm'.format(p)), - p=p, - ) - - if p == 2: - param_norm = net.Pow([param_norm], exponent=0.5) - - op_inputs.append(param_norm) - - if self.compute_norm_ratio: - net.Div( - [grad_norm, param_norm], - [net.NextScopedBlob( - prefix=str(param) + "_norm_ratio")] - ) - - net.ClipTensorByScaling( - op_inputs, - [grad], - threshold=self.clip_threshold, - ) - elif self.grad_clip_method == self.BY_VALUE: - net.Clip( - [grad], - [grad], - max=self.clip_max, - min=self.clip_min, - ) diff --git
a/caffe2/python/modeling/gradient_clipping_test.py b/caffe2/python/modeling/gradient_clipping_test.py deleted file mode 100644 index 0b0e962cb727..000000000000 --- a/caffe2/python/modeling/gradient_clipping_test.py +++ /dev/null @@ -1,271 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - - -import unittest -from caffe2.python import workspace, brew, model_helper -from caffe2.python.modeling.gradient_clipping import GradientClipping - -import numpy as np - - -class GradientClippingTest(unittest.TestCase): - def test_gradient_clipping_by_norm(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l2_norm', - clip_threshold=0.1, - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 2 * (3 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 17) - - def test_gradient_clipping_by_norm_l1_norm(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l1_norm', - clip_threshold=0.1, - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 2 * (2 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 15) - - def test_gradient_clipping_by_norm_using_param_norm(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - 
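# [Editorial note, hedged: the op counts asserted in these tests follow from
# the clipping subnet that GradientClipping emits per gradient. For by_norm
# with l2_norm, each gradient gets LpNorm + Pow + ClipTensorByScaling, i.e.
# 3 ops, hence 5 + 6 + 2 * 3 = 17 above; l1_norm skips the Pow, giving
# 5 + 6 + 2 * 2 = 15.]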
- # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l2_norm', - clip_threshold=0.1, - use_parameter_norm=True, - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 2 * (5 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 21) - - def test_gradient_clipping_by_norm_compute_norm_ratio(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l2_norm', - clip_threshold=0.1, - use_parameter_norm=True, - compute_norm_ratio=True, - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 2 * (6 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 23) - - def test_gradient_clipping_by_value(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - clip_max = 1e-8 - clip_min = 0 - net_modifier = GradientClipping( - grad_clip_method='by_value', - clip_max=clip_max, - clip_min=clip_min, - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 2 * (1 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 13) - - fc1_w_grad = workspace.FetchBlob('fc1_w_grad') - self.assertLessEqual(np.amax(fc1_w_grad), clip_max) - self.assertGreaterEqual(np.amin(fc1_w_grad), clip_min) - - def test_gradient_clipping_by_norm_including_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", 
dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l2_norm', - clip_threshold=0.1, - blobs_to_include=['fc1_w'], - blobs_to_exclude=None - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 1 * (3 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 14) - - def test_gradient_clipping_by_norm_excluding_blobs(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=4, dim_out=2) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=2, dim_out=1) - - sigm = model.net.Sigmoid(fc2, 'sigm') - sq = model.net.SquaredL2Distance([sigm, 'label'], 'sq') - loss = model.net.SumElements(sq, 'loss') - - grad_map = model.AddGradientOperators([loss]) - - grad_map_for_param = {key: grad_map[key] for key in ['fc1_w', 'fc2_w']} - - net_modifier = GradientClipping( - grad_clip_method='by_norm', - clip_norm_type='l2_norm', - clip_threshold=0.1, - blobs_to_include=None, - blobs_to_exclude=['fc1_w', 'fc2_w'] - ) - - net_modifier(model.net, grad_map=grad_map_for_param) - - workspace.FeedBlob('data', np.random.rand(10, 4).astype(np.float32)) - workspace.FeedBlob('label', np.random.rand(10, 1).astype(np.float32)) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - # 5 forward ops + 6 backward ops + 0 * (3 gradient clipping ops) - self.assertEqual(len(model.net.Proto().op), 11) diff --git a/caffe2/python/modeling/initializers.py b/caffe2/python/modeling/initializers.py deleted file mode 100644 index 8e2943a8955b..000000000000 --- a/caffe2/python/modeling/initializers.py +++ /dev/null @@ -1,153 +0,0 @@ - - - - - -from caffe2.python.core import DataType, BlobReference, ScopedBlobReference -from caffe2.python.modeling.parameter_info import ParameterInfo - - -class Initializer: - ''' - This class abstracts out parameter creation. One can come up with a new - Initializer in order to implement more complex parameter initialization logic - ''' - - def __init__(self, operator_name=None, **kwargs): - self.operator_name = operator_name - self.operator_kwargs = kwargs - - def update(self, operator_name, kwargs): - if self.operator_name is not None: - raise Exception("Operator name overwrites are not allowed") - self.operator_name = operator_name - self.operator_kwargs = kwargs - - def create_param(self, param_name, init_net, shape): - param = init_net.__getattr__(self.operator_name)( - [], param_name, shape=shape, **self.operator_kwargs) - return ParameterInfo( - param_id=None, - param=param, - shape=shape, - ) - - -class ExternalInitializer: - ''' - This class is used in cases when the parameter should not be initialized by - the initializer, but rather provided in the workspace when param_init_net is - executed. 
- - Current version is not doing any real sanity checks to the parameter. - ''' - - def create_param(self, param_name, init_net, shape): - if isinstance(param_name, BlobReference): - param = BlobReference(str(param_name), init_net) - elif isinstance(param_name, str): - param = ScopedBlobReference(param_name, init_net) - else: - raise TypeError("Unsupported type for param_name") - # TODO(amalevich): Add operator that will check param in the workspace - return ParameterInfo( - param_id=None, - param=param, - shape=shape, - ) - - -class PseudoFP16Initializer(Initializer): - ''' - Used in cases when the parameter should be used at half (16-bit) precision - for compute purposes (i.e. on the forward and backward pass) but - needs to be stored and optimized at single (32-bit) precision so tiny - gradients with small learning rates don't underflow FP16 precision. - A 32-bit copy of the 16-bit blob is stored in the ParameterInfo. - This is helpful for mixed-precision training, see - https://arxiv.org/abs/1710.03740 for details. - ''' - def update(self, operator_name, kwargs): - if self.operator_name is not None: - raise Exception("Operator name overwrites are not allowed") - self.operator_name = operator_name - self.operator_kwargs = kwargs - - def create_param(self, param_name, init_net, shape): - # create master fp32 copy - param_fp32 = init_net.__getattr__(self.operator_name)( - [], param_name + "_fp32", shape=shape, - **self.operator_kwargs) - # cast to fp16 copy - param = init_net.FloatToHalf( - param_fp32, param_name) - - return ParameterInfo( - param_id=None, - param=param, - shape=shape, - blob_copy={DataType.FLOAT: param_fp32} - ) - - -class ReversePseudoFP16Initializer(Initializer): - ''' - Like PseudoFP16Initializer above, except the primary blob is taken to - be the 32-bit precision parameter, and the 16-bit version of the blob - is stored in blob_copy instead. - ''' - def update(self, operator_name, kwargs): - if self.operator_name is not None: - raise Exception("Operator name overwrites are not allowed") - self.operator_name = operator_name - self.operator_kwargs = kwargs - - def create_param(self, param_name, init_net, shape): - # create master fp32 copy - param_fp32 = init_net.__getattr__(self.operator_name)( - [], param_name, shape=shape, - **self.operator_kwargs) - # cast to fp16 copy - param_fp16 = init_net.FloatToHalf( - param_fp32, param_name + "_fp16") - - return ParameterInfo( - param_id=None, - param=param_fp32, - shape=shape, - blob_copy={DataType.FLOAT16: param_fp16} - ) - -def update_initializer(initializer_class, - operator_name_and_kwargs, - default_operator_name_and_kwargs): - ''' - A helper function to convert from operator_name_and_kwargs to new - object of type initializer_class. This function serves two purposes: - - 1. Support for custom initialization operators being passed in - 2. 
Allow user to specify a custom Initializer without overwriting - default operators used for initialization - - If initializer_class is None, creates a default initializer using - the Initializer class and operator_name_and_kwargs provided - - If operator_name_and_kwargs is None, uses default_operator_name_and_kwargs - - returns an instantiated Initializer object - ''' - def get_initializer_args(): - return ( - operator_name_and_kwargs or - default_operator_name_and_kwargs - ) - - if initializer_class is not None: - init = initializer_class(get_initializer_args()[0], - **get_initializer_args()[1]) - else: - init = Initializer( - get_initializer_args()[0], - **get_initializer_args()[1] - ) - return init diff --git a/caffe2/python/modeling/initializers_test.py b/caffe2/python/modeling/initializers_test.py deleted file mode 100644 index fad40c159b6e..000000000000 --- a/caffe2/python/modeling/initializers_test.py +++ /dev/null @@ -1,56 +0,0 @@ - - - - - -import unittest -from caffe2.python import brew, model_helper, workspace -from caffe2.python.modeling.initializers import ( - Initializer, PseudoFP16Initializer) - - -class InitializerTest(unittest.TestCase): - def test_fc_initializer(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=1, dim_out=1) - - # no operator name set, will use default - fc2 = brew.fc(model, fc1, "fc2", dim_in=1, dim_out=1, - WeightInitializer=Initializer) - - # no operator name set, will use custom - fc3 = brew.fc(model, fc2, "fc3", dim_in=1, dim_out=1, - WeightInitializer=Initializer, - weight_init=("ConstantFill", {}), - ) - - # operator name set, no initializer class set - fc4 = brew.fc(model, fc3, "fc4", dim_in=1, dim_out=1, - WeightInitializer=None, - weight_init=("ConstantFill", {}) - ) - - @unittest.skipIf(not workspace.has_gpu_support, 'No GPU support') - def test_fc_fp16_initializer(self): - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=1, dim_out=1) - - # default operator, PseudoFP16Initializer - fc2 = brew.fc(model, fc1, "fc2", dim_in=1, dim_out=1, - WeightInitializer=PseudoFP16Initializer - ) - - # specified operator, PseudoFP16Initializer - fc3 = brew.fc(model, fc2, "fc3", dim_in=1, dim_out=1, - weight_init=("ConstantFill", {}), - WeightInitializer=PseudoFP16Initializer - ) - - def test_fc_external_initializer(self): - model = model_helper.ModelHelper(name="test", init_params=False) - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=1, dim_out=1) # noqa - self.assertEqual(len(model.net.Proto().op), 1) - self.assertEqual(len(model.param_init_net.Proto().op), 0) diff --git a/caffe2/python/modeling/net_modifier.py b/caffe2/python/modeling/net_modifier.py deleted file mode 100644 index 55f47f8fbac8..000000000000 --- a/caffe2/python/modeling/net_modifier.py +++ /dev/null @@ -1,31 +0,0 @@ - - - - - -import abc - - -class NetModifier(metaclass=abc.ABCMeta): - """ - An abstraction class for supporting modifying a generated net. - Inherited classes should implement the modify_net method where - related operators are added to the net. 
- - Example usage: - modifier = SomeNetModifier(opts) - modifier(net) - """ - - @abc.abstractmethod - def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - pass - - def __call__(self, net, init_net=None, grad_map=None, blob_to_device=None, - modify_output_record=False): - self.modify_net( - net, - init_net=init_net, - grad_map=grad_map, - blob_to_device=blob_to_device, - modify_output_record=modify_output_record) diff --git a/caffe2/python/modeling/parameter_info.py b/caffe2/python/modeling/parameter_info.py deleted file mode 100644 index dfbaffbd801c..000000000000 --- a/caffe2/python/modeling/parameter_info.py +++ /dev/null @@ -1,54 +0,0 @@ - - - - - -from caffe2.python import core - -import numpy as np - - -class ParameterTags: - BIAS = 'BIAS' - WEIGHT = 'WEIGHT' - COMPUTED_PARAM = 'COMPUTED_PARAM' - - -class ParameterInfo: - - def __init__( - self, param_id, param, key=None, shape=None, length=None, - grad=None, blob_copy=None): - assert isinstance(param, core.BlobReference) - self.param_id = param_id - self.name = str(param) - self.blob = param - self.key = key - self.shape = shape - self.size = None if shape is None else np.prod(shape) - self.length = max(1, length if length is not None else 1) - self.grad = grad - self._cloned_init_net = None - # Optionally store equivalent copies of the blob - # in different precisions (i.e. half and float copies) - # stored as a dict of TensorProto.DataType -> BlobReference - self.blob_copy = blob_copy - # each param_info can have its own optimizer. It can be set within - # OptimizerContext (caffe2/python/optimizer.py) - self._optimizer = None - - @property - def parameter(self): - return self.blob - - @property - def optimizer(self): - return self._optimizer - - @optimizer.setter - def optimizer(self, value): - assert self._optimizer is None, "optimizer has already been set" - self._optimizer = value - - def __str__(self): - return self.name diff --git a/caffe2/python/modeling/parameter_sharing.py b/caffe2/python/modeling/parameter_sharing.py deleted file mode 100644 index afb1b53fdcb6..000000000000 --- a/caffe2/python/modeling/parameter_sharing.py +++ /dev/null @@ -1,118 +0,0 @@ - - - - - -from caffe2.python import scope - -import contextlib -import logging - -logger = logging.getLogger(__name__) - - -class ParameterSharingContext: - """ - This class manages a scope-driven way of parameter sharing across different - NameScopes. - """ - - def __init__(self): - self._scope_overrides = {} - self._contexts = [] - - def _resolve_scope_overrides(self, candidate_scope): - """ - Recursively resolves all scope overrides, i.e. multiple steps of - override can be used. - - For example, if one provides the following scope overrides: - {'scope_b': 'scope_a'} and within 'scope_b' - {'shared_child': ''}, - then name 'w' will get resolved to the following blobs depending on the - namescope: - a. 'scope_a' -> 'scope_a/w' - b. 'scope_b' -> 'scope_a/w' - c. 'scope_c' -> 'scope_c/w' - d. 'scope_b/shared_child' -> 'scope_a/w' - e.
'scope_b/unshared_child' -> 'scope_a/unshared_child/w' - """ - best_scope = candidate_scope - best_scope_idx = 0 - sub_scopes = candidate_scope.split(scope._NAMESCOPE_SEPARATOR) - - cur_scope = '' - for idx, sub_scope in enumerate(sub_scopes): - cur_scope = cur_scope + sub_scope + scope._NAMESCOPE_SEPARATOR - if cur_scope in self._scope_overrides: - best_scope = self._scope_overrides[cur_scope] - best_scope_idx = idx - if best_scope == candidate_scope: - return candidate_scope - else: - return (self._resolve_scope_overrides(best_scope) + - scope._NAMESCOPE_SEPARATOR.join( - sub_scopes[best_scope_idx + 1:])) - - def get_parameter_name(self, name): - candidate_scope = scope.CurrentNameScope() - best_scope = self._resolve_scope_overrides(candidate_scope) - if best_scope != candidate_scope: - logger.info("Overwriting scope {0} with scope {1}".format( - candidate_scope, best_scope)) - - return best_scope + name - - def add_scope_overrides(self, shared_scopes): - self._contexts.append(shared_scopes) - self._scope_overrides.update(shared_scopes) - - def pop(self): - assert len(self._contexts) > 0 - self._contexts.pop() - self._scope_overrides = {} - for x in self._contexts: - self._scope_overrides.update(x) - - -parameter_sharing_context = ParameterSharingContext() - - -def _normalize_namescope(namescope): - if namescope and namescope[-1] != scope._NAMESCOPE_SEPARATOR: - return namescope + scope._NAMESCOPE_SEPARATOR - else: - return namescope - - -@contextlib.contextmanager -def ParameterSharing(shared_scopes): - """ - Helper function for sharing scopes. - All the parameters within the shared_scopes will be remapped with - respect to CurrentNameScope() - - I.e. if one calls ParameterSharing with {'scope_b': 'scope_a'}, from the - scope 'some_global_scope', it effectively means that all parameters from - 'some_global_scope/scope_b' will be shared with the parameters from - 'some_global_scope/scope_a' - """ - assert isinstance(shared_scopes, dict) - - shared_scope_overrides = {} - current_scope = scope.CurrentNameScope() - for k, v in shared_scopes.items(): - assert not v.startswith(k), ( - "Illegal override for parameter sharing. {} is prefix of {}".
- format(k, v)) - k = current_scope + k - v = current_scope + v - # Normalize all the scopes, so scope_a and scope_a/ are equivalent - k = _normalize_namescope(k) - v = _normalize_namescope(v) - shared_scope_overrides[k] = v - - try: - parameter_sharing_context.add_scope_overrides(shared_scope_overrides) - yield - finally: - parameter_sharing_context.pop() diff --git a/caffe2/python/modeling/parameter_sharing_test.py b/caffe2/python/modeling/parameter_sharing_test.py deleted file mode 100644 index d845d6decb46..000000000000 --- a/caffe2/python/modeling/parameter_sharing_test.py +++ /dev/null @@ -1,155 +0,0 @@ - - - - - -from caffe2.python import brew, model_helper, scope -from caffe2.python.modeling.parameter_sharing import ( - ParameterSharing, - parameter_sharing_context, -) -from caffe2.python.modeling.initializers import ( - Initializer -) -import unittest - - -class ParameterSharingTest(unittest.TestCase): - - def test_parameter_sharing_default_scopes(self): - # Test no sharing default scopes - param_1 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_1, 'w') - with scope.NameScope('scope'): - param_2 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_2, 'scope/w') - with scope.NameScope('scope_2'): - param_3 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_3, 'scope/scope_2/w') - - def test_parameter_sharing_nested_scopes(self): - # Test parameter sharing - with scope.NameScope('global_scope'): - with ParameterSharing({'model_b': 'model_a'}): - param_global = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_global, 'global_scope/w') - # This scope is overridden to match 'model_a' - with scope.NameScope('model_b'): - with ParameterSharing({'shared_scope': ''}): - param_4 = parameter_sharing_context.get_parameter_name( - 'w') - self.assertEqual(param_4, 'global_scope/model_a/w') - with scope.NameScope('shared_scope'): - param_5 = parameter_sharing_context.\ - get_parameter_name('w') - self.assertEqual(param_5, 'global_scope/model_a/w') - # This scope is supposed to have not sharing - with scope.NameScope('model_c'): - with ParameterSharing({'shared_scope': ''}): - param_4 = parameter_sharing_context.get_parameter_name( - 'w') - self.assertEqual(param_4, 'global_scope/model_c/w') - with scope.NameScope('shared_scope'): - param_5 = parameter_sharing_context.\ - get_parameter_name('w') - self.assertEqual(param_5, 'global_scope/model_c/w') - - def test_parameter_sharing_subscopes(self): - # Sharing only one of the subscopes - with ParameterSharing({'global_scope/b': 'global_scope/a'}): - with scope.NameScope('global_scope'): - param_6 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_6, 'global_scope/w') - with scope.NameScope('a'): - param_7 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_7, 'global_scope/a/w') - with scope.NameScope('b'): - param_8 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_8, 'global_scope/a/w') - with scope.NameScope('c'): - param_9 = parameter_sharing_context.get_parameter_name('w') - self.assertEqual(param_9, 'global_scope/c/w') - - def test_create_param(self): - model = model_helper.ModelHelper(name="test") - # Test no sharing default scopes - p1 = model.create_param( - 'w', - shape=[2], - initializer=Initializer("ConstantFill") - ) - with scope.NameScope('some_global_scope'): - p2 = model.create_param( - 'w', - shape=[2], - initializer=Initializer("ConstantFill") - ) - 
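# [Editorial note, hedged: with no ParameterSharing overrides active, p1
# resolves to the blob 'w' while p2 resolves to 'some_global_scope/w', so the
# model tracks two distinct ParameterInfo entries -- which is exactly what the
# assertions below verify.]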
self.assertNotEqual(model.get_param_info(p1), None) - self.assertNotEqual(model.get_param_info(p2), None) - self.assertNotEqual(model.get_param_info(p1), model.get_param_info(p2)) - model.Validate() - - def test_deep_hierarchy(self): - model = model_helper.ModelHelper(name="test") - with ParameterSharing({'a': 'b'}): - with scope.NameScope('a'): - with ParameterSharing({'c': 'd'}): - with scope.NameScope('c'): - with ParameterSharing({'e': 'f'}): - with scope.NameScope('e'): - p = model.create_param( - 'w', - shape=[2], - initializer=Initializer("ConstantFill") - ) - self.assertNotEqual(model.get_param_info(p), None) - - - def test_parameter_sharing_brew(self): - # Test no sharing default scopes - model = model_helper.ModelHelper(name="test") - data = model.net.AddExternalInput("data") - fc1 = brew.fc(model, data, "fc1", dim_in=16, dim_out=16) - # Shared params are expected to share the same shape and fail if it's - # not true - with self.assertRaises(AssertionError): - _ = brew.fc(model, data, "fc1", dim_in=2, dim_out=2) # noqa - - output_blobs = set() - with scope.NameScope('some_global_scope'): - with scope.NameScope('model_a'): - output_blobs.add(str(brew.fc(model, fc1, 'output', 16, 16))) - with ParameterSharing({'model_b': 'model_a'}),\ - scope.NameScope('model_b'): - with ParameterSharing({'shared_1': '', 'shared_2': ''}): - # All params in DenseLayers from shared_1, shared_2 and - # model_a are shared and will be pointing to: - # [some_global_scope/model_a/output_W, - # some_global_scope/model_a/output_b] - with scope.NameScope('shared_1'): - output_blobs.add( - str(brew.fc(model, fc1, 'output', 16, 16))) - with scope.NameScope('shared_2'): - output_blobs.add( - str(brew.fc(model, fc1, 'output', 16, 16))) - # Params of this layer are not shared with anyone unless - # there is some explicit sharing with model_a/unshared (not - # in this example). - # Names of the blobs are - # [some_global_scope/model_a/unshared/output_W, - # some_global_scope/model_a/unshared/output_b] - with scope.NameScope('unshared'): - output_blobs.add( - str(brew.fc(model, fc1, 'output', 16, 16))) - - self.assertEqual(len(model._parameters_info), 6) - self.assertEqual(len(output_blobs), 4) - self.assertEqual(sorted(model._parameters_info.keys()), [ - 'fc1_b', - 'fc1_w', - 'some_global_scope/model_a/output_b', - 'some_global_scope/model_a/output_w', - 'some_global_scope/model_a/unshared/output_b', - 'some_global_scope/model_a/unshared/output_w', - ]) - model.Validate() diff --git a/caffe2/python/models/__init__.py b/caffe2/python/models/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/models/__sym_init__.py b/caffe2/python/models/__sym_init__.py deleted file mode 100644 index fa10bff7246b..000000000000 --- a/caffe2/python/models/__sym_init__.py +++ /dev/null @@ -1,20 +0,0 @@ - - - - -import os -from caffe2.proto import caffe2_pb2 - - -def _parseFile(filename): - out_net = caffe2_pb2.NetDef() - # TODO(bwasti): A more robust handler for pathnames. 
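# [Editorial sketch of the TODO above, hedged: os.path.join would avoid the
# manual '/' formatting, e.g.
#     with open(os.path.join(dir_path, filename), 'rb') as f:
#         out_net.ParseFromString(f.read())]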
- dir_path = os.path.dirname(__file__) - with open('{dir_path}/{filename}'.format(dir_path=dir_path, - filename=filename), 'rb') as f: - out_net.ParseFromString(f.read()) - return out_net - - -init_net = _parseFile('init_net.pb') -predict_net = _parseFile('predict_net.pb') diff --git a/caffe2/python/models/download.py b/caffe2/python/models/download.py deleted file mode 100644 index f7e0234cde87..000000000000 --- a/caffe2/python/models/download.py +++ /dev/null @@ -1,215 +0,0 @@ -## @package download -# Module caffe2.python.models.download - - - - -import argparse -import os -import sys -import signal -import re -import json - -from caffe2.proto import caffe2_pb2 - -# Import urllib -from urllib.error import HTTPError, URLError -import urllib.request as urllib - -# urllib requires more work to deal with a redirect, so not using vanity url -DOWNLOAD_BASE_URL = "https://s3.amazonaws.com/download.caffe2.ai/models/" -DOWNLOAD_COLUMNS = 70 - - -# Don't let urllib hang up on big downloads -def signalHandler(signal, frame): - print("Killing download...") - sys.exit(0) - - -signal.signal(signal.SIGINT, signalHandler) - - -def deleteDirectory(top_dir): - for root, dirs, files in os.walk(top_dir, topdown=False): - for name in files: - os.remove(os.path.join(root, name)) - for name in dirs: - os.rmdir(os.path.join(root, name)) - os.rmdir(top_dir) - - -def progressBar(percentage): - full = int(DOWNLOAD_COLUMNS * percentage / 100) - bar = full * "#" + (DOWNLOAD_COLUMNS - full) * " " - sys.stdout.write(u"\u001b[1000D[" + bar + "] " + str(percentage) + "%") - sys.stdout.flush() - - -def downloadFromURLToFile(url, filename, show_progress=True): - try: - print("Downloading from {url}".format(url=url)) - response = urllib.urlopen(url) - size = int(response.info().get('Content-Length').strip()) - chunk = min(size, 8192) - print("Writing to {filename}".format(filename=filename)) - if show_progress: - downloaded_size = 0 - progressBar(0) - with open(filename, "wb") as local_file: - while True: - data_chunk = response.read(chunk) - if not data_chunk: - break - local_file.write(data_chunk) - if show_progress: - downloaded_size += len(data_chunk) - progressBar(int(100 * downloaded_size / size)) - print("") # New line to fix for progress bar - except HTTPError as e: - raise Exception("Could not download model. [HTTP Error] {code}: {reason}." - .format(code=e.code, reason=e.reason)) from e - except URLError as e: - raise Exception("Could not download model. [URL Error] {reason}." - .format(reason=e.reason)) from e - - -def getURLFromName(name, filename): - return "{base_url}{name}/{filename}".format(base_url=DOWNLOAD_BASE_URL, - name=name, filename=filename) - - -def downloadModel(model, args): - # Figure out where to store the model - model_folder = '{folder}'.format(folder=model) - dir_path = os.path.dirname(os.path.realpath(__file__)) - if args.install: - model_folder = '{dir_path}/{folder}'.format(dir_path=dir_path, - folder=model) - - # Check if that folder is already there - if os.path.exists(model_folder) and not os.path.isdir(model_folder): - if not args.force: - raise Exception("Cannot create folder for storing the model,\ - there exists a file of the same name.") - else: - print("Overwriting existing file! ({filename})" - .format(filename=model_folder)) - os.remove(model_folder) - if os.path.isdir(model_folder): - if not args.force: - response = "" - query = "Model already exists, continue? 
[y/N] " - try: - response = raw_input(query) - except NameError: - response = input(query) - if response.upper() == 'N' or not response: - print("Cancelling download...") - sys.exit(0) - print("Overwriting existing folder! ({filename})".format(filename=model_folder)) - deleteDirectory(model_folder) - - # Now we can safely create the folder and download the model - os.makedirs(model_folder) - for f in ['predict_net.pb', 'init_net.pb']: - try: - downloadFromURLToFile(getURLFromName(model, f), - '{folder}/{f}'.format(folder=model_folder, - f=f)) - except Exception as e: - print("Abort: {reason}".format(reason=str(e))) - print("Cleaning up...") - deleteDirectory(model_folder) - sys.exit(0) - - if args.install: - os.symlink("{folder}/__sym_init__.py".format(folder=dir_path), - "{folder}/__init__.py".format(folder=model_folder)) - - -def validModelName(name): - invalid_names = ['__init__'] - if name in invalid_names: - return False - if not re.match("^[/0-9a-zA-Z_-]+$", name): - return False - return True - -class ModelDownloader: - - def __init__(self, model_env_name='CAFFE2_MODELS'): - self.model_env_name = model_env_name - - def _model_dir(self, model): - caffe2_home = os.path.expanduser(os.getenv('CAFFE2_HOME', '~/.caffe2')) - models_dir = os.getenv(self.model_env_name, os.path.join(caffe2_home, 'models')) - return os.path.join(models_dir, model) - - def _download(self, model): - model_dir = self._model_dir(model) - assert not os.path.exists(model_dir) - os.makedirs(model_dir) - - for f in ['predict_net.pb', 'init_net.pb', 'value_info.json']: - url = getURLFromName(model, f) - dest = os.path.join(model_dir, f) - try: - downloadFromURLToFile(url, dest, show_progress=False) - except TypeError: - # show_progress not supported prior to - # Caffe2 78c014e752a374d905ecfb465d44fa16e02a28f1 - # (Sep 17, 2017) - downloadFromURLToFile(url, dest) - except Exception: - deleteDirectory(model_dir) - raise - - # This version returns an extra debug_str argument that helps to understand - # why our work sometimes fails in sandcastle - def get_c2_model_dbg(self, model_name): - debug_str = "get_c2_model debug:\n" - model_dir = self._model_dir(model_name) - if not os.path.exists(model_dir): - self._download(model_name) - - c2_predict_pb = os.path.join(model_dir, 'predict_net.pb') - debug_str += "c2_predict_pb path: " + c2_predict_pb + "\n" - c2_predict_net = caffe2_pb2.NetDef() - with open(c2_predict_pb, 'rb') as f: - len_read = c2_predict_net.ParseFromString(f.read()) - debug_str += "c2_predict_pb ParseFromString = " + str(len_read) + "\n" - c2_predict_net.name = model_name - - c2_init_pb = os.path.join(model_dir, 'init_net.pb') - debug_str += "c2_init_pb path: " + c2_init_pb + "\n" - c2_init_net = caffe2_pb2.NetDef() - with open(c2_init_pb, 'rb') as f: - len_read = c2_init_net.ParseFromString(f.read()) - debug_str += "c2_init_pb ParseFromString = " + str(len_read) + "\n" - c2_init_net.name = model_name + '_init' - - with open(os.path.join(model_dir, 'value_info.json')) as f: - value_info = json.load(f) - return c2_init_net, c2_predict_net, value_info, debug_str - - def get_c2_model(self, model_name): - init_net, predict_net, value_info, _ = self.get_c2_model_dbg(model_name) - return init_net, predict_net, value_info - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description='Download or install pretrained models.') - parser.add_argument('model', nargs='+', - help='Model to download/install.') - parser.add_argument('-i', '--install', action='store_true', - help='Install the model.') - 
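# [Editorial example, hedged -- a typical invocation of this script; the model
# name is illustrative:
#     python -m caffe2.python.models.download --install squeezenet]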
parser.add_argument('-f', '--force', action='store_true', - help='Force a download/installation.') - args = parser.parse_args() - for model in args.model: - if validModelName(model): - downloadModel(model, args) - else: - print("'{}' is not a valid model name.".format(model)) diff --git a/caffe2/python/models/imagenet_trainer_test_utils.py b/caffe2/python/models/imagenet_trainer_test_utils.py deleted file mode 100644 index fec7708ea150..000000000000 --- a/caffe2/python/models/imagenet_trainer_test_utils.py +++ /dev/null @@ -1,200 +0,0 @@ - - - - - -import numpy as np -import time - -from caffe2.python import workspace, cnn, memonger, core - -def has_blob(proto, needle): - for op in proto.op: - for inp in op.input: - if inp == needle: - return True - for outp in op.output: - if outp == needle: - return True - return False - - -def count_blobs(proto): - blobs = set() - for op in proto.op: - blobs = blobs.union(set(op.input)).union(set(op.output)) - return len(blobs) - - -def count_shared_blobs(proto): - blobs = set() - for op in proto.op: - blobs = blobs.union(set(op.input)).union(set(op.output)) - return len([b for b in blobs if "_shared" in b]) - - -def test_shared_grads( - with_shapes, - create_model, - conv_blob, - last_out_blob, - data_blob='gpu_0/data', - label_blob='gpu_0/label', - num_labels=1000, -): - model = cnn.CNNModelHelper( - order="NCHW", - name="test", - cudnn_exhaustive_search=True, - ) - with core.NameScope("gpu_0"): - data = model.net.AddExternalInput(data_blob) - label = model.net.AddExternalInput(label_blob) - (_softmax, loss) = create_model( - model, - data, - num_input_channels=3, - num_labels=num_labels, - label=label, - is_test=False, - ) - - param_to_grad = model.AddGradientOperators([loss]) - - (shapes, types) = workspace.InferShapesAndTypes( - [model.param_init_net, model.net], - {data_blob: [4, 3, 227, 227], - label_blob: [4]}, - ) - - count_before = count_blobs(model.net.Proto()) - optim_proto = memonger.share_grad_blobs( - model.net, - ["gpu_0/loss"], - set(model.param_to_grad.values()), - "gpu_0/", - share_activations=True, - dont_share_blobs=set([str(param_to_grad[conv_blob])]), - blob_shapes=shapes if with_shapes else None, - ) - count_after = count_blobs(optim_proto) - - # Run model and compare results. 
We check that the loss is same - # and also that the final gradient (conv1_w_grad is same) - workspace.RunNetOnce(model.param_init_net) - data = np.random.rand(4, 3, 227, 227).astype(np.float32) - label = (np.random.rand(4) * num_labels).astype(np.int32) - - workspace.FeedBlob(data_blob, data) - workspace.FeedBlob(label_blob, label) - - workspace.RunNetOnce(model.net) - model.net.Proto().type = 'dag' - model.net.Proto().num_workers = 4 - loss1 = workspace.FetchBlob(last_out_blob) - conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob]) - workspace.FeedBlob(param_to_grad[conv_blob], np.array([0.0])) - - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob(last_out_blob) - optim_conv1_w_grad = workspace.FetchBlob(param_to_grad[conv_blob]) - - return [(count_after, count_before), - (loss1, optimized_loss1), - (conv1_w_grad, optim_conv1_w_grad)] - - -def test_forward_only( - create_model, - last_out_blob, - data_blob='gpu_0/data', - num_labels=1000, -): - model = cnn.CNNModelHelper( - order="NCHW", - name="test", - cudnn_exhaustive_search=True, - ) - with core.NameScope("gpu_0"): - data = model.net.AddExternalInput(data_blob) - create_model( - model, - data, - num_input_channels=3, - num_labels=num_labels, - is_test=True - ) - - count_before = count_blobs(model.net.Proto()) - optim_proto = memonger.optimize_inference_for_dag( - model.net, [data_blob], "gpu_0/" - ) - count_after = count_blobs(optim_proto) - num_shared_blobs = count_shared_blobs(optim_proto) - - # Run model and compare results - workspace.RunNetOnce(model.param_init_net) - data = np.random.rand(4, 3, 227, 227).astype(np.float32) - - workspace.FeedBlob(data_blob, data) - workspace.RunNetOnce(model.net) - model.net.Proto().type = 'dag' - model.net.Proto().num_workers = 4 - loss1 = workspace.FetchBlob(last_out_blob) - - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob(last_out_blob) - return [(count_after, count_before), - (num_shared_blobs), - (loss1, optimized_loss1)] - - -def test_forward_only_fast_simplenet( - create_model, - last_out_blob, - data_blob="gpu_0/data", - num_labels=1000, -): - model = cnn.CNNModelHelper( - order="NCHW", - name="test", - cudnn_exhaustive_search=True, - ) - with core.NameScope("gpu_0"): - data = model.net.AddExternalInput(data_blob) - create_model( - model, - data, - num_input_channels=3, - num_labels=num_labels, - is_test=True - ) - - count_before = count_blobs(model.net.Proto()) - t = time.time() - optim_proto = memonger.optimize_inference_fast( - model.net.Proto(), - set([data_blob, last_out_blob]).union( - set(model.net.Proto().external_input)) - ) - print("Optimization took {} secs".format(time.time() - t)) - count_after = count_blobs(optim_proto) - num_shared_blobs = count_shared_blobs(optim_proto) - - print(count_after, count_before, num_shared_blobs) - - # Run model and compare results - workspace.RunNetOnce(model.param_init_net) - data = np.random.rand(4, 3, 227, 227).astype(np.float32) - - workspace.FeedBlob(data_blob, data) - model.net.Proto().type = 'simple' - - workspace.RunNetOnce(model.net) - loss1 = workspace.FetchBlob(last_out_blob) - - workspace.RunNetOnce(optim_proto) - optimized_loss1 = workspace.FetchBlob(last_out_blob) - return [(count_after, count_before), - (num_shared_blobs), - (loss1, optimized_loss1)] diff --git a/caffe2/python/models/resnet.py b/caffe2/python/models/resnet.py deleted file mode 100644 index 430d3d335e1e..000000000000 --- a/caffe2/python/models/resnet.py +++ /dev/null @@ -1,434 +0,0 @@ -## @package resnet 
-# Module caffe2.python.models.resnet - - - - - -from caffe2.python import brew -import logging - -''' -Utility for creating ResNe(X)t -"Deep Residual Learning for Image Recognition" by He, Zhang et. al. 2015 -"Aggregated Residual Transformations for Deep Neural Networks" by Xie et. al. 2016 -''' - - -class ResNetBuilder(): - ''' - Helper class for constructing residual blocks. - ''' - - def __init__( - self, - model, - prev_blob, - no_bias, - is_test, - bn_epsilon=1e-5, - bn_momentum=0.9, - ): - self.model = model - self.comp_count = 0 - self.comp_idx = 0 - self.prev_blob = prev_blob - self.is_test = is_test - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum - self.no_bias = 1 if no_bias else 0 - - def add_conv( - self, - in_filters, - out_filters, - kernel, - stride=1, - group=1, - pad=0, - ): - self.comp_idx += 1 - self.prev_blob = brew.conv( - self.model, - self.prev_blob, - 'comp_%d_conv_%d' % (self.comp_count, self.comp_idx), - in_filters, - out_filters, - weight_init=("MSRAFill", {}), - kernel=kernel, - stride=stride, - group=group, - pad=pad, - no_bias=self.no_bias, - ) - return self.prev_blob - - def add_relu(self): - self.prev_blob = brew.relu( - self.model, - self.prev_blob, - self.prev_blob, # in-place - ) - return self.prev_blob - - def add_spatial_bn(self, num_filters): - self.prev_blob = brew.spatial_bn( - self.model, - self.prev_blob, - 'comp_%d_spatbn_%d' % (self.comp_count, self.comp_idx), - num_filters, - epsilon=self.bn_epsilon, - momentum=self.bn_momentum, - is_test=self.is_test, - ) - return self.prev_blob - - ''' - Add a "bottleneck" component as described in He et. al. Figure 3 (right) - ''' - - def add_bottleneck( - self, - input_filters, # num of feature maps from preceding layer - base_filters, # num of filters internally in the component - output_filters, # num of feature maps to output - stride=1, - group=1, - spatial_batch_norm=True, - ): - self.comp_idx = 0 - shortcut_blob = self.prev_blob - - # 1x1 - self.add_conv( - input_filters, - base_filters, - kernel=1, - stride=1, - ) - - if spatial_batch_norm: - self.add_spatial_bn(base_filters) - - self.add_relu() - - # 3x3 (note the pad, required for keeping dimensions) - self.add_conv( - base_filters, - base_filters, - kernel=3, - stride=stride, - group=group, - pad=1, - ) - - if spatial_batch_norm: - self.add_spatial_bn(base_filters) - self.add_relu() - - # 1x1 - last_conv = self.add_conv(base_filters, output_filters, kernel=1) - if spatial_batch_norm: - last_conv = self.add_spatial_bn(output_filters) - - # Summation with input signal (shortcut) - # When the number of feature maps mismatch between the input - # and output (this usually happens when the residual stage - # changes), we need to do a projection for the short cut - if output_filters != input_filters: - shortcut_blob = brew.conv( - self.model, - shortcut_blob, - 'shortcut_projection_%d' % self.comp_count, - input_filters, - output_filters, - weight_init=("MSRAFill", {}), - kernel=1, - stride=stride, - no_bias=self.no_bias, - ) - if spatial_batch_norm: - shortcut_blob = brew.spatial_bn( - self.model, - shortcut_blob, - 'shortcut_projection_%d_spatbn' % self.comp_count, - output_filters, - epsilon=self.bn_epsilon, - momentum=self.bn_momentum, - is_test=self.is_test, - ) - - self.prev_blob = brew.sum( - self.model, [shortcut_blob, last_conv], - 'comp_%d_sum_%d' % (self.comp_count, self.comp_idx) - ) - self.comp_idx += 1 - self.add_relu() - - # Keep track of number of high level components if this ResNetBuilder - self.comp_count += 1 - - 
return output_filters - - def add_simple_block( - self, - input_filters, - num_filters, - down_sampling=False, - spatial_batch_norm=True - ): - self.comp_idx = 0 - shortcut_blob = self.prev_blob - - # 3x3 - self.add_conv( - input_filters, - num_filters, - kernel=3, - stride=(1 if down_sampling is False else 2), - pad=1 - ) - - if spatial_batch_norm: - self.add_spatial_bn(num_filters) - self.add_relu() - - last_conv = self.add_conv(num_filters, num_filters, kernel=3, pad=1) - if spatial_batch_norm: - last_conv = self.add_spatial_bn(num_filters) - - # Increase of dimensions, need a projection for the shortcut - if (num_filters != input_filters): - shortcut_blob = brew.conv( - self.model, - shortcut_blob, - 'shortcut_projection_%d' % self.comp_count, - input_filters, - num_filters, - weight_init=("MSRAFill", {}), - kernel=1, - stride=(1 if down_sampling is False else 2), - no_bias=self.no_bias, - ) - if spatial_batch_norm: - shortcut_blob = brew.spatial_bn( - self.model, - shortcut_blob, - 'shortcut_projection_%d_spatbn' % self.comp_count, - num_filters, - epsilon=1e-3, - is_test=self.is_test, - ) - - self.prev_blob = brew.sum( - self.model, [shortcut_blob, last_conv], - 'comp_%d_sum_%d' % (self.comp_count, self.comp_idx) - ) - self.comp_idx += 1 - self.add_relu() - - # Keep track of number of high level components if this ResNetBuilder - self.comp_count += 1 - - -def create_resnet_32x32( - model, data, num_input_channels, num_groups, num_labels, is_test=False -): - ''' - Create residual net for smaller images (sec 4.2 of He et. al (2015)) - num_groups = 'n' in the paper - ''' - # conv1 + maxpool - brew.conv( - model, data, 'conv1', num_input_channels, 16, kernel=3, stride=1 - ) - brew.spatial_bn( - model, 'conv1', 'conv1_spatbn', 16, epsilon=1e-3, is_test=is_test - ) - brew.relu(model, 'conv1_spatbn', 'relu1') - - # Number of blocks as described in sec 4.2 - filters = [16, 32, 64] - - builder = ResNetBuilder(model, 'relu1', no_bias=0, is_test=is_test) - prev_filters = 16 - for groupidx in range(0, 3): - for blockidx in range(0, 2 * num_groups): - builder.add_simple_block( - prev_filters if blockidx == 0 else filters[groupidx], - filters[groupidx], - down_sampling=(True if blockidx == 0 and - groupidx > 0 else False)) - prev_filters = filters[groupidx] - - # Final layers - brew.average_pool( - model, builder.prev_blob, 'final_avg', kernel=8, stride=1 - ) - brew.fc(model, 'final_avg', 'last_out', 64, num_labels) - softmax = brew.softmax(model, 'last_out', 'softmax') - return softmax - - -RESNEXT_BLOCK_CONFIG = { - 18: (2, 2, 2, 2), - 34: (3, 4, 6, 3), - 50: (3, 4, 6, 3), - 101: (3, 4, 23, 3), - 152: (3, 8, 36, 3), - 200: (3, 24, 36, 3), -} - -RESNEXT_STRIDES = [1, 2, 2, 2] - -logging.basicConfig() -log = logging.getLogger("resnext_builder") -log.setLevel(logging.DEBUG) - - -# The conv1 and final_avg kernel/stride args provide a basic mechanism for -# adapting resnet50 for different sizes of input images. 
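# [Editorial note, hedged: e.g. num_layers=50 selects the (3, 4, 6, 3) entry
# from RESNEXT_BLOCK_CONFIG above -- one bottleneck count per residual stage --
# applied with the stage strides (1, 2, 2, 2) from RESNEXT_STRIDES.]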
-def create_resnext( - model, - data, - num_input_channels, - num_labels, - num_layers, - num_groups, - num_width_per_group, - label=None, - is_test=False, - no_loss=False, - no_bias=1, - conv1_kernel=7, - conv1_stride=2, - final_avg_kernel=7, - log=None, - bn_epsilon=1e-5, - bn_momentum=0.9, -): - if num_layers not in RESNEXT_BLOCK_CONFIG: - log.error("{}-layer is invalid for resnext config".format(num_layers)) - - num_blocks = RESNEXT_BLOCK_CONFIG[num_layers] - strides = RESNEXT_STRIDES - num_filters = [64, 256, 512, 1024, 2048] - - if num_layers in [18, 34]: - num_filters = [64, 64, 128, 256, 512] - - # the number of features before the last FC layer - num_features = num_filters[-1] - - # conv1 + maxpool - conv_blob = brew.conv( - model, - data, - 'conv1', - num_input_channels, - num_filters[0], - weight_init=("MSRAFill", {}), - kernel=conv1_kernel, - stride=conv1_stride, - pad=3, - no_bias=no_bias - ) - - bn_blob = brew.spatial_bn( - model, - conv_blob, - 'conv1_spatbn_relu', - num_filters[0], - epsilon=bn_epsilon, - momentum=bn_momentum, - is_test=is_test - ) - relu_blob = brew.relu(model, bn_blob, bn_blob) - max_pool = brew.max_pool(model, relu_blob, 'pool1', kernel=3, stride=2, pad=1) - - # Residual blocks... - builder = ResNetBuilder(model, max_pool, no_bias=no_bias, - is_test=is_test, bn_epsilon=1e-5, bn_momentum=0.9) - - inner_dim = num_groups * num_width_per_group - - # 4 different kinds of residual blocks - for residual_idx in range(4): - residual_num = num_blocks[residual_idx] - residual_stride = strides[residual_idx] - dim_in = num_filters[residual_idx] - - for blk_idx in range(residual_num): - dim_in = builder.add_bottleneck( - dim_in, - inner_dim, - num_filters[residual_idx + 1], # dim out - stride=residual_stride if blk_idx == 0 else 1, - group=num_groups, - ) - - inner_dim *= 2 - - # Final layers - final_avg = brew.average_pool( - model, - builder.prev_blob, - 'final_avg', - kernel=final_avg_kernel, - stride=1, - global_pooling=True, - ) - - # Final dimension of the "image" is reduced to 7x7 - last_out = brew.fc( - model, final_avg, 'last_out_L{}'.format(num_labels), num_features, num_labels - ) - - if no_loss: - return last_out - - # If we create model for training, use softmax-with-loss - if (label is not None): - (softmax, loss) = model.SoftmaxWithLoss( - [last_out, label], - ["softmax", "loss"], - ) - - return (softmax, loss) - else: - # For inference, we just return softmax - return brew.softmax(model, last_out, "softmax") - - -# The conv1 and final_avg kernel/stride args provide a basic mechanism for -# adapting resnet50 for different sizes of input images. 
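# [Editorial usage sketch, hedged; the model name and the "gpu_0/data" blob are
# illustrative, mirroring the tests that exercise this builder:
#     from caffe2.python import cnn
#     model = cnn.CNNModelHelper(order="NCHW", name="example")
#     data = model.net.AddExternalInput("gpu_0/data")
#     # With label=None and no_loss=False this returns the softmax blob.
#     softmax = create_resnet50(model, data, num_input_channels=3,
#                               num_labels=1000)]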
-def create_resnet50( - model, - data, - num_input_channels, - num_labels, - label=None, - is_test=False, - no_loss=False, - no_bias=0, - conv1_kernel=7, - conv1_stride=2, - final_avg_kernel=7, -): - # resnet50 is a special case for ResNeXt50-1x64d - return create_resnext( - model, - data, - num_input_channels, - num_labels, - num_layers=50, - num_groups=1, - num_width_per_group=64, - label=label, - is_test=is_test, - no_loss=no_loss, - no_bias=no_bias, - conv1_kernel=conv1_kernel, - conv1_stride=conv1_stride, - final_avg_kernel=final_avg_kernel, - ) diff --git a/caffe2/python/models/resnet_test.py b/caffe2/python/models/resnet_test.py deleted file mode 100644 index d089dd72fcba..000000000000 --- a/caffe2/python/models/resnet_test.py +++ /dev/null @@ -1,62 +0,0 @@ - - - - - -import numpy as np - -import caffe2.python.models.resnet as resnet -import hypothesis.strategies as st -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.models.imagenet_trainer_test_utils as utils - -class ResnetMemongerTest(hu.HypothesisTestCase): - - @given(with_shapes=st.booleans(), **hu.gcs_cpu_only) - @settings(max_examples=2, deadline=None) - def test_resnet_shared_grads(self, with_shapes, gc, dc): - results = utils.test_shared_grads( - with_shapes, - resnet.create_resnet50, - 'gpu_0/conv1_w', - 'gpu_0/last_out_L1000' - ) - self.assertTrue(results[0][0] < results[0][1]) - np.testing.assert_almost_equal(results[1][0], results[1][1]) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - - def test_resnet_forward_only(self): - results = utils.test_forward_only( - resnet.create_resnet50, - 'gpu_0/last_out_L1000' - ) - self.assertTrue(results[0][0] < results[0][1]) - self.assertTrue(results[1] < 7 and results[1] > 0) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - - def test_resnet_forward_only_fast_simplenet(self): - ''' - Test C++ memonger that is only for simple nets - ''' - results = utils.test_forward_only_fast_simplenet( - resnet.create_resnet50, - 'gpu_0/last_out_L1000' - ) - - self.assertTrue(results[0][0] < results[0][1]) - self.assertTrue(results[1] < 4 and results[1] > 0) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - - -if __name__ == "__main__": - import unittest - import random - random.seed(2603) - # pyre-fixme[10]: Name `workspace` is used but not defined in the current scope - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_print_blob_sizes_at_exit=0', - '--caffe2_gpu_memory_tracking=1']) - unittest.main() diff --git a/caffe2/python/models/seq2seq/__init__.py b/caffe2/python/models/seq2seq/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/models/seq2seq/beam_search.py b/caffe2/python/models/seq2seq/beam_search.py deleted file mode 100644 index a94deb965e1b..000000000000 --- a/caffe2/python/models/seq2seq/beam_search.py +++ /dev/null @@ -1,493 +0,0 @@ -## @package beam_search -# Module caffe2.python.models.seq2seq.beam_search - - - - - -from collections import namedtuple -from caffe2.python import core -import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util -from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper - - -class BeamSearchForwardOnly: - """ - Class generalizing forward beam search for seq2seq models. 
- - Also provides types to specify the recurrent structure of decoding: - - StateConfig: - initial_value: blob providing value of state at first step_model - state_prev_link: LinkConfig describing how recurrent step receives - input from global state blob in each step - state_link: LinkConfig describing how step writes (produces new state) - to global state blob in each step - - LinkConfig: - blob: blob connecting global state blob to step application - offset: offset from beginning of global blob for link in time dimension - window: width of global blob to read/write in time dimension - """ - - LinkConfig = namedtuple('LinkConfig', ['blob', 'offset', 'window']) - - StateConfig = namedtuple( - 'StateConfig', - ['initial_value', 'state_prev_link', 'state_link'], - ) - - def __init__( - self, - beam_size, - model, - eos_token_id, - go_token_id=seq2seq_util.GO_ID, - post_eos_penalty=None, - ): - self.beam_size = beam_size - self.model = model - self.step_model = Seq2SeqModelHelper( - name='step_model', - param_model=self.model, - ) - self.go_token_id = go_token_id - self.eos_token_id = eos_token_id - self.post_eos_penalty = post_eos_penalty - - ( - self.timestep, - self.scores_t_prev, - self.tokens_t_prev, - self.hypo_t_prev, - self.attention_t_prev, - ) = self.step_model.net.AddExternalInputs( - 'timestep', - 'scores_t_prev', - 'tokens_t_prev', - 'hypo_t_prev', - 'attention_t_prev', - ) - tokens_t_prev_int32 = self.step_model.net.Cast( - self.tokens_t_prev, - 'tokens_t_prev_int32', - to=core.DataType.INT32, - ) - self.tokens_t_prev_int32_flattened, _ = self.step_model.net.Reshape( - [tokens_t_prev_int32], - [tokens_t_prev_int32, 'input_t_int32_old_shape'], - shape=[1, -1], - ) - - def get_step_model(self): - return self.step_model - - def get_previous_tokens(self): - return self.tokens_t_prev_int32_flattened - - def get_timestep(self): - return self.timestep - - # TODO: make attentions a generic state - # data_dependencies is a list of blobs that the operator should wait for - # before beginning execution. This ensures that ops are run in the correct - # order when the RecurrentNetwork op is embedded in a DAGNet, for ex. 
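Before apply below, the offset/window convention in these LinkConfigs is easiest to see on a concrete state; this sketch mirrors how the scores state is wired later in this same file:

    # The step net reads the previous value at offset 0 and writes the new
    # value at offset 1, each with a window of 1 along the time dimension.
    scores_state = BeamSearchForwardOnly.StateConfig(
        initial_value=initial_scores,  # blob holding the t=0 value
        state_prev_link=BeamSearchForwardOnly.LinkConfig(
            blob=scores_t_prev, offset=0, window=1),
        state_link=BeamSearchForwardOnly.LinkConfig(
            blob=scores_t, offset=1, window=1),
    )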
- def apply( - self, - inputs, - length, - log_probs, - attentions, - state_configs, - data_dependencies, - word_rewards=None, - possible_translation_tokens=None, - go_token_id=None, - ): - ZERO = self.model.param_init_net.ConstantFill( - [], - 'ZERO', - shape=[1], - value=0, - dtype=core.DataType.INT32, - ) - on_initial_step = self.step_model.net.EQ( - [ZERO, self.timestep], - 'on_initial_step', - ) - - if self.post_eos_penalty is not None: - eos_token = self.model.param_init_net.ConstantFill( - [], - 'eos_token', - shape=[self.beam_size], - value=self.eos_token_id, - dtype=core.DataType.INT32, - ) - finished_penalty = self.model.param_init_net.ConstantFill( - [], - 'finished_penalty', - shape=[1], - value=float(self.post_eos_penalty), - dtype=core.DataType.FLOAT, - ) - ZERO_FLOAT = self.model.param_init_net.ConstantFill( - [], - 'ZERO_FLOAT', - shape=[1], - value=0.0, - dtype=core.DataType.FLOAT, - ) - finished_penalty = self.step_model.net.Conditional( - [on_initial_step, ZERO_FLOAT, finished_penalty], - 'possible_finished_penalty', - ) - - tokens_t_flat = self.step_model.net.FlattenToVec( - self.tokens_t_prev, - 'tokens_t_flat', - ) - tokens_t_flat_int = self.step_model.net.Cast( - tokens_t_flat, - 'tokens_t_flat_int', - to=core.DataType.INT32, - ) - - predecessor_is_eos = self.step_model.net.EQ( - [tokens_t_flat_int, eos_token], - 'predecessor_is_eos', - ) - predecessor_is_eos_float = self.step_model.net.Cast( - predecessor_is_eos, - 'predecessor_is_eos_float', - to=core.DataType.FLOAT, - ) - predecessor_is_eos_penalty = self.step_model.net.Mul( - [predecessor_is_eos_float, finished_penalty], - 'predecessor_is_eos_penalty', - broadcast=1, - ) - - log_probs = self.step_model.net.Add( - [log_probs, predecessor_is_eos_penalty], - 'log_probs_penalized', - broadcast=1, - axis=0, - ) - - # [beam_size, beam_size] - best_scores_per_hypo, best_tokens_per_hypo = self.step_model.net.TopK( - log_probs, - ['best_scores_per_hypo', 'best_tokens_per_hypo_indices'], - k=self.beam_size, - ) - if possible_translation_tokens: - # [beam_size, beam_size] - best_tokens_per_hypo = self.step_model.net.Gather( - [possible_translation_tokens, best_tokens_per_hypo], - ['best_tokens_per_hypo'] - ) - - # [beam_size] - scores_t_prev_squeezed, _ = self.step_model.net.Reshape( - self.scores_t_prev, - ['scores_t_prev_squeezed', 'scores_t_prev_old_shape'], - shape=[self.beam_size], - ) - # [beam_size, beam_size] - output_scores = self.step_model.net.Add( - [best_scores_per_hypo, scores_t_prev_squeezed], - 'output_scores', - broadcast=1, - axis=0, - ) - if word_rewards is not None: - # [beam_size, beam_size] - word_rewards_for_best_tokens_per_hypo = self.step_model.net.Gather( - [word_rewards, best_tokens_per_hypo], - 'word_rewards_for_best_tokens_per_hypo', - ) - # [beam_size, beam_size] - output_scores = self.step_model.net.Add( - [output_scores, word_rewards_for_best_tokens_per_hypo], - 'output_scores', - ) - # [beam_size * beam_size] - output_scores_flattened, _ = self.step_model.net.Reshape( - [output_scores], - [output_scores, 'output_scores_old_shape'], - shape=[-1], - ) - MINUS_ONE_INT32 = self.model.param_init_net.ConstantFill( - [], - 'MINUS_ONE_INT32', - value=-1, - shape=[1], - dtype=core.DataType.INT32, - ) - BEAM_SIZE = self.model.param_init_net.ConstantFill( - [], - 'beam_size', - shape=[1], - value=self.beam_size, - dtype=core.DataType.INT32, - ) - - # current_beam_size (predecessor states from previous step) - # is 1 on first step (so we just need beam_size scores), - # and beam_size subsequently (so we 
need all beam_size * beam_size - # scores) - slice_end = self.step_model.net.Conditional( - [on_initial_step, BEAM_SIZE, MINUS_ONE_INT32], - ['slice_end'], - ) - - # [current_beam_size * beam_size] - output_scores_flattened_slice = self.step_model.net.Slice( - [output_scores_flattened, ZERO, slice_end], - 'output_scores_flattened_slice', - ) - # [1, current_beam_size * beam_size] - output_scores_flattened_slice, _ = self.step_model.net.Reshape( - output_scores_flattened_slice, - [ - output_scores_flattened_slice, - 'output_scores_flattened_slice_old_shape', - ], - shape=[1, -1], - ) - # [1, beam_size] - scores_t, best_indices = self.step_model.net.TopK( - output_scores_flattened_slice, - ['scores_t', 'best_indices'], - k=self.beam_size, - ) - BEAM_SIZE_64 = self.model.param_init_net.Cast( - BEAM_SIZE, - 'BEAM_SIZE_64', - to=core.DataType.INT64, - ) - # [1, beam_size] - hypo_t_int32 = self.step_model.net.Div( - [best_indices, BEAM_SIZE_64], - 'hypo_t_int32', - broadcast=1, - ) - hypo_t = self.step_model.net.Cast( - hypo_t_int32, - 'hypo_t', - to=core.DataType.FLOAT, - ) - - # [beam_size, encoder_length, 1] - attention_t = self.step_model.net.Gather( - [attentions, hypo_t_int32], - 'attention_t', - ) - # [1, beam_size, encoder_length] - attention_t, _ = self.step_model.net.Reshape( - attention_t, - [attention_t, 'attention_t_old_shape'], - shape=[1, self.beam_size, -1], - ) - # [beam_size * beam_size] - best_tokens_per_hypo_flatten, _ = self.step_model.net.Reshape( - best_tokens_per_hypo, - [ - 'best_tokens_per_hypo_flatten', - 'best_tokens_per_hypo_old_shape', - ], - shape=[-1], - ) - tokens_t_int32 = self.step_model.net.Gather( - [best_tokens_per_hypo_flatten, best_indices], - 'tokens_t_int32', - ) - tokens_t = self.step_model.net.Cast( - tokens_t_int32, - 'tokens_t', - to=core.DataType.FLOAT, - ) - - def choose_state_per_hypo(state_config): - state_flattened, _ = self.step_model.net.Reshape( - state_config.state_link.blob, - [ - state_config.state_link.blob, - state_config.state_link.blob + '_old_shape', - ], - shape=[self.beam_size, -1], - ) - state_chosen_per_hypo = self.step_model.net.Gather( - [state_flattened, hypo_t_int32], - str(state_config.state_link.blob) + '_chosen_per_hypo', - ) - return self.StateConfig( - initial_value=state_config.initial_value, - state_prev_link=state_config.state_prev_link, - state_link=self.LinkConfig( - blob=state_chosen_per_hypo, - offset=state_config.state_link.offset, - window=state_config.state_link.window, - ) - ) - state_configs = [choose_state_per_hypo(c) for c in state_configs] - initial_scores = self.model.param_init_net.ConstantFill( - [], - 'initial_scores', - shape=[1], - value=0.0, - dtype=core.DataType.FLOAT, - ) - if go_token_id: - initial_tokens = self.model.net.Copy( - [go_token_id], - 'initial_tokens', - ) - else: - initial_tokens = self.model.param_init_net.ConstantFill( - [], - 'initial_tokens', - shape=[1], - value=float(self.go_token_id), - dtype=core.DataType.FLOAT, - ) - - initial_hypo = self.model.param_init_net.ConstantFill( - [], - 'initial_hypo', - shape=[1], - value=0.0, - dtype=core.DataType.FLOAT, - ) - encoder_inputs_flattened, _ = self.model.net.Reshape( - inputs, - ['encoder_inputs_flattened', 'encoder_inputs_old_shape'], - shape=[-1], - ) - init_attention = self.model.net.ConstantFill( - encoder_inputs_flattened, - 'init_attention', - value=0.0, - dtype=core.DataType.FLOAT, - ) - state_configs = state_configs + [ - self.StateConfig( - initial_value=initial_scores, - state_prev_link=self.LinkConfig(self.scores_t_prev, 
0, 1), - state_link=self.LinkConfig(scores_t, 1, 1), - ), - self.StateConfig( - initial_value=initial_tokens, - state_prev_link=self.LinkConfig(self.tokens_t_prev, 0, 1), - state_link=self.LinkConfig(tokens_t, 1, 1), - ), - self.StateConfig( - initial_value=initial_hypo, - state_prev_link=self.LinkConfig(self.hypo_t_prev, 0, 1), - state_link=self.LinkConfig(hypo_t, 1, 1), - ), - self.StateConfig( - initial_value=init_attention, - state_prev_link=self.LinkConfig(self.attention_t_prev, 0, 1), - state_link=self.LinkConfig(attention_t, 1, 1), - ), - ] - fake_input = self.model.net.ConstantFill( - length, - 'beam_search_fake_input', - input_as_shape=True, - extra_shape=[self.beam_size, 1], - value=0.0, - dtype=core.DataType.FLOAT, - ) - all_inputs = ( - [fake_input] + - self.step_model.params + - [state_config.initial_value for state_config in state_configs] + - data_dependencies - ) - forward_links = [] - recurrent_states = [] - for state_config in state_configs: - state_name = str(state_config.state_prev_link.blob) + '_states' - recurrent_states.append(state_name) - forward_links.append(( - state_config.state_prev_link.blob, - state_name, - state_config.state_prev_link.offset, - state_config.state_prev_link.window, - )) - forward_links.append(( - state_config.state_link.blob, - state_name, - state_config.state_link.offset, - state_config.state_link.window, - )) - link_internal, link_external, link_offset, link_window = ( - zip(*forward_links) - ) - all_outputs = [ - str(s) + '_all' - for s in [scores_t, tokens_t, hypo_t, attention_t] - ] - results = self.model.net.RecurrentNetwork( - all_inputs, - all_outputs + ['step_workspaces'], - param=[all_inputs.index(p) for p in self.step_model.params], - alias_src=[ - str(s) + '_states' - for s in [ - self.scores_t_prev, - self.tokens_t_prev, - self.hypo_t_prev, - self.attention_t_prev, - ] - ], - alias_dst=all_outputs, - alias_offset=[0] * 4, - recurrent_states=recurrent_states, - initial_recurrent_state_ids=[ - all_inputs.index(state_config.initial_value) - for state_config in state_configs - ], - link_internal=[str(l) for l in link_internal], - link_external=[str(l) for l in link_external], - link_offset=link_offset, - link_window=link_window, - backward_link_internal=[], - backward_link_external=[], - backward_link_offset=[], - step_net=self.step_model.net.Proto(), - timestep=str(self.timestep), - outputs_with_grads=[], - enable_rnn_executor=1, - rnn_executor_debug=0 - ) - score_t_all, tokens_t_all, hypo_t_all, attention_t_all = results[:4] - - output_token_beam_list = self.model.net.Cast( - tokens_t_all, - 'output_token_beam_list', - to=core.DataType.INT32, - ) - output_prev_index_beam_list = self.model.net.Cast( - hypo_t_all, - 'output_prev_index_beam_list', - to=core.DataType.INT32, - ) - output_score_beam_list = self.model.net.Alias( - score_t_all, - 'output_score_beam_list', - ) - output_attention_weights_beam_list = self.model.net.Alias( - attention_t_all, - 'output_attention_weights_beam_list', - ) - - return ( - output_token_beam_list, - output_prev_index_beam_list, - output_score_beam_list, - output_attention_weights_beam_list, - ) diff --git a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py b/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py deleted file mode 100644 index c10d2f1ab4ed..000000000000 --- a/caffe2/python/models/seq2seq/seq2seq_beam_search_test.py +++ /dev/null @@ -1,215 +0,0 @@ - - - - - -import numpy as np -import os -import tempfile - -from caffe2.python import test_util, workspace -import 
caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util -from caffe2.python.models.seq2seq.train import Seq2SeqModelCaffe2 -from caffe2.python.models.seq2seq.translate import ( - Seq2SeqModelCaffe2EnsembleDecoder, -) - - -class Seq2SeqBeamSearchTest(test_util.TestCase): - - def _build_seq2seq_model( - self, - model_params, - tmp_dir, - source_vocab_size=20, - target_vocab_size=20, - num_gpus=0, - batch_size=2, - ): - training_params = dict( - model_params, - batch_size=batch_size, - optimizer_params=dict( - learning_rate=0.1, - ), - max_gradient_norm=1.0, - ) - - model_obj = Seq2SeqModelCaffe2( - training_params, - source_vocab_size, - target_vocab_size, - num_gpus, - ) - model_obj.initialize_from_scratch() - - checkpoint_path_prefix = os.path.join(tmp_dir, 'checkpoint') - checkpoint_path = model_obj.save( - checkpoint_path_prefix=checkpoint_path_prefix, - current_step=0, - ) - - return model_obj, checkpoint_path - - def _run_compare_train_inference(self, model_params): - tmp_dir = tempfile.mkdtemp() - - model_obj, checkpoint_path = self._build_seq2seq_model( - model_params, - tmp_dir=tmp_dir, - source_vocab_size=20, - target_vocab_size=20, - num_gpus=0, - batch_size=2, - ) - assert model_obj is not None - - translate_params = dict( - ensemble_models=[dict( - source_vocab={i: str(i) for i in range(20)}, - target_vocab={i: str(i) for i in range(20)}, - model_params=model_params, - model_file=checkpoint_path, - )], - decoding_params=dict( - beam_size=3, - word_reward=0, - unk_reward=0, - ), - ) - - beam_decoder_model = Seq2SeqModelCaffe2EnsembleDecoder(translate_params) - beam_decoder_model.load_models() - - encoder_lengths = 5 - decoder_lengths = 7 - - for _ in range(3): - encoder_inputs = np.random.random_integers( - low=3, # after GO_ID (1) and EOS_ID (2) - high=19, - size=encoder_lengths, - ) - targets, _, beam_model_score = beam_decoder_model.decode( - encoder_inputs, - decoder_lengths, - ) - targets_2, _, beam_model_score = beam_decoder_model.decode( - encoder_inputs, - decoder_lengths, - ) - self.assertEqual(targets, targets_2) - - workspace.FeedBlob( - 'encoder_inputs', - np.array( - [list(reversed(encoder_inputs))] - ).transpose().astype(dtype=np.int32)) - workspace.FeedBlob( - 'encoder_lengths', - np.array([len(encoder_inputs)]).astype(dtype=np.int32), - ) - decoder_inputs = [seq2seq_util.GO_ID] + targets[:-1] - workspace.FeedBlob( - 'decoder_inputs', - np.array([decoder_inputs]).transpose().astype(dtype=np.int32), - ) - workspace.FeedBlob( - 'decoder_lengths', - np.array([len(decoder_inputs)]).astype(dtype=np.int32), - ) - workspace.FeedBlob( - 'targets', - np.array([targets]).transpose().astype(dtype=np.int32), - ) - workspace.FeedBlob( - 'target_weights', - np.array([[1.0] * len(targets)]).astype(dtype=np.float32), - ) - - workspace.RunNet(model_obj.forward_net) - train_model_score = workspace.FetchBlob('total_loss_scalar') - - np.testing.assert_almost_equal( - beam_model_score, - train_model_score, - decimal=4, - ) - - def test_attention(self): - model_params = dict( - attention='regular', - decoder_layer_configs=[ - dict( - num_units=32, - ), - ], - encoder_type=dict( - encoder_layer_configs=[ - dict( - num_units=16, - ), - ], - use_bidirectional_encoder=True, - ), - encoder_embedding_size=8, - decoder_embedding_size=8, - decoder_softmax_size=None, - ) - self._run_compare_train_inference(model_params) - - def test_2layer_attention(self): - model_params = dict( - attention='regular', - decoder_layer_configs=[ - dict( - num_units=32, - ), - dict( - num_units=32, - ), - ], - 
encoder_type=dict( - encoder_layer_configs=[ - dict( - num_units=16, - ), - dict( - num_units=32, - ), - ], - use_bidirectional_encoder=True, - ), - encoder_embedding_size=8, - decoder_embedding_size=8, - decoder_softmax_size=None, - ) - self._run_compare_train_inference(model_params) - - def test_multi_decoder(self): - model_params = dict( - attention='regular', - decoder_layer_configs=[ - dict( - num_units=32, - ), - dict( - num_units=32, - ), - dict( - num_units=32, - ), - ], - encoder_type=dict( - encoder_layer_configs=[ - dict( - num_units=32, - ), - ], - use_bidirectional_encoder=False, - ), - encoder_embedding_size=8, - decoder_embedding_size=8, - decoder_softmax_size=None, - ) - self._run_compare_train_inference(model_params) diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper.py b/caffe2/python/models/seq2seq/seq2seq_model_helper.py deleted file mode 100644 index 4eedbde4ab0e..000000000000 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper.py +++ /dev/null @@ -1,82 +0,0 @@ -## @package seq2seq_model_helper -# Module caffe2.python.models.seq2seq.seq2seq_model_helper - - - - - -from caffe2.python import scope -from caffe2.python.model_helper import ModelHelper - - -class Seq2SeqModelHelper(ModelHelper): - - def __init__(self, init_params=True, **kwargs): - arg_scope = { - 'use_cudnn': kwargs.pop('use_cudnn', True), - 'cudnn_exhaustive_search': kwargs.pop('cudnn_exhaustive_search', False), - 'order': 'NHWC', - } - if kwargs.get('ws_nbytes_limit', None): - arg_scope['ws_nbytes_limit'] = kwargs.pop('ws_nbytes_limit') - - super().__init__(init_params=init_params, arg_scope=arg_scope, **kwargs) - self.non_trainable_params = [] - - def AddParam(self, name, init=None, init_value=None, trainable=True): - """Adds a parameter to the model's net and it's initializer if needed - - Args: - init: a tuple (<initialization_op_name>, <initialization_op_kwargs>) - init_value: int, float or str. 
Can be used instead of `init` as a - simple constant initializer - trainable: bool, whether to compute gradient for this param or not - """ - if init_value is not None: - assert init is None - assert type(init_value) in [int, float, str] - init = ('ConstantFill', dict( - shape=[1], - value=init_value, - )) - - if self.init_params: - param = self.param_init_net.__getattr__(init[0])( - [], - name, - **init[1] - ) - else: - param = self.net.AddExternalInput(name) - - if trainable: - self.params.append(param) - else: - self.non_trainable_params.append(param) - - return param - - def GetNonTrainableParams(self, namescope=None): - ''' - Returns the params in current namescope - ''' - if namescope is None: - namescope = scope.CurrentNameScope() - else: - if not namescope.endswith(scope._NAMESCOPE_SEPARATOR): - namescope += scope._NAMESCOPE_SEPARATOR - - if namescope == '': - return self.non_trainable_params[:] - else: - return [ - p for p in self.non_trainable_params - if p.GetNameScope() == namescope - ] - - def GetAllParams(self, namescope=None): - return ( - self.GetParams(namescope) + - self.GetComputedParams(namescope) + - self.GetNonTrainableParams(namescope) - ) diff --git a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py b/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py deleted file mode 100644 index b70b74d39dc9..000000000000 --- a/caffe2/python/models/seq2seq/seq2seq_model_helper_test.py +++ /dev/null @@ -1,70 +0,0 @@ - - - - - -from caffe2.python.models.seq2seq import seq2seq_model_helper -from caffe2.python import scope, test_util - - -class Seq2SeqModelHelperTest(test_util.TestCase): - def testConstuctor(self): - model_name = 'TestModel' - m = seq2seq_model_helper.Seq2SeqModelHelper(name=model_name) - - self.assertEqual(m.name, model_name) - self.assertEqual(m.init_params, True) - - self.assertEqual(m.arg_scope, { - 'use_cudnn': True, - 'cudnn_exhaustive_search': False, - 'order': 'NHWC' - }) - - def testAddParam(self): - m = seq2seq_model_helper.Seq2SeqModelHelper() - - param_name = 'test_param' - param = m.AddParam(param_name, init_value=1) - self.assertEqual(str(param), param_name) - - def testGetNonTrainableParams(self): - m = seq2seq_model_helper.Seq2SeqModelHelper() - - m.AddParam('test_param1', init_value=1, trainable=True) - p2 = m.AddParam('test_param2', init_value=2, trainable=False) - - self.assertEqual( - m.GetNonTrainableParams(), - [p2] - ) - - with scope.NameScope('A', reset=True): - p3 = m.AddParam('test_param3', init_value=3, trainable=False) - self.assertEqual( - m.GetNonTrainableParams(), - [p3] - ) - - self.assertEqual( - m.GetNonTrainableParams(), - [p2, p3] - ) - - def testGetAllParams(self): - m = seq2seq_model_helper.Seq2SeqModelHelper() - - p1 = m.AddParam('test_param1', init_value=1, trainable=True) - p2 = m.AddParam('test_param2', init_value=2, trainable=False) - - self.assertEqual( - m.GetAllParams(), - [p1, p2] - ) - - -if __name__ == "__main__": - import unittest - import random - random.seed(2221) - unittest.main() diff --git a/caffe2/python/models/seq2seq/seq2seq_util.py b/caffe2/python/models/seq2seq/seq2seq_util.py deleted file mode 100644 index 17187a7894c4..000000000000 --- a/caffe2/python/models/seq2seq/seq2seq_util.py +++ /dev/null @@ -1,671 +0,0 @@ -## @package seq2seq_util -# Module caffe2.python.examples.seq2seq_util -""" A bunch of util functions to build Seq2Seq models with Caffe2.""" - - - - - - -import collections - -import caffe2.proto.caffe2_pb2 as caffe2_pb2 -from caffe2.python import attention, core, rnn_cell, brew 
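A small sketch of the reserved-token convention established by the constants and helpers just below (the corpus path is hypothetical):

    # gen_vocab seeds the vocab with the four reserved tokens so that they
    # receive IDs 0..3, then admits every corpus token whose frequency
    # exceeds unk_threshold; get_numberized_sentence maps everything else
    # to UNK.
    vocab = gen_vocab('train.src.txt', unk_threshold=50)  # hypothetical file
    assert vocab[PAD] == PAD_ID and vocab[UNK] == UNK_ID
    ids = get_numberized_sentence('hello unseen_token', vocab)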
- - -PAD_ID = 0 -PAD = '<PAD>' -GO_ID = 1 -GO = '<GO>' -EOS_ID = 2 -EOS = '<EOS>' -UNK_ID = 3 -UNK = '<UNK>' - - -def gen_vocab(corpus, unk_threshold): - vocab = collections.defaultdict(lambda: len(vocab)) - freqs = collections.defaultdict(lambda: 0) - # Adding padding tokens to the vocabulary to maintain consistency with IDs - vocab[PAD] - vocab[GO] - vocab[EOS] - vocab[UNK] - - with open(corpus) as f: - for sentence in f: - tokens = sentence.strip().split() - for token in tokens: - freqs[token] += 1 - for token, freq in freqs.items(): - if freq > unk_threshold: - vocab[token] - - return vocab - - -def get_numberized_sentence(sentence, vocab): - numerized_sentence = [] - for token in sentence.strip().split(): - if token in vocab: - numerized_sentence.append(vocab[token]) - else: - numerized_sentence.append(vocab[UNK]) - return numerized_sentence - - -def rnn_unidirectional_layer( - model, - inputs, - input_lengths, - input_size, - num_units, - dropout_keep_prob, - forward_only, - return_sequence_output, - return_final_state, - scope=None, -): - """ Unidirectional LSTM encoder.""" - with core.NameScope(scope): - initial_cell_state = model.param_init_net.ConstantFill( - [], - 'initial_cell_state', - shape=[num_units], - value=0.0, - ) - initial_hidden_state = model.param_init_net.ConstantFill( - [], - 'initial_hidden_state', - shape=[num_units], - value=0.0, - ) - - cell = rnn_cell.LSTMCell( - input_size=input_size, - hidden_size=num_units, - forget_bias=0.0, - memory_optimization=False, - name=(scope + '/' if scope else '') + 'lstm', - forward_only=forward_only, - ) - - dropout_ratio = ( - None if dropout_keep_prob is None else (1.0 - dropout_keep_prob) - ) - if dropout_ratio is not None: - cell = rnn_cell.DropoutCell( - internal_cell=cell, - dropout_ratio=dropout_ratio, - name=(scope + '/' if scope else '') + 'dropout', - forward_only=forward_only, - is_test=False, - ) - - outputs_with_grads = [] - if return_sequence_output: - outputs_with_grads.append(0) - if return_final_state: - outputs_with_grads.extend([1, 3]) - - outputs, (_, final_hidden_state, _, final_cell_state) = ( - cell.apply_over_sequence( - model=model, - inputs=inputs, - seq_lengths=input_lengths, - initial_states=(initial_hidden_state, initial_cell_state), - outputs_with_grads=outputs_with_grads, - ) - ) - return outputs, final_hidden_state, final_cell_state - - -def rnn_bidirectional_layer( - model, - inputs, - input_lengths, - input_size, - num_units, - dropout_keep_prob, - forward_only, - return_sequence_output, - return_final_state, - scope=None, -): - outputs_fw, final_hidden_fw, final_cell_fw = rnn_unidirectional_layer( - model, - inputs, - input_lengths, - input_size, - num_units, - dropout_keep_prob, - forward_only, - return_sequence_output, - return_final_state, - scope=(scope + '/' if scope else '') + 'fw', - ) - with core.NameScope(scope): - reversed_inputs = model.net.ReversePackedSegs( - [inputs, input_lengths], - ['reversed_inputs'], - ) - outputs_bw, final_hidden_bw, final_cell_bw = rnn_unidirectional_layer( - model, - reversed_inputs, - input_lengths, - input_size, - num_units, - dropout_keep_prob, - forward_only, - return_sequence_output, - return_final_state, - scope=(scope + '/' if scope else '') + 'bw', - ) - with core.NameScope(scope): - outputs_bw = model.net.ReversePackedSegs( - [outputs_bw, input_lengths], - ['outputs_bw'], - ) - - # Concatenate forward and backward results - if return_sequence_output: - with core.NameScope(scope): - outputs, _ = model.net.Concat( - [outputs_fw, outputs_bw], - ['outputs', 
'outputs_dim'], - axis=2, - ) - else: - outputs = None - - if return_final_state: - with core.NameScope(scope): - final_hidden_state, _ = model.net.Concat( - [final_hidden_fw, final_hidden_bw], - ['final_hidden_state', 'final_hidden_state_dim'], - axis=2, - ) - final_cell_state, _ = model.net.Concat( - [final_cell_fw, final_cell_bw], - ['final_cell_state', 'final_cell_state_dim'], - axis=2, - ) - else: - final_hidden_state = None - final_cell_state = None - - return outputs, final_hidden_state, final_cell_state - - -def build_embeddings( - model, - vocab_size, - embedding_size, - name, - freeze_embeddings, -): - embeddings = model.param_init_net.GaussianFill( - [], - name, - shape=[vocab_size, embedding_size], - std=0.1, - ) - if not freeze_embeddings: - model.params.append(embeddings) - return embeddings - - -def get_layer_scope(scope, layer_type, i): - prefix = (scope + '/' if scope else '') + layer_type - return '{}/layer{}'.format(prefix, i) - - -def build_embedding_encoder( - model, - encoder_params, - num_decoder_layers, - inputs, - input_lengths, - vocab_size, - embeddings, - embedding_size, - use_attention, - num_gpus=0, - forward_only=False, - scope=None, -): - with core.NameScope(scope or ''): - if num_gpus == 0: - embedded_encoder_inputs = model.net.Gather( - [embeddings, inputs], - ['embedded_encoder_inputs'], - ) - else: - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - embedded_encoder_inputs_cpu = model.net.Gather( - [embeddings, inputs], - ['embedded_encoder_inputs_cpu'], - ) - embedded_encoder_inputs = model.CopyCPUToGPU( - embedded_encoder_inputs_cpu, - 'embedded_encoder_inputs', - ) - - layer_inputs = embedded_encoder_inputs - layer_input_size = embedding_size - encoder_units_per_layer = [] - final_encoder_hidden_states = [] - final_encoder_cell_states = [] - - num_encoder_layers = len(encoder_params['encoder_layer_configs']) - use_bidirectional_encoder = encoder_params.get( - 'use_bidirectional_encoder', - False, - ) - - for i, layer_config in enumerate(encoder_params['encoder_layer_configs']): - - if use_bidirectional_encoder and i == 0: - layer_func = rnn_bidirectional_layer - output_dims = 2 * layer_config['num_units'] - else: - layer_func = rnn_unidirectional_layer - output_dims = layer_config['num_units'] - encoder_units_per_layer.append(output_dims) - - is_final_layer = (i == num_encoder_layers - 1) - - dropout_keep_prob = layer_config.get( - 'dropout_keep_prob', - None, - ) - - return_final_state = i >= (num_encoder_layers - num_decoder_layers) - ( - layer_outputs, - final_layer_hidden_state, - final_layer_cell_state, - ) = layer_func( - model=model, - inputs=layer_inputs, - input_lengths=input_lengths, - input_size=layer_input_size, - num_units=layer_config['num_units'], - dropout_keep_prob=dropout_keep_prob, - forward_only=forward_only, - return_sequence_output=(not is_final_layer) or use_attention, - return_final_state=return_final_state, - scope=get_layer_scope(scope, 'encoder', i), - ) - - if not is_final_layer: - layer_inputs = layer_outputs - layer_input_size = output_dims - final_encoder_hidden_states.append(final_layer_hidden_state) - final_encoder_cell_states.append(final_layer_cell_state) - - encoder_outputs = layer_outputs - weighted_encoder_outputs = None - - return ( - encoder_outputs, - weighted_encoder_outputs, - final_encoder_hidden_states, - final_encoder_cell_states, - encoder_units_per_layer, - ) - - -class LSTMWithAttentionDecoder: - - def scope(self, name): - return self.name + '/' + name if self.name is not None else name - - 
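Stepping back to build_embedding_encoder just above: the encoder_params dict it consumes has the shape used by the deleted tests earlier in this diff (the values here are illustrative):

    encoder_params = {
        # Only layer 0 runs bidirectionally; its output width is then
        # 2 * num_units, which is what encoder_units_per_layer records.
        'use_bidirectional_encoder': True,
        'encoder_layer_configs': [
            {'num_units': 16},
            {'num_units': 32, 'dropout_keep_prob': 0.9},  # dropout optional
        ],
    }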
def _get_attention_type(self, attention_type_as_string): - if attention_type_as_string == 'regular': - return attention.AttentionType.Regular - elif attention_type_as_string == 'recurrent': - return attention.AttentionType.Recurrent - else: - assert False, 'Unknown type ' + attention_type_as_string - - def __init__( - self, - encoder_outputs, - encoder_output_dim, - encoder_lengths, - vocab_size, - attention_type, - embedding_size, - decoder_num_units, - decoder_cells, - residual_output_layers=None, - name=None, - weighted_encoder_outputs=None, - ): - self.name = name - self.num_layers = len(decoder_cells) - if attention_type == 'none': - self.cell = rnn_cell.MultiRNNCell( - decoder_cells, - name=self.scope('decoder'), - residual_output_layers=residual_output_layers, - ) - self.use_attention = False - self.decoder_output_dim = decoder_num_units - self.output_indices = self.cell.output_indices - else: - decoder_cell = rnn_cell.MultiRNNCell( - decoder_cells, - name=self.scope('decoder'), - residual_output_layers=residual_output_layers, - ) - self.cell = rnn_cell.AttentionCell( - encoder_output_dim=encoder_output_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=encoder_lengths, - decoder_cell=decoder_cell, - decoder_state_dim=decoder_num_units, - name=self.scope('attention_decoder'), - attention_type=self._get_attention_type(attention_type), - weighted_encoder_outputs=weighted_encoder_outputs, - attention_memory_optimization=True, - ) - self.use_attention = True - self.decoder_output_dim = decoder_num_units + encoder_output_dim - - self.output_indices = decoder_cell.output_indices - self.output_indices.append(2 * self.num_layers) - - def get_state_names(self): - return self.cell.get_state_names() - - def get_outputs_with_grads(self): - # sequence (all) output locations are at twice their state index - return [2 * i for i in self.output_indices] - - def get_output_dim(self): - return self.decoder_output_dim - - def get_attention_weights(self): - assert self.use_attention - # [batch_size, encoder_length, 1] - return self.cell.get_attention_weights() - - def apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - ): - return self.cell.apply( - model=model, - input_t=input_t, - seq_lengths=seq_lengths, - states=states, - timestep=timestep, - ) - - def apply_over_sequence( - self, - model, - inputs, - seq_lengths, - initial_states, - ): - return self.cell.apply_over_sequence( - model=model, - inputs=inputs, - seq_lengths=seq_lengths, - initial_states=initial_states, - outputs_with_grads=self.get_outputs_with_grads(), - ) - - -def build_initial_rnn_decoder_states( - model, - encoder_units_per_layer, - decoder_units_per_layer, - final_encoder_hidden_states, - final_encoder_cell_states, - use_attention, -): - num_encoder_layers = len(encoder_units_per_layer) - num_decoder_layers = len(decoder_units_per_layer) - if num_encoder_layers > num_decoder_layers: - offset = num_encoder_layers - num_decoder_layers - else: - offset = 0 - - initial_states = [] - for i, decoder_num_units in enumerate(decoder_units_per_layer): - - if ( - final_encoder_hidden_states and - len(final_encoder_hidden_states) > (i + offset) - ): - final_encoder_hidden_state = final_encoder_hidden_states[i + offset] - else: - final_encoder_hidden_state = None - - if final_encoder_hidden_state is None: - decoder_initial_hidden_state = model.param_init_net.ConstantFill( - [], - 'decoder_initial_hidden_state_{}'.format(i), - shape=[decoder_num_units], - value=0.0, - ) - 
model.params.append(decoder_initial_hidden_state) - elif decoder_num_units != encoder_units_per_layer[i + offset]: - decoder_initial_hidden_state = brew.fc( - model, - final_encoder_hidden_state, - 'decoder_initial_hidden_state_{}'.format(i), - encoder_units_per_layer[i + offset], - decoder_num_units, - axis=2, - ) - else: - decoder_initial_hidden_state = final_encoder_hidden_state - initial_states.append(decoder_initial_hidden_state) - - if ( - final_encoder_cell_states and - len(final_encoder_cell_states) > (i + offset) - ): - final_encoder_cell_state = final_encoder_cell_states[i + offset] - else: - final_encoder_cell_state = None - - if final_encoder_cell_state is None: - decoder_initial_cell_state = model.param_init_net.ConstantFill( - [], - 'decoder_initial_cell_state_{}'.format(i), - shape=[decoder_num_units], - value=0.0, - ) - model.params.append(decoder_initial_cell_state) - elif decoder_num_units != encoder_units_per_layer[i + offset]: - decoder_initial_cell_state = brew.fc( - model, - final_encoder_cell_state, - 'decoder_initial_cell_state_{}'.format(i), - encoder_units_per_layer[i + offset], - decoder_num_units, - axis=2, - ) - else: - decoder_initial_cell_state = final_encoder_cell_state - initial_states.append(decoder_initial_cell_state) - - if use_attention: - initial_attention_weighted_encoder_context = ( - model.param_init_net.ConstantFill( - [], - 'initial_attention_weighted_encoder_context', - shape=[encoder_units_per_layer[-1]], - value=0.0, - ) - ) - model.params.append(initial_attention_weighted_encoder_context) - initial_states.append(initial_attention_weighted_encoder_context) - - return initial_states - - -def build_embedding_decoder( - model, - decoder_layer_configs, - inputs, - input_lengths, - encoder_lengths, - encoder_outputs, - weighted_encoder_outputs, - final_encoder_hidden_states, - final_encoder_cell_states, - encoder_units_per_layer, - vocab_size, - embeddings, - embedding_size, - attention_type, - forward_only, - num_gpus=0, - scope=None, -): - with core.NameScope(scope or ''): - if num_gpus == 0: - embedded_decoder_inputs = model.net.Gather( - [embeddings, inputs], - ['embedded_decoder_inputs'], - ) - else: - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - embedded_decoder_inputs_cpu = model.net.Gather( - [embeddings, inputs], - ['embedded_decoder_inputs_cpu'], - ) - embedded_decoder_inputs = model.CopyCPUToGPU( - embedded_decoder_inputs_cpu, - 'embedded_decoder_inputs', - ) - - decoder_cells = [] - decoder_units_per_layer = [] - for i, layer_config in enumerate(decoder_layer_configs): - num_units = layer_config['num_units'] - decoder_units_per_layer.append(num_units) - - if i == 0: - input_size = embedding_size - else: - input_size = decoder_cells[-1].get_output_dim() - - cell = rnn_cell.LSTMCell( - forward_only=forward_only, - input_size=input_size, - hidden_size=num_units, - forget_bias=0.0, - memory_optimization=False, - ) - - dropout_keep_prob = layer_config.get('dropout_keep_prob', None) - if dropout_keep_prob is not None: - dropout_ratio = 1.0 - layer_config.dropout_keep_prob - cell = rnn_cell.DropoutCell( - internal_cell=cell, - dropout_ratio=dropout_ratio, - forward_only=forward_only, - is_test=False, - name=get_layer_scope(scope, 'decoder_dropout', i), - ) - - decoder_cells.append(cell) - - states = build_initial_rnn_decoder_states( - model=model, - encoder_units_per_layer=encoder_units_per_layer, - decoder_units_per_layer=decoder_units_per_layer, - final_encoder_hidden_states=final_encoder_hidden_states, - 
final_encoder_cell_states=final_encoder_cell_states, - use_attention=(attention_type != 'none'), - ) - attention_decoder = LSTMWithAttentionDecoder( - encoder_outputs=encoder_outputs, - encoder_output_dim=encoder_units_per_layer[-1], - encoder_lengths=encoder_lengths, - vocab_size=vocab_size, - attention_type=attention_type, - embedding_size=embedding_size, - decoder_num_units=decoder_units_per_layer[-1], - decoder_cells=decoder_cells, - weighted_encoder_outputs=weighted_encoder_outputs, - name=scope, - ) - decoder_outputs, _ = attention_decoder.apply_over_sequence( - model=model, - inputs=embedded_decoder_inputs, - seq_lengths=input_lengths, - initial_states=states, - ) - - # we do softmax over the whole sequence - # (max_length in the batch * batch_size) x decoder embedding size - # -1 because we don't know max_length yet - decoder_outputs_flattened, _ = model.net.Reshape( - [decoder_outputs], - [ - 'decoder_outputs_flattened', - 'decoder_outputs_and_contexts_combination_old_shape', - ], - shape=[-1, attention_decoder.get_output_dim()], - ) - - decoder_outputs = decoder_outputs_flattened - decoder_output_dim = attention_decoder.get_output_dim() - - return (decoder_outputs, decoder_output_dim) - - -def output_projection( - model, - decoder_outputs, - decoder_output_size, - target_vocab_size, - decoder_softmax_size, -): - if decoder_softmax_size is not None: - decoder_outputs = brew.fc( - model, - decoder_outputs, - 'decoder_outputs_scaled', - dim_in=decoder_output_size, - dim_out=decoder_softmax_size, - ) - decoder_output_size = decoder_softmax_size - - output_projection_w = model.param_init_net.XavierFill( - [], - 'output_projection_w', - shape=[target_vocab_size, decoder_output_size], - ) - - output_projection_b = model.param_init_net.XavierFill( - [], - 'output_projection_b', - shape=[target_vocab_size], - ) - model.params.extend([ - output_projection_w, - output_projection_b, - ]) - output_logits = model.net.FC( - [ - decoder_outputs, - output_projection_w, - output_projection_b, - ], - ['output_logits'], - ) - return output_logits diff --git a/caffe2/python/models/seq2seq/train.py b/caffe2/python/models/seq2seq/train.py deleted file mode 100644 index 95a3d3485ab7..000000000000 --- a/caffe2/python/models/seq2seq/train.py +++ /dev/null @@ -1,769 +0,0 @@ -## @package train -# Module caffe2.python.models.seq2seq.train - - - - - -import argparse -import collections -import logging -import math -import numpy as np -import random -import time -import sys -import os - -import caffe2.proto.caffe2_pb2 as caffe2_pb2 -from caffe2.python import core, workspace, data_parallel_model -import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util -from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper - - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stderr)) - -Batch = collections.namedtuple('Batch', [ - 'encoder_inputs', - 'encoder_lengths', - 'decoder_inputs', - 'decoder_lengths', - 'targets', - 'target_weights', -]) - - -def prepare_batch(batch): - encoder_lengths = [len(entry[0]) for entry in batch] - max_encoder_length = max(encoder_lengths) - decoder_lengths = [] - max_decoder_length = max([len(entry[1]) for entry in batch]) - - batch_encoder_inputs = [] - batch_decoder_inputs = [] - batch_targets = [] - batch_target_weights = [] - - for source_seq, target_seq in batch: - encoder_pads = ( - [seq2seq_util.PAD_ID] * (max_encoder_length - len(source_seq)) - ) - batch_encoder_inputs.append( - 
list(reversed(source_seq)) + encoder_pads - ) - - decoder_pads = ( - [seq2seq_util.PAD_ID] * (max_decoder_length - len(target_seq)) - ) - target_seq_with_go_token = [seq2seq_util.GO_ID] + target_seq - decoder_lengths.append(len(target_seq_with_go_token)) - batch_decoder_inputs.append(target_seq_with_go_token + decoder_pads) - - target_seq_with_eos = target_seq + [seq2seq_util.EOS_ID] - targets = target_seq_with_eos + decoder_pads - batch_targets.append(targets) - - if len(source_seq) + len(target_seq) == 0: - target_weights = [0] * len(targets) - else: - target_weights = [ - 1 if target != seq2seq_util.PAD_ID else 0 - for target in targets - ] - batch_target_weights.append(target_weights) - - return Batch( - encoder_inputs=np.array( - batch_encoder_inputs, - dtype=np.int32, - ).transpose(), - encoder_lengths=np.array(encoder_lengths, dtype=np.int32), - decoder_inputs=np.array( - batch_decoder_inputs, - dtype=np.int32, - ).transpose(), - decoder_lengths=np.array(decoder_lengths, dtype=np.int32), - targets=np.array( - batch_targets, - dtype=np.int32, - ).transpose(), - target_weights=np.array( - batch_target_weights, - dtype=np.float32, - ).transpose(), - ) - - -class Seq2SeqModelCaffe2: - - def _build_model( - self, - init_params, - ): - model = Seq2SeqModelHelper(init_params=init_params) - self._build_shared(model) - self._build_embeddings(model) - - forward_model = Seq2SeqModelHelper(init_params=init_params) - self._build_shared(forward_model) - self._build_embeddings(forward_model) - - if self.num_gpus == 0: - loss_blobs = self.model_build_fun(model) - model.AddGradientOperators(loss_blobs) - self.norm_clipped_grad_update( - model, - scope='norm_clipped_grad_update' - ) - self.forward_model_build_fun(forward_model) - - else: - assert (self.batch_size % self.num_gpus) == 0 - - data_parallel_model.Parallelize_GPU( - forward_model, - input_builder_fun=lambda m: None, - forward_pass_builder_fun=self.forward_model_build_fun, - param_update_builder_fun=None, - devices=list(range(self.num_gpus)), - ) - - def clipped_grad_update_bound(model): - self.norm_clipped_grad_update( - model, - scope='norm_clipped_grad_update', - ) - - data_parallel_model.Parallelize_GPU( - model, - input_builder_fun=lambda m: None, - forward_pass_builder_fun=self.model_build_fun, - param_update_builder_fun=clipped_grad_update_bound, - devices=list(range(self.num_gpus)), - ) - self.norm_clipped_sparse_grad_update( - model, - scope='norm_clipped_sparse_grad_update', - ) - self.model = model - self.forward_net = forward_model.net - - def _build_shared(self, model): - optimizer_params = self.model_params['optimizer_params'] - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - self.learning_rate = model.AddParam( - name='learning_rate', - init_value=float(optimizer_params['learning_rate']), - trainable=False, - ) - self.global_step = model.AddParam( - name='global_step', - init_value=0, - trainable=False, - ) - self.start_time = model.AddParam( - name='start_time', - init_value=time.time(), - trainable=False, - ) - - def _build_embeddings(self, model): - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - sqrt3 = math.sqrt(3) - self.encoder_embeddings = model.param_init_net.UniformFill( - [], - 'encoder_embeddings', - shape=[ - self.source_vocab_size, - self.model_params['encoder_embedding_size'], - ], - min=-sqrt3, - max=sqrt3, - ) - model.params.append(self.encoder_embeddings) - self.decoder_embeddings = model.param_init_net.UniformFill( - [], - 'decoder_embeddings', - shape=[ - self.target_vocab_size, - 
self.model_params['decoder_embedding_size'], - ], - min=-sqrt3, - max=sqrt3, - ) - model.params.append(self.decoder_embeddings) - - def model_build_fun(self, model, forward_only=False, loss_scale=None): - encoder_inputs = model.net.AddExternalInput( - workspace.GetNameScope() + 'encoder_inputs', - ) - encoder_lengths = model.net.AddExternalInput( - workspace.GetNameScope() + 'encoder_lengths', - ) - decoder_inputs = model.net.AddExternalInput( - workspace.GetNameScope() + 'decoder_inputs', - ) - decoder_lengths = model.net.AddExternalInput( - workspace.GetNameScope() + 'decoder_lengths', - ) - targets = model.net.AddExternalInput( - workspace.GetNameScope() + 'targets', - ) - target_weights = model.net.AddExternalInput( - workspace.GetNameScope() + 'target_weights', - ) - attention_type = self.model_params['attention'] - assert attention_type in ['none', 'regular', 'dot'] - - ( - encoder_outputs, - weighted_encoder_outputs, - final_encoder_hidden_states, - final_encoder_cell_states, - encoder_units_per_layer, - ) = seq2seq_util.build_embedding_encoder( - model=model, - encoder_params=self.encoder_params, - num_decoder_layers=len(self.model_params['decoder_layer_configs']), - inputs=encoder_inputs, - input_lengths=encoder_lengths, - vocab_size=self.source_vocab_size, - embeddings=self.encoder_embeddings, - embedding_size=self.model_params['encoder_embedding_size'], - use_attention=(attention_type != 'none'), - num_gpus=self.num_gpus, - ) - - ( - decoder_outputs, - decoder_output_size, - ) = seq2seq_util.build_embedding_decoder( - model, - decoder_layer_configs=self.model_params['decoder_layer_configs'], - inputs=decoder_inputs, - input_lengths=decoder_lengths, - encoder_lengths=encoder_lengths, - encoder_outputs=encoder_outputs, - weighted_encoder_outputs=weighted_encoder_outputs, - final_encoder_hidden_states=final_encoder_hidden_states, - final_encoder_cell_states=final_encoder_cell_states, - encoder_units_per_layer=encoder_units_per_layer, - vocab_size=self.target_vocab_size, - embeddings=self.decoder_embeddings, - embedding_size=self.model_params['decoder_embedding_size'], - attention_type=attention_type, - forward_only=False, - num_gpus=self.num_gpus, - ) - - output_logits = seq2seq_util.output_projection( - model=model, - decoder_outputs=decoder_outputs, - decoder_output_size=decoder_output_size, - target_vocab_size=self.target_vocab_size, - decoder_softmax_size=self.model_params['decoder_softmax_size'], - ) - targets, _ = model.net.Reshape( - [targets], - ['targets', 'targets_old_shape'], - shape=[-1], - ) - target_weights, _ = model.net.Reshape( - [target_weights], - ['target_weights', 'target_weights_old_shape'], - shape=[-1], - ) - _, loss_per_word = model.net.SoftmaxWithLoss( - [output_logits, targets, target_weights], - ['OutputProbs_INVALID', 'loss_per_word'], - only_loss=True, - ) - - num_words = model.net.SumElements( - [target_weights], - 'num_words', - ) - total_loss_scalar = model.net.Mul( - [loss_per_word, num_words], - 'total_loss_scalar', - ) - total_loss_scalar_weighted = model.net.Scale( - [total_loss_scalar], - 'total_loss_scalar_weighted', - scale=1.0 / self.batch_size, - ) - return [total_loss_scalar_weighted] - - def forward_model_build_fun(self, model, loss_scale=None): - return self.model_build_fun( - model=model, - forward_only=True, - loss_scale=loss_scale - ) - - def _calc_norm_ratio(self, model, params, scope, ONE): - with core.NameScope(scope): - grad_squared_sums = [] - for i, param in enumerate(params): - logger.info(param) - grad = ( - 
model.param_to_grad[param] - if not isinstance( - model.param_to_grad[param], - core.GradientSlice, - ) else model.param_to_grad[param].values - ) - grad_squared = model.net.Sqr( - [grad], - 'grad_{}_squared'.format(i), - ) - grad_squared_sum = model.net.SumElements( - grad_squared, - 'grad_{}_squared_sum'.format(i), - ) - grad_squared_sums.append(grad_squared_sum) - - grad_squared_full_sum = model.net.Sum( - grad_squared_sums, - 'grad_squared_full_sum', - ) - global_norm = model.net.Pow( - grad_squared_full_sum, - 'global_norm', - exponent=0.5, - ) - clip_norm = model.param_init_net.ConstantFill( - [], - 'clip_norm', - shape=[], - value=float(self.model_params['max_gradient_norm']), - ) - max_norm = model.net.Max( - [global_norm, clip_norm], - 'max_norm', - ) - norm_ratio = model.net.Div( - [clip_norm, max_norm], - 'norm_ratio', - ) - return norm_ratio - - def _apply_norm_ratio( - self, norm_ratio, model, params, learning_rate, scope, ONE - ): - for param in params: - param_grad = model.param_to_grad[param] - nlr = model.net.Negative( - [learning_rate], - 'negative_learning_rate', - ) - with core.NameScope(scope): - update_coeff = model.net.Mul( - [nlr, norm_ratio], - 'update_coeff', - broadcast=1, - ) - if isinstance(param_grad, core.GradientSlice): - param_grad_values = param_grad.values - - model.net.ScatterWeightedSum( - [ - param, - ONE, - param_grad.indices, - param_grad_values, - update_coeff, - ], - param, - ) - else: - model.net.WeightedSum( - [ - param, - ONE, - param_grad, - update_coeff, - ], - param, - ) - - def norm_clipped_grad_update(self, model, scope): - - if self.num_gpus == 0: - learning_rate = self.learning_rate - else: - learning_rate = model.CopyCPUToGPU(self.learning_rate, 'LR') - - params = [] - for param in model.GetParams(top_scope=True): - if param in model.param_to_grad: - if not isinstance( - model.param_to_grad[param], - core.GradientSlice, - ): - params.append(param) - - ONE = model.param_init_net.ConstantFill( - [], - 'ONE', - shape=[1], - value=1.0, - ) - logger.info('Dense trainable variables: ') - norm_ratio = self._calc_norm_ratio(model, params, scope, ONE) - self._apply_norm_ratio( - norm_ratio, model, params, learning_rate, scope, ONE - ) - - def norm_clipped_sparse_grad_update(self, model, scope): - learning_rate = self.learning_rate - - params = [] - for param in model.GetParams(top_scope=True): - if param in model.param_to_grad: - if isinstance( - model.param_to_grad[param], - core.GradientSlice, - ): - params.append(param) - - ONE = model.param_init_net.ConstantFill( - [], - 'ONE', - shape=[1], - value=1.0, - ) - logger.info('Sparse trainable variables: ') - norm_ratio = self._calc_norm_ratio(model, params, scope, ONE) - self._apply_norm_ratio( - norm_ratio, model, params, learning_rate, scope, ONE - ) - - def total_loss_scalar(self): - if self.num_gpus == 0: - return workspace.FetchBlob('total_loss_scalar') - else: - total_loss = 0 - for i in range(self.num_gpus): - name = 'gpu_{}/total_loss_scalar'.format(i) - gpu_loss = workspace.FetchBlob(name) - total_loss += gpu_loss - return total_loss - - def _init_model(self): - workspace.RunNetOnce(self.model.param_init_net) - - def create_net(net): - workspace.CreateNet( - net, - input_blobs=[str(i) for i in net.external_inputs], - ) - - create_net(self.model.net) - create_net(self.forward_net) - - def __init__( - self, - model_params, - source_vocab_size, - target_vocab_size, - num_gpus=1, - num_cpus=1, - ): - self.model_params = model_params - self.encoder_type = 'rnn' - self.encoder_params = 
model_params['encoder_type'] - self.source_vocab_size = source_vocab_size - self.target_vocab_size = target_vocab_size - self.num_gpus = num_gpus - self.num_cpus = num_cpus - self.batch_size = model_params['batch_size'] - - workspace.GlobalInit([ - 'caffe2', - # NOTE: modify log level for debugging purposes - '--caffe2_log_level=0', - # NOTE: modify log level for debugging purposes - '--v=0', - # Fail gracefully if one of the threads fails - '--caffe2_handle_executor_threads_exceptions=1', - '--caffe2_mkl_num_threads=' + str(self.num_cpus), - ]) - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - workspace.ResetWorkspace() - - def initialize_from_scratch(self): - logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Start') - self._build_model(init_params=True) - self._init_model() - logger.info('Initializing Seq2SeqModelCaffe2 from scratch: Finish') - - def get_current_step(self): - return workspace.FetchBlob(self.global_step)[0] - - def inc_current_step(self): - workspace.FeedBlob( - self.global_step, - np.array([self.get_current_step() + 1]), - ) - - def step( - self, - batch, - forward_only - ): - if self.num_gpus < 1: - batch_obj = prepare_batch(batch) - for batch_obj_name, batch_obj_value in zip( - Batch._fields, - batch_obj, - ): - workspace.FeedBlob(batch_obj_name, batch_obj_value) - else: - for i in range(self.num_gpus): - gpu_batch = batch[i::self.num_gpus] - batch_obj = prepare_batch(gpu_batch) - for batch_obj_name, batch_obj_value in zip( - Batch._fields, - batch_obj, - ): - name = 'gpu_{}/{}'.format(i, batch_obj_name) - if batch_obj_name in ['encoder_inputs', 'decoder_inputs']: - dev = core.DeviceOption(caffe2_pb2.CPU) - else: - dev = core.DeviceOption(workspace.GpuDeviceType, i) - workspace.FeedBlob(name, batch_obj_value, device_option=dev) - - if forward_only: - workspace.RunNet(self.forward_net) - else: - workspace.RunNet(self.model.net) - self.inc_current_step() - - return self.total_loss_scalar() - - def save(self, checkpoint_path_prefix, current_step): - checkpoint_path = '{0}-{1}'.format( - checkpoint_path_prefix, - current_step, - ) - - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Save', - self.model.GetAllParams(), - [], - absolute_path=True, - db=checkpoint_path, - db_type='minidb', - )) - - checkpoint_config_path = os.path.join( - os.path.dirname(checkpoint_path_prefix), - 'checkpoint', - ) - with open(checkpoint_config_path, 'w') as checkpoint_config_file: - checkpoint_config_file.write( - 'model_checkpoint_path: "' + checkpoint_path + '"\n' - 'all_model_checkpoint_paths: "' + checkpoint_path + '"\n' - ) - logger.info('Saved checkpoint file to ' + checkpoint_path) - - return checkpoint_path - -def gen_batches(source_corpus, target_corpus, source_vocab, target_vocab, - batch_size, max_length): - with open(source_corpus) as source, open(target_corpus) as target: - parallel_sentences = [] - for source_sentence, target_sentence in zip(source, target): - numerized_source_sentence = seq2seq_util.get_numberized_sentence( - source_sentence, - source_vocab, - ) - numerized_target_sentence = seq2seq_util.get_numberized_sentence( - target_sentence, - target_vocab, - ) - if ( - len(numerized_source_sentence) > 0 and - len(numerized_target_sentence) > 0 and - ( - max_length is None or ( - len(numerized_source_sentence) <= max_length and - len(numerized_target_sentence) <= max_length - ) - ) - ): - parallel_sentences.append(( - numerized_source_sentence, - numerized_target_sentence, - )) - 
parallel_sentences.sort(key=lambda s_t: (len(s_t[0]), len(s_t[1]))) - - batches, batch = [], [] - for sentence_pair in parallel_sentences: - batch.append(sentence_pair) - if len(batch) >= batch_size: - batches.append(batch) - batch = [] - if len(batch) > 0: - while len(batch) < batch_size: - batch.append(batch[-1]) - assert len(batch) == batch_size - batches.append(batch) - random.shuffle(batches) - return batches - - -def run_seq2seq_model(args, model_params=None): - source_vocab = seq2seq_util.gen_vocab( - args.source_corpus, - args.unk_threshold, - ) - target_vocab = seq2seq_util.gen_vocab( - args.target_corpus, - args.unk_threshold, - ) - logger.info('Source vocab size {}'.format(len(source_vocab))) - logger.info('Target vocab size {}'.format(len(target_vocab))) - - batches = gen_batches(args.source_corpus, args.target_corpus, source_vocab, - target_vocab, model_params['batch_size'], - args.max_length) - logger.info('Number of training batches {}'.format(len(batches))) - - batches_eval = gen_batches(args.source_corpus_eval, args.target_corpus_eval, - source_vocab, target_vocab, - model_params['batch_size'], args.max_length) - logger.info('Number of eval batches {}'.format(len(batches_eval))) - - with Seq2SeqModelCaffe2( - model_params=model_params, - source_vocab_size=len(source_vocab), - target_vocab_size=len(target_vocab), - num_gpus=args.num_gpus, - num_cpus=20, - ) as model_obj: - model_obj.initialize_from_scratch() - for i in range(args.epochs): - logger.info('Epoch {}'.format(i)) - total_loss = 0 - for batch in batches: - total_loss += model_obj.step( - batch=batch, - forward_only=False, - ) - logger.info('\ttraining loss {}'.format(total_loss)) - total_loss = 0 - for batch in batches_eval: - total_loss += model_obj.step( - batch=batch, - forward_only=True, - ) - logger.info('\teval loss {}'.format(total_loss)) - if args.checkpoint is not None: - model_obj.save(args.checkpoint, i) - - -def main(): - random.seed(31415) - parser = argparse.ArgumentParser( - description='Caffe2: Seq2Seq Training' - ) - parser.add_argument('--source-corpus', type=str, default=None, - help='Path to source corpus in a text file format. Each ' - 'line in the file should contain a single sentence', - required=True) - parser.add_argument('--target-corpus', type=str, default=None, - help='Path to target corpus in a text file format', - required=True) - parser.add_argument('--max-length', type=int, default=None, - help='Maximal lengths of train and eval sentences') - parser.add_argument('--unk-threshold', type=int, default=50, - help='Threshold frequency under which token becomes ' - 'labeled unknown token') - - parser.add_argument('--batch-size', type=int, default=32, - help='Training batch size') - parser.add_argument('--epochs', type=int, default=10, - help='Number of iterations over training data') - parser.add_argument('--learning-rate', type=float, default=0.5, - help='Learning rate') - parser.add_argument('--max-gradient-norm', type=float, default=1.0, - help='Max global norm of gradients at the end of each ' - 'backward pass. 
We do clipping to match the number.') - parser.add_argument('--num-gpus', type=int, default=0, - help='Number of GPUs for data parallel model') - - parser.add_argument('--use-bidirectional-encoder', action='store_true', - help='Set flag to use bidirectional recurrent network ' - 'for first layer of encoder') - parser.add_argument('--use-attention', action='store_true', - help='Set flag to use seq2seq with attention model') - parser.add_argument('--source-corpus-eval', type=str, default=None, - help='Path to source corpus for evaluation in a text ' - 'file format', required=True) - parser.add_argument('--target-corpus-eval', type=str, default=None, - help='Path to target corpus for evaluation in a text ' - 'file format', required=True) - parser.add_argument('--encoder-cell-num-units', type=int, default=512, - help='Number of cell units per encoder layer') - parser.add_argument('--encoder-num-layers', type=int, default=2, - help='Number encoder layers') - parser.add_argument('--decoder-cell-num-units', type=int, default=512, - help='Number of cell units in the decoder layer') - parser.add_argument('--decoder-num-layers', type=int, default=2, - help='Number decoder layers') - parser.add_argument('--encoder-embedding-size', type=int, default=256, - help='Size of embedding in the encoder layer') - parser.add_argument('--decoder-embedding-size', type=int, default=512, - help='Size of embedding in the decoder layer') - parser.add_argument('--decoder-softmax-size', type=int, default=None, - help='Size of softmax layer in the decoder') - - parser.add_argument('--checkpoint', type=str, default=None, - help='Path to checkpoint') - - args = parser.parse_args() - - encoder_layer_configs = [ - dict( - num_units=args.encoder_cell_num_units, - ), - ] * args.encoder_num_layers - - if args.use_bidirectional_encoder: - assert args.encoder_cell_num_units % 2 == 0 - encoder_layer_configs[0]['num_units'] /= 2 - - decoder_layer_configs = [ - dict( - num_units=args.decoder_cell_num_units, - ), - ] * args.decoder_num_layers - - run_seq2seq_model(args, model_params=dict( - attention=('regular' if args.use_attention else 'none'), - decoder_layer_configs=decoder_layer_configs, - encoder_type=dict( - encoder_layer_configs=encoder_layer_configs, - use_bidirectional_encoder=args.use_bidirectional_encoder, - ), - batch_size=args.batch_size, - optimizer_params=dict( - learning_rate=args.learning_rate, - ), - encoder_embedding_size=args.encoder_embedding_size, - decoder_embedding_size=args.decoder_embedding_size, - decoder_softmax_size=args.decoder_softmax_size, - max_gradient_norm=args.max_gradient_norm, - )) - - -if __name__ == '__main__': - main() diff --git a/caffe2/python/models/seq2seq/translate.py b/caffe2/python/models/seq2seq/translate.py deleted file mode 100644 index fd126ae3c6c0..000000000000 --- a/caffe2/python/models/seq2seq/translate.py +++ /dev/null @@ -1,654 +0,0 @@ -## @package translate -# Module caffe2.python.models.seq2seq.translate - - - - - -from abc import ABCMeta, abstractmethod -import argparse -import logging -import numpy as np -import sys - -from caffe2.python import core, rnn_cell, workspace -from caffe2.python.models.seq2seq.beam_search import BeamSearchForwardOnly -from caffe2.python.models.seq2seq.seq2seq_model_helper import Seq2SeqModelHelper -import caffe2.python.models.seq2seq.seq2seq_util as seq2seq_util - - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -logger.addHandler(logging.StreamHandler(sys.stderr)) - - -def _weighted_sum(model, values, weight, 
output_name): - values_weights = zip(values, [weight] * len(values)) - values_weights_flattened = [x for v_w in values_weights for x in v_w] - return model.net.WeightedSum( - values_weights_flattened, - output_name, - ) - - -class Seq2SeqModelCaffe2EnsembleDecoderBase(metaclass=ABCMeta): - - @abstractmethod - def get_model_file(self, model): - pass - - @abstractmethod - def get_db_type(self): - pass - - def build_word_rewards(self, vocab_size, word_reward, unk_reward): - word_rewards = np.full([vocab_size], word_reward, dtype=np.float32) - word_rewards[seq2seq_util.PAD_ID] = 0 - word_rewards[seq2seq_util.GO_ID] = 0 - word_rewards[seq2seq_util.EOS_ID] = 0 - word_rewards[seq2seq_util.UNK_ID] = word_reward + unk_reward - return word_rewards - - def load_models(self): - db_reader = 'reader' - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - params_for_current_model = [ - param - for param in self.model.GetAllParams() - if str(param).startswith(scope_name) - ] - assert workspace.RunOperatorOnce(core.CreateOperator( - 'CreateDB', - [], [db_reader], - db=self.get_model_file(model), - db_type=self.get_db_type()) - ), 'Failed to create db {}'.format(self.get_model_file(model)) - assert workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [db_reader], - params_for_current_model, - load_all=1, - add_prefix=scope_name + '/', - strip_prefix='gpu_0/', - )) - logger.info('Model {} is loaded from a checkpoint {}'.format( - scope_name, self.get_model_file(model))) - - -class Seq2SeqModelCaffe2EnsembleDecoder(Seq2SeqModelCaffe2EnsembleDecoderBase): - - def get_model_file(self, model): - return model['model_file'] - - def get_db_type(self): - return 'minidb' - - def scope(self, scope_name, blob_name): - return ( - scope_name + '/' + blob_name - if scope_name is not None - else blob_name - ) - - def _build_decoder( - self, - model, - step_model, - model_params, - scope, - previous_tokens, - timestep, - fake_seq_lengths, - ): - attention_type = model_params['attention'] - assert attention_type in ['none', 'regular'] - use_attention = (attention_type != 'none') - - with core.NameScope(scope): - encoder_embeddings = seq2seq_util.build_embeddings( - model=model, - vocab_size=self.source_vocab_size, - embedding_size=model_params['encoder_embedding_size'], - name='encoder_embeddings', - freeze_embeddings=False, - ) - - ( - encoder_outputs, - weighted_encoder_outputs, - final_encoder_hidden_states, - final_encoder_cell_states, - encoder_units_per_layer, - ) = seq2seq_util.build_embedding_encoder( - model=model, - encoder_params=model_params['encoder_type'], - num_decoder_layers=len(model_params['decoder_layer_configs']), - inputs=self.encoder_inputs, - input_lengths=self.encoder_lengths, - vocab_size=self.source_vocab_size, - embeddings=encoder_embeddings, - embedding_size=model_params['encoder_embedding_size'], - use_attention=use_attention, - num_gpus=0, - forward_only=True, - scope=scope, - ) - with core.NameScope(scope): - if use_attention: - # [max_source_length, beam_size, encoder_output_dim] - encoder_outputs = model.net.Tile( - encoder_outputs, - 'encoder_outputs_tiled', - tiles=self.beam_size, - axis=1, - ) - - if weighted_encoder_outputs is not None: - weighted_encoder_outputs = model.net.Tile( - weighted_encoder_outputs, - 'weighted_encoder_outputs_tiled', - tiles=self.beam_size, - axis=1, - ) - - decoder_embeddings = seq2seq_util.build_embeddings( - model=model, - vocab_size=self.target_vocab_size, - embedding_size=model_params['decoder_embedding_size'], - 
name='decoder_embeddings', - freeze_embeddings=False, - ) - embedded_tokens_t_prev = step_model.net.Gather( - [decoder_embeddings, previous_tokens], - 'embedded_tokens_t_prev', - ) - - decoder_cells = [] - decoder_units_per_layer = [] - for i, layer_config in enumerate(model_params['decoder_layer_configs']): - num_units = layer_config['num_units'] - decoder_units_per_layer.append(num_units) - if i == 0: - input_size = model_params['decoder_embedding_size'] - else: - input_size = ( - model_params['decoder_layer_configs'][i - 1]['num_units'] - ) - - cell = rnn_cell.LSTMCell( - forward_only=True, - input_size=input_size, - hidden_size=num_units, - forget_bias=0.0, - memory_optimization=False, - ) - decoder_cells.append(cell) - - with core.NameScope(scope): - if final_encoder_hidden_states is not None: - for i in range(len(final_encoder_hidden_states)): - if final_encoder_hidden_states[i] is not None: - final_encoder_hidden_states[i] = model.net.Tile( - final_encoder_hidden_states[i], - 'final_encoder_hidden_tiled_{}'.format(i), - tiles=self.beam_size, - axis=1, - ) - if final_encoder_cell_states is not None: - for i in range(len(final_encoder_cell_states)): - if final_encoder_cell_states[i] is not None: - final_encoder_cell_states[i] = model.net.Tile( - final_encoder_cell_states[i], - 'final_encoder_cell_tiled_{}'.format(i), - tiles=self.beam_size, - axis=1, - ) - initial_states = \ - seq2seq_util.build_initial_rnn_decoder_states( - model=model, - encoder_units_per_layer=encoder_units_per_layer, - decoder_units_per_layer=decoder_units_per_layer, - final_encoder_hidden_states=final_encoder_hidden_states, - final_encoder_cell_states=final_encoder_cell_states, - use_attention=use_attention, - ) - - attention_decoder = seq2seq_util.LSTMWithAttentionDecoder( - encoder_outputs=encoder_outputs, - encoder_output_dim=encoder_units_per_layer[-1], - encoder_lengths=None, - vocab_size=self.target_vocab_size, - attention_type=attention_type, - embedding_size=model_params['decoder_embedding_size'], - decoder_num_units=decoder_units_per_layer[-1], - decoder_cells=decoder_cells, - weighted_encoder_outputs=weighted_encoder_outputs, - name=scope, - ) - states_prev = step_model.net.AddExternalInputs(*[ - '{}/{}_prev'.format(scope, s) - for s in attention_decoder.get_state_names() - ]) - decoder_outputs, states = attention_decoder.apply( - model=step_model, - input_t=embedded_tokens_t_prev, - seq_lengths=fake_seq_lengths, - states=states_prev, - timestep=timestep, - ) - - state_configs = [ - BeamSearchForwardOnly.StateConfig( - initial_value=initial_state, - state_prev_link=BeamSearchForwardOnly.LinkConfig( - blob=state_prev, - offset=0, - window=1, - ), - state_link=BeamSearchForwardOnly.LinkConfig( - blob=state, - offset=1, - window=1, - ), - ) - for initial_state, state_prev, state in zip( - initial_states, - states_prev, - states, - ) - ] - - with core.NameScope(scope): - decoder_outputs_flattened, _ = step_model.net.Reshape( - [decoder_outputs], - [ - 'decoder_outputs_flattened', - 'decoder_outputs_and_contexts_combination_old_shape', - ], - shape=[-1, attention_decoder.get_output_dim()], - ) - output_logits = seq2seq_util.output_projection( - model=step_model, - decoder_outputs=decoder_outputs_flattened, - decoder_output_size=attention_decoder.get_output_dim(), - target_vocab_size=self.target_vocab_size, - decoder_softmax_size=model_params['decoder_softmax_size'], - ) - # [1, beam_size, target_vocab_size] - output_probs = step_model.net.Softmax( - output_logits, - 'output_probs', - ) - output_log_probs = 
step_model.net.Log( - output_probs, - 'output_log_probs', - ) - if use_attention: - attention_weights = attention_decoder.get_attention_weights() - else: - attention_weights = step_model.net.ConstantFill( - [self.encoder_inputs], - 'zero_attention_weights_tmp_1', - value=0.0, - ) - attention_weights = step_model.net.Transpose( - attention_weights, - 'zero_attention_weights_tmp_2', - ) - attention_weights = step_model.net.Tile( - attention_weights, - 'zero_attention_weights_tmp', - tiles=self.beam_size, - axis=0, - ) - - return ( - state_configs, - output_log_probs, - attention_weights, - ) - - def __init__( - self, - translate_params, - ): - self.models = translate_params['ensemble_models'] - decoding_params = translate_params['decoding_params'] - self.beam_size = decoding_params['beam_size'] - - assert len(self.models) > 0 - source_vocab = self.models[0]['source_vocab'] - target_vocab = self.models[0]['target_vocab'] - for model in self.models: - assert model['source_vocab'] == source_vocab - assert model['target_vocab'] == target_vocab - - self.source_vocab_size = len(source_vocab) - self.target_vocab_size = len(target_vocab) - - self.decoder_scope_names = [ - 'model{}'.format(i) for i in range(len(self.models)) - ] - - self.model = Seq2SeqModelHelper(init_params=True) - - self.encoder_inputs = self.model.net.AddExternalInput('encoder_inputs') - self.encoder_lengths = self.model.net.AddExternalInput( - 'encoder_lengths' - ) - self.max_output_seq_len = self.model.net.AddExternalInput( - 'max_output_seq_len' - ) - - fake_seq_lengths = self.model.param_init_net.ConstantFill( - [], - 'fake_seq_lengths', - shape=[self.beam_size], - value=100000, - dtype=core.DataType.INT32, - ) - - beam_decoder = BeamSearchForwardOnly( - beam_size=self.beam_size, - model=self.model, - go_token_id=seq2seq_util.GO_ID, - eos_token_id=seq2seq_util.EOS_ID, - ) - step_model = beam_decoder.get_step_model() - - state_configs = [] - output_log_probs = [] - attention_weights = [] - for model, scope_name in zip( - self.models, - self.decoder_scope_names, - ): - ( - state_configs_per_decoder, - output_log_probs_per_decoder, - attention_weights_per_decoder, - ) = self._build_decoder( - model=self.model, - step_model=step_model, - model_params=model['model_params'], - scope=scope_name, - previous_tokens=beam_decoder.get_previous_tokens(), - timestep=beam_decoder.get_timestep(), - fake_seq_lengths=fake_seq_lengths, - ) - state_configs.extend(state_configs_per_decoder) - output_log_probs.append(output_log_probs_per_decoder) - if attention_weights_per_decoder is not None: - attention_weights.append(attention_weights_per_decoder) - - assert len(attention_weights) > 0 - num_decoders_with_attention_blob = ( - self.model.param_init_net.ConstantFill( - [], - 'num_decoders_with_attention_blob', - value=1 / float(len(attention_weights)), - shape=[1], - ) - ) - # [beam_size, encoder_length, 1] - attention_weights_average = _weighted_sum( - model=step_model, - values=attention_weights, - weight=num_decoders_with_attention_blob, - output_name='attention_weights_average', - ) - - num_decoders_blob = self.model.param_init_net.ConstantFill( - [], - 'num_decoders_blob', - value=1 / float(len(output_log_probs)), - shape=[1], - ) - # [beam_size, target_vocab_size] - output_log_probs_average = _weighted_sum( - model=step_model, - values=output_log_probs, - weight=num_decoders_blob, - output_name='output_log_probs_average', - ) - word_rewards = self.model.param_init_net.ConstantFill( - [], - 'word_rewards', - shape=[self.target_vocab_size], - 
value=0.0, - dtype=core.DataType.FLOAT, - ) - ( - self.output_token_beam_list, - self.output_prev_index_beam_list, - self.output_score_beam_list, - self.output_attention_weights_beam_list, - ) = beam_decoder.apply( - inputs=self.encoder_inputs, - length=self.max_output_seq_len, - log_probs=output_log_probs_average, - attentions=attention_weights_average, - state_configs=state_configs, - data_dependencies=[], - word_rewards=word_rewards, - ) - - workspace.RunNetOnce(self.model.param_init_net) - workspace.FeedBlob( - 'word_rewards', - self.build_word_rewards( - vocab_size=self.target_vocab_size, - word_reward=translate_params['decoding_params']['word_reward'], - unk_reward=translate_params['decoding_params']['unk_reward'], - ) - ) - - workspace.CreateNet( - self.model.net, - input_blobs=[ - str(self.encoder_inputs), - str(self.encoder_lengths), - str(self.max_output_seq_len), - ], - ) - - logger.info('Params created: ') - for param in self.model.params: - logger.info(param) - - def decode(self, numberized_input, max_output_seq_len): - workspace.FeedBlob( - self.encoder_inputs, - np.array([ - [token_id] for token_id in reversed(numberized_input) - ]).astype(dtype=np.int32), - ) - workspace.FeedBlob( - self.encoder_lengths, - np.array([len(numberized_input)]).astype(dtype=np.int32), - ) - workspace.FeedBlob( - self.max_output_seq_len, - np.array([max_output_seq_len]).astype(dtype=np.int64), - ) - - workspace.RunNet(self.model.net) - - num_steps = max_output_seq_len - score_beam_list = workspace.FetchBlob(self.output_score_beam_list) - token_beam_list = ( - workspace.FetchBlob(self.output_token_beam_list) - ) - prev_index_beam_list = ( - workspace.FetchBlob(self.output_prev_index_beam_list) - ) - - attention_weights_beam_list = ( - workspace.FetchBlob(self.output_attention_weights_beam_list) - ) - best_indices = (num_steps, 0) - for i in range(num_steps + 1): - for hyp_index in range(self.beam_size): - if ( - ( - token_beam_list[i][hyp_index][0] == - seq2seq_util.EOS_ID or - i == num_steps - ) and - ( - score_beam_list[i][hyp_index][0] > - score_beam_list[best_indices[0]][best_indices[1]][0] - ) - ): - best_indices = (i, hyp_index) - - i, hyp_index = best_indices - output = [] - attention_weights_per_token = [] - best_score = -score_beam_list[i][hyp_index][0] - while i > 0: - output.append(token_beam_list[i][hyp_index][0]) - attention_weights_per_token.append( - attention_weights_beam_list[i][hyp_index] - ) - hyp_index = prev_index_beam_list[i][hyp_index][0] - i -= 1 - - attention_weights_per_token = reversed(attention_weights_per_token) - # encoder_inputs are reversed, see get_batch func - attention_weights_per_token = [ - list(reversed(attention_weights))[:len(numberized_input)] - for attention_weights in attention_weights_per_token - ] - output = list(reversed(output)) - return output, attention_weights_per_token, best_score - - -def run_seq2seq_beam_decoder(args, model_params, decoding_params): - source_vocab = seq2seq_util.gen_vocab( - args.source_corpus, - args.unk_threshold, - ) - logger.info('Source vocab size {}'.format(len(source_vocab))) - target_vocab = seq2seq_util.gen_vocab( - args.target_corpus, - args.unk_threshold, - ) - inversed_target_vocab = {v: k for (k, v) in target_vocab.items()} - logger.info('Target vocab size {}'.format(len(target_vocab))) - - decoder = Seq2SeqModelCaffe2EnsembleDecoder( - translate_params=dict( - ensemble_models=[dict( - source_vocab=source_vocab, - target_vocab=target_vocab, - model_params=model_params, - model_file=args.checkpoint, - )], - 
decoding_params=decoding_params, - ), - ) - decoder.load_models() - - for line in sys.stdin: - numerized_source_sentence = seq2seq_util.get_numberized_sentence( - line, - source_vocab, - ) - translation, alignment, _ = decoder.decode( - numerized_source_sentence, - 2 * len(numerized_source_sentence) + 5, - ) - print(' '.join([inversed_target_vocab[tid] for tid in translation])) - - -def main(): - parser = argparse.ArgumentParser( - description='Caffe2: Seq2Seq Translation', - ) - parser.add_argument('--source-corpus', type=str, default=None, - help='Path to source corpus in a text file format. Each ' - 'line in the file should contain a single sentence', - required=True) - parser.add_argument('--target-corpus', type=str, default=None, - help='Path to target corpus in a text file format', - required=True) - parser.add_argument('--unk-threshold', type=int, default=50, - help='Threshold frequency under which token becomes ' - 'labeled unknown token') - - parser.add_argument('--use-bidirectional-encoder', action='store_true', - help='Set flag to use bidirectional recurrent network ' - 'in encoder') - parser.add_argument('--use-attention', action='store_true', - help='Set flag to use seq2seq with attention model') - parser.add_argument('--encoder-cell-num-units', type=int, default=512, - help='Number of cell units per encoder layer') - parser.add_argument('--encoder-num-layers', type=int, default=2, - help='Number encoder layers') - parser.add_argument('--decoder-cell-num-units', type=int, default=512, - help='Number of cell units in the decoder layer') - parser.add_argument('--decoder-num-layers', type=int, default=2, - help='Number decoder layers') - parser.add_argument('--encoder-embedding-size', type=int, default=256, - help='Size of embedding in the encoder layer') - parser.add_argument('--decoder-embedding-size', type=int, default=512, - help='Size of embedding in the decoder layer') - parser.add_argument('--decoder-softmax-size', type=int, default=None, - help='Size of softmax layer in the decoder') - - parser.add_argument('--beam-size', type=int, default=6, - help='Size of beam for the decoder') - parser.add_argument('--word-reward', type=float, default=0.0, - help='Reward per each word generated.') - parser.add_argument('--unk-reward', type=float, default=0.0, - help='Reward per each UNK token generated. 
'
-                        'Typically should be negative.')
-
-    parser.add_argument('--checkpoint', type=str, default=None,
-                        help='Path to checkpoint', required=True)
-
-    args = parser.parse_args()
-
-    encoder_layer_configs = [
-        dict(
-            num_units=args.encoder_cell_num_units,
-        ),
-    ] * args.encoder_num_layers
-
-    if args.use_bidirectional_encoder:
-        assert args.encoder_cell_num_units % 2 == 0
-        # floor division keeps num_units an int for each direction
-        encoder_layer_configs[0]['num_units'] //= 2
-
-    decoder_layer_configs = [
-        dict(
-            num_units=args.decoder_cell_num_units,
-        ),
-    ] * args.decoder_num_layers
-
-    run_seq2seq_beam_decoder(
-        args,
-        model_params=dict(
-            attention=('regular' if args.use_attention else 'none'),
-            decoder_layer_configs=decoder_layer_configs,
-            encoder_type=dict(
-                encoder_layer_configs=encoder_layer_configs,
-                use_bidirectional_encoder=args.use_bidirectional_encoder,
-            ),
-            encoder_embedding_size=args.encoder_embedding_size,
-            decoder_embedding_size=args.decoder_embedding_size,
-            decoder_softmax_size=args.decoder_softmax_size,
-        ),
-        decoding_params=dict(
-            beam_size=args.beam_size,
-            word_reward=args.word_reward,
-            unk_reward=args.unk_reward,
-        ),
-    )
-
-
-if __name__ == '__main__':
-    main()
diff --git a/caffe2/python/models/shufflenet.py b/caffe2/python/models/shufflenet.py
deleted file mode 100644
index 33a7f7a4b7c5..000000000000
--- a/caffe2/python/models/shufflenet.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# Module caffe2.python.models.shufflenet
-
-
-
-
-
-
-from caffe2.python import brew
-
-"""
-Utility for creating ShuffleNet
-"ShuffleNet V2: Practical Guidelines for Efficient CNN Architecture Design" by Ma et al., 2018
-"""
-
-OUTPUT_CHANNELS = {
-    '0.5x': [24, 48, 96, 192, 1024],
-    '1.0x': [24, 116, 232, 464, 1024],
-    '1.5x': [24, 176, 352, 704, 1024],
-    '2.0x': [24, 244, 488, 976, 2048],
-}
-
-
-class ShuffleNetV2Builder():
-    def __init__(
-        self,
-        model,
-        data,
-        num_input_channels,
-        num_labels,
-        num_groups=2,
-        width='1.0x',
-        is_test=False,
-        detection=False,
-        bn_epsilon=1e-5,
-    ):
-        self.model = model
-        self.prev_blob = data
-        self.num_input_channels = num_input_channels
-        self.num_labels = num_labels
-        self.num_groups = num_groups
-        self.output_channels = OUTPUT_CHANNELS[width]
-        self.stage_repeats = [3, 7, 3]
-        self.is_test = is_test
-        self.detection = detection
-        self.bn_epsilon = bn_epsilon
-
-    def create(self):
-        in_channels = self.output_channels[0]
-
-        self.prev_blob = brew.conv(self.model, self.prev_blob, 'stage1_conv',
-                                   self.num_input_channels, in_channels,
-                                   weight_init=("MSRAFill", {}),
-                                   kernel=3, stride=2)
-        self.prev_blob = brew.max_pool(self.model, self.prev_blob,
-                                       'stage1_pool', kernel=3, stride=2)
-
-        # adds stage#{2,3,4}; see table 5 of the ShuffleNetV2 paper.
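-        # With the default width '1.0x' this unrolls to (from OUTPUT_CHANNELS
-        # and stage_repeats above):
-        #   stage2: 1 downsampling unit + 3 basic units, 116 channels
-        #   stage3: 1 downsampling unit + 7 basic units, 232 channels
-        #   stage4: 1 downsampling unit + 3 basic units, 464 channels
-        # Each stage opens with a stride-2 downsampling unit that switches to
-        # the stage's channel count, followed by stride-1 units that keep it.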
- for idx, (out_channels, n_repeats) in enumerate(zip( - self.output_channels[1:4], self.stage_repeats - )): - prefix = 'stage{}_stride{}'.format(idx + 2, 2) - self.add_spatial_ds_unit(prefix, in_channels, out_channels) - in_channels = out_channels - for i in range(n_repeats): - prefix = 'stage{}_stride{}_repeat{}'.format( - idx + 2, 1, i + 1 - ) - self.add_basic_unit(prefix, in_channels) - - self.last_conv = brew.conv(self.model, self.prev_blob, 'conv5', - in_channels, self.output_channels[4], - kernel=1) - self.avg_pool = self.model.AveragePool(self.last_conv, 'avg_pool', - kernel=7) - self.last_out = brew.fc(self.model, - self.avg_pool, - 'last_out_L{}'.format(self.num_labels), - self.output_channels[4], - self.num_labels) - - # spatial down sampling unit with stride=2 - def add_spatial_ds_unit(self, prefix, in_channels, out_channels, stride=2): - right = left = self.prev_blob - out_channels = out_channels // 2 - - # Enlarge the receptive field for detection task - if self.detection: - left = self.add_detection_unit(left, prefix + '_left_detection', - in_channels, in_channels) - - left = self.add_dwconv3x3_bn(left, prefix + 'left_dwconv', - in_channels, stride) - left = self.add_conv1x1_bn(left, prefix + '_left_conv1', in_channels, - out_channels) - - if self.detection: - right = self.add_detection_unit(right, prefix + '_right_detection', - in_channels, in_channels) - - right = self.add_conv1x1_bn(right, prefix + '_right_conv1', - in_channels, out_channels) - right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv', - out_channels, stride) - right = self.add_conv1x1_bn(right, prefix + '_right_conv2', - out_channels, out_channels) - - self.prev_blob = brew.concat(self.model, [right, left], - prefix + '_concat') - self.prev_blob = self.model.net.ChannelShuffle( - self.prev_blob, prefix + '_ch_shuffle', - group=self.num_groups, kernel=1 - ) - - # basic unit with stride=1 - def add_basic_unit(self, prefix, in_channels, stride=1): - in_channels = in_channels // 2 - left = prefix + '_left' - right = prefix + '_right' - self.model.net.Split(self.prev_blob, [left, right]) - - if self.detection: - right = self.add_detection_unit(right, prefix + '_right_detection', - in_channels, in_channels) - - right = self.add_conv1x1_bn(right, prefix + '_right_conv1', - in_channels, in_channels) - right = self.add_dwconv3x3_bn(right, prefix + '_right_dwconv', - in_channels, stride) - right = self.add_conv1x1_bn(right, prefix + '_right_conv2', - in_channels, in_channels) - - self.prev_blob = brew.concat(self.model, [right, left], - prefix + '_concat') - - self.prev_blob = self.model.net.ChannelShuffle( - self.prev_blob, prefix + '_ch_shuffle', - group=self.num_groups, kernel=1 - ) - - # helper functions to create net's units - def add_detection_unit(self, prev_blob, prefix, in_channels, out_channels, - kernel=3, pad=1): - out_blob = brew.conv(self.model, prev_blob, prefix + '_conv', - in_channels, out_channels, kernel=kernel, - weight_init=("MSRAFill", {}), - group=in_channels, pad=pad) - out_blob = brew.spatial_bn(self.model, out_blob, prefix + '_bn', - out_channels, epsilon=self.bn_epsilon, - is_test=self.is_test) - return out_blob - - def add_conv1x1_bn(self, prev_blob, blob, in_channels, out_channels): - prev_blob = brew.conv(self.model, prev_blob, blob, in_channels, - out_channels, kernel=1, - weight_init=("MSRAFill", {})) - prev_blob = brew.spatial_bn(self.model, prev_blob, prev_blob + '_bn', - out_channels, - epsilon=self.bn_epsilon, - is_test=self.is_test) - prev_blob = brew.relu(self.model, 
prev_blob, prev_blob) - return prev_blob - - def add_dwconv3x3_bn(self, prev_blob, blob, channels, stride): - prev_blob = brew.conv(self.model, prev_blob, blob, channels, - channels, kernel=3, - weight_init=("MSRAFill", {}), - stride=stride, group=channels, pad=1) - prev_blob = brew.spatial_bn(self.model, prev_blob, - prev_blob + '_bn', - channels, - epsilon=self.bn_epsilon, - is_test=self.is_test) - return prev_blob - - -def create_shufflenet( - model, - data, - num_input_channels, - num_labels, - label=None, - is_test=False, - no_loss=False, -): - builder = ShuffleNetV2Builder(model, data, num_input_channels, - num_labels, - is_test=is_test) - builder.create() - - if no_loss: - return builder.last_out - - if (label is not None): - (softmax, loss) = model.SoftmaxWithLoss( - [builder.last_out, label], - ["softmax", "loss"], - ) - return (softmax, loss) diff --git a/caffe2/python/models/shufflenet_test.py b/caffe2/python/models/shufflenet_test.py deleted file mode 100644 index b18c01d8e328..000000000000 --- a/caffe2/python/models/shufflenet_test.py +++ /dev/null @@ -1,61 +0,0 @@ - - - - - -import numpy as np - -import caffe2.python.models.shufflenet as shufflenet -import hypothesis.strategies as st -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.models.imagenet_trainer_test_utils as utils - - -class ShufflenetMemongerTest(hu.HypothesisTestCase): - @given(with_shapes=st.booleans(), **hu.gcs_cpu_only) - @settings(max_examples=2, deadline=None) - def test_shufflenet_shared_grads(self, with_shapes, gc, dc): - results = utils.test_shared_grads( - with_shapes, - shufflenet.create_shufflenet, - 'gpu_0/stage1_conv_w', - 'gpu_0/last_out_L1000' - ) - self.assertTrue(results[0][0] < results[0][1]) - np.testing.assert_almost_equal(results[1][0], results[1][1]) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - - def test_shufflenet_forward_only(self): - results = utils.test_forward_only( - shufflenet.create_shufflenet, - 'gpu_0/last_out_L1000' - ) - self.assertTrue(results[0][0] < results[0][1]) - self.assertTrue(results[1] < 10 and results[1] > 0) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - - def test_shufflenet_forward_only_fast_simplenet(self): - ''' - Test C++ memonger that is only for simple nets - ''' - results = utils.test_forward_only_fast_simplenet( - shufflenet.create_shufflenet, - 'gpu_0/last_out_L1000' - ) - - self.assertTrue(results[0][0] < results[0][1]) - self.assertTrue(results[1] < 4 and results[1] > 0) - np.testing.assert_almost_equal(results[2][0], results[2][1]) - -if __name__ == "__main__": - import unittest - import random - random.seed(2006) - # pyre-fixme[10]: Name `workspace` is used but not defined in the current scope - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_print_blob_sizes_at_exit=0', - '--caffe2_gpu_memory_tracking=1']) - unittest.main() diff --git a/caffe2/python/modifier_context.py b/caffe2/python/modifier_context.py deleted file mode 100644 index 574e7b644550..000000000000 --- a/caffe2/python/modifier_context.py +++ /dev/null @@ -1,67 +0,0 @@ -# @package modifier_context -# Module caffe2.python.modifier_context - - - - - - -DEFAULT_MODIFIER = 'DEFAULT' - - -class ModifierContext: - """ - provide context to allow param_info to have different modifiers - """ - - def __init__(self): - self._modifiers = {} - self._modifiers_list = [] - - def _rebuild_modifiers(self): - self._modifiers = {} - for m in self._modifiers_list: - 
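# replay the stack in push order, so later pushes override earlier ones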
self._modifiers.update(m) - - def _has_modifier(self, name): - return name in self._modifiers - - def _get_modifier(self, name): - return self._modifiers.get(name) - - def push_modifiers(self, modifiers): - # modifier override is allowed - self._modifiers_list.append(modifiers) - self._modifiers.update(modifiers) - - def pop_modifiers(self): - assert len(self._modifiers_list) > 0 - self._modifiers_list.pop() - self._rebuild_modifiers() - - -class UseModifierBase: - ''' - context class to allow setting the current context. - Example usage with layer: - modifiers = {'modifier1': modifier1, 'modifier2': modifier2} - with Modifiers(modifiers): - modifier = ModifierContext.current().get_modifier('modifier1') - layer(modifier=modifier) - ''' - - def __init__(self, modifier_or_dict): - if isinstance(modifier_or_dict, dict): - self._modifiers = modifier_or_dict - else: - self._modifiers = {DEFAULT_MODIFIER: modifier_or_dict} - - def _context_class(self): - raise NotImplementedError - - def __enter__(self): - self._context_class().current().push_modifiers(self._modifiers) - return self - - def __exit__(self, type, value, traceback): - self._context_class().current().pop_modifiers() diff --git a/caffe2/python/mpi_python.cc b/caffe2/python/mpi_python.cc deleted file mode 100644 index 96d5dbb72f17..000000000000 --- a/caffe2/python/mpi_python.cc +++ /dev/null @@ -1,47 +0,0 @@ -#include -#include - -#include "caffe2/mpi/mpi_common.h" - -namespace caffe2 { - -namespace py = pybind11; - -PYBIND11_MODULE(mpi_utils, m) { - m.doc() = "MPI helper functions"; - m.def( - "SetupPeers", - &MPISetupPeers, - py::arg("replicas"), - py::arg("role"), - py::arg("job_path")); - m.def("CommSize", [] { - auto comm = GlobalMPIComm(); - return MPICommSize(comm); - }); - m.def("CommRank", [] { - auto comm = GlobalMPIComm(); - return MPICommRank(comm); - }); - m.def("Finalize", [] { - // NOTE(pietern): Doesn't seem to work when calling it - // from Python. It ends up calling pthread_join on a - // thread that doesn't exit. For now, running mpirun - // with `-quiet` and skipping the finalize call. - MPI_Finalize(); - }); - m.def("Broadcast", [](py::bytes in) -> py::bytes { - std::string str = in; - auto comm = GlobalMPIComm(); - auto length = str.length(); - MPI_Bcast(&length, sizeof(length), MPI_CHAR, 0, comm); - auto ptr = std::make_unique(length); - if (MPICommRank(comm) == 0) { - memcpy(ptr.get(), str.data(), str.length()); - } - MPI_Bcast(ptr.get(), length, MPI_CHAR, 0, comm); - return std::string(ptr.get(), length); - }); -} - -} // namespace caffe2 diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py deleted file mode 100644 index 346473946e4c..000000000000 --- a/caffe2/python/muji.py +++ /dev/null @@ -1,264 +0,0 @@ -## @package muji -# Module caffe2.python.muji -"""muji.py does multi-gpu training for caffe2 with no need to change the c++ -side code. Everything is defined on the computation graph level. - -We support the following use cases: - - 2 gpus, where peer access is enabled between them. - - 4 gpus, where peer access are enabled between all of them. - - 4 gpus, where peer access are enabled in two groups, - between {1, 2} and {3, 4} - - 8 gpus, where peer access are enabled in two groups, - between {1, 2, 3, 4} and {5, 6, 7, 8}. -If above cases are not satisfied, a fallback function which does not rely on -peer access will be called. 
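Example (an illustrative sketch; it assumes a 2-GPU machine and that blobs
"param_gpu_0" and "param_gpu_1" have already been fed to the workspace):

    from caffe2.python import core, muji, workspace
    net = core.Net('allreduce_demo')
    muji.Allreduce(net, ['param_gpu_0', 'param_gpu_1'], gpu_indices=[0, 1])
    workspace.RunNetOnce(net)
    # 'param_gpu_0_reduced' and 'param_gpu_1_reduced' now hold the sum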
-""" - -import numpy as np - -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace - - -def OnGPU(gpu_id): - """A utility function that returns a device option protobuf of the - specified gpu id. - """ - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = workspace.GpuDeviceType - device_option.device_id = gpu_id - return device_option - - -def OnCPU(): - device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CPU - return device_option - - -def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None): - """The general Allreduce interface that reroutes the function calls. - CPUs and AMD GPUs are not supported because - GetGpuPeerAccessPattern is called to get gpu peer access pattern. - """ - if gpu_indices is None: - gpu_indices = list(range(len(blobs))) - if len(gpu_indices) != len(blobs): - raise RuntimeError( - "gpu_indices length and blobs length mismatch: %d vs %d" % - (len(gpu_indices), len(blobs)) - ) - pattern = workspace.GetGpuPeerAccessPattern() - if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]): - return Allreduce2(net, blobs, reduced_affix, gpu_indices) - elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]): - return Allreduce4(net, blobs, reduced_affix, gpu_indices) - elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4]): - return Allreduce4Group2(net, blobs, reduced_affix, gpu_indices) - elif len(blobs) == 8 and pattern.shape[0] >= 8 and np.all(pattern[:8, :8]): - return Allreduce8(net, blobs, reduced_affix, gpu_indices) - else: - return AllreduceFallback(net, blobs, reduced_affix, gpu_indices) - - -def Allreduce2(net, blobs, reduced_affix, gpu_indices): - """Allreduce for 2 gpus. - - Algorithm: 0r <- 0 + 1, 1r <- 0r, where r means "reduced" - """ - a, b = blobs - gpu_a, gpu_b = gpu_indices - a_reduced = net.Add([a, b], a + reduced_affix, device_option=OnGPU(gpu_a)) - b_reduced = a_reduced.Copy( - [], - b + reduced_affix, - device_option=OnGPU(gpu_b) - ) - return a_reduced, b_reduced - - -def Allreduce4(net, blobs, reduced_affix, gpu_indices): - """Allreduce for 4 gpus. - - Algorithm: 2 level reduction. - 0r <- 0 + 1, 2r <- 2 + 3 - 0r <- 0r + 2r - 2r <- 0r, - 1r <- 0r, 3r <- 2r - """ - a, b, c, d = blobs - gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices - # a_reduced <- a+b, c_reduced <- c + d - a_reduced = net.Add( - [a, b], - str(a) + reduced_affix, - device_option=OnGPU(gpu_a) - ) - c_reduced = net.Add( - [c, d], - str(c) + reduced_affix, - device_option=OnGPU(gpu_c) - ) - # a_reduced <- a_reduced + c_reduced - a_reduced = a_reduced.Add(c_reduced, a_reduced, device_option=OnGPU(gpu_a)) - # broadcast a_reduced to c_reduced - c_reduced = a_reduced.Copy([], c_reduced, device_option=OnGPU(gpu_c)) - # broadcast to b and d - b_reduced = a_reduced.Copy( - [], - str(b) + reduced_affix, - device_option=OnGPU(gpu_b) - ) - d_reduced = c_reduced.Copy( - [], - str(d) + reduced_affix, - device_option=OnGPU(gpu_d) - ) - return a_reduced, b_reduced, c_reduced, d_reduced - - -def Allreduce4Group2(net, blobs, reduced_affix, gpu_indices): - """Allreduce for 4 gpus where peer access are enabled in {0,1} and {2,3} - - Algorithm: 2 level reduction. 
- 0r <- 0 + 1, 2r <- 2 + 3 - 0r <- 0r + 2r - 2r <- 0r, - 1r <- 0r, 3r <- 2r - """ - a, b, c, d = blobs - gpu_a, gpu_b, gpu_c, gpu_d = gpu_indices - # a_reduced <- a+b, c_reduced <- c + d - a_reduced = net.Add( - [a, b], - str(a) + reduced_affix, - device_option=OnGPU(gpu_a) - ) - c_reduced = net.Add( - [c, d], - str(c) + reduced_affix, - device_option=OnGPU(gpu_c) - ) - # copy from c_reduce(gpu_c) to c_reduce_copy(gpu_a) - c_reduced_copy = c_reduced.Copy( - [], - str(c_reduced) + '_copy', - device_option=OnGPU(gpu_a) - ) - # a_reduced <- a_reduced + c_reduced_copy - a_reduced = a_reduced.Add(c_reduced_copy, a_reduced, device_option=OnGPU(gpu_a)) - # broadcast a_reduced to c_reduced - c_reduced = a_reduced.Copy([], c_reduced, device_option=OnGPU(gpu_c)) - # broadcast to b and d - b_reduced = a_reduced.Copy( - [], - str(b) + reduced_affix, - device_option=OnGPU(gpu_b) - ) - d_reduced = c_reduced.Copy( - [], - str(d) + reduced_affix, - device_option=OnGPU(gpu_d) - ) - return a_reduced, b_reduced, c_reduced, d_reduced - - -def Allreduce8(net, blobs, reduced_affix, gpu_indices): - """Allreduce for 8 gpus. - - Algorithm: 3 level reduction. - 0r <- 0 + 1, 2r <- 2 + 3, 4r <- 4 + 5, 6r <- 6 + 7 - 0r <- 0r + 2r, 4r <- 4r + 6r - 0r <- 0r + 4r - 4r <- 0r - 2r <- 0r, 6r <- 4r - 1r <- 0r, 3r <- 2r, 5r <- 4r, 7r <- 6r - """ - reduced = [None] * 8 - # Reduction level 1 - for i in [0, 2, 4, 6]: - reduced[i] = net.Add( - [blobs[i], blobs[i + 1]], - blobs[i] + reduced_affix, - device_option=OnGPU(gpu_indices[i]) - ) - # Reduction level 2 - for i in [0, 4]: - reduced[i] = net.Add( - [reduced[i], reduced[i + 2]], - str(blobs[i]) + reduced_affix, - device_option=OnGPU(gpu_indices[i]) - ) - # Reduction level 3: this involves a copy. - reduced_4_copy = reduced[4].Copy( - [], - str(reduced[4]) + '_copy', - device_option=OnGPU(gpu_indices[0]) - ) - reduced[0] = reduced[0].Add( - reduced_4_copy, - reduced[0], - device_option=OnGPU(gpu_indices[0]) - ) - # Broadcast level 1 - reduced[4] = reduced[0].Copy( - [], - reduced[4], - device_option=OnGPU(gpu_indices[4]) - ) - # Broadcast level 2 - for i in [2, 6]: - reduced[i] = reduced[i - 2].Copy( - [], - reduced[i], - device_option=OnGPU(gpu_indices[i]) - ) - # Broadcast level 3 - for i in [1, 3, 5, 7]: - reduced[i] = reduced[i - 1].Copy( - [], - blobs[i] + reduced_affix, - device_option=OnGPU(gpu_indices[i]) - ) - return reduced - - -def AllreduceFallback(net, blobs, reduced_affix, gpu_indices): - """A fallback option for Allreduce with no assumption on p2p. 
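Every input is staged through gpu_indices[0], so reducing N blobs costs
2 * (N - 1) cross-device copies (N - 1 to gather, N - 1 to broadcast); this is
slower than the peer-to-peer variants above, but correct on any topology.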
- - Algorithm: a flat operation on gpu 0 - 0r <- 0 - 0r <- 0r + i for i in gpu_indices[1:] - ir <- 0r for i in gpu_indices[1:] - """ - reduced = [None] * len(gpu_indices) - if reduced_affix != '': - # copy first - reduced[0] = net.Copy( - blobs[0], - blobs[0] + reduced_affix, - device_option=OnGPU(gpu_indices[0]) - ) - else: - reduced[0] = blobs[0] - # do temp copy and add - temp_name = reduced[0] + '_temp_copy' - for i in range(1, len(gpu_indices)): - temp = net.Copy( - blobs[i], - temp_name, - device_option=OnGPU(gpu_indices[0]) - ) - reduced[0] = net.Add( - [temp, reduced[0]], - reduced[0], - device_option=OnGPU(gpu_indices[0]) - ) - # Broadcast to everyone else - for i in range(1, len(gpu_indices)): - reduced[i] = net.Copy( - reduced[0], - blobs[i] + reduced_affix, - device_option=OnGPU(gpu_indices[i]) - ) - return reduced diff --git a/caffe2/python/muji_test.py b/caffe2/python/muji_test.py deleted file mode 100644 index 8adc2daad117..000000000000 --- a/caffe2/python/muji_test.py +++ /dev/null @@ -1,82 +0,0 @@ -import numpy as np -import unittest - -from caffe2.python import core, workspace, muji, test_util - - -@unittest.skipIf(not workspace.has_gpu_support, "no gpu") -class TestMuji(test_util.TestCase): - def RunningAllreduceWithGPUs(self, gpu_ids, allreduce_function): - """A base function to test different scenarios.""" - net = core.Net("mujitest") - for id in gpu_ids: - net.ConstantFill( - [], - "testblob_gpu_" + str(id), - shape=[1, 2, 3, 4], - value=float(id + 1), - device_option=muji.OnGPU(id) - ) - allreduce_function( - net, ["testblob_gpu_" + str(i) - for i in gpu_ids], "_reduced", gpu_ids - ) - workspace.RunNetOnce(net) - target_value = sum(gpu_ids) + len(gpu_ids) - all_blobs = workspace.Blobs() - all_blobs.sort() - for blob in all_blobs: - print('{} {}'.format(blob, workspace.FetchBlob(blob))) - - for idx in gpu_ids: - blob = workspace.FetchBlob("testblob_gpu_" + str(idx) + "_reduced") - np.testing.assert_array_equal( - blob, - target_value, - err_msg="gpu id %d of %s" % (idx, str(gpu_ids)) - ) - - def testAllreduceFallback(self): - self.RunningAllreduceWithGPUs( - list(range(workspace.NumGpuDevices())), muji.AllreduceFallback - ) - - def testAllreduceSingleGPU(self): - for i in range(workspace.NumGpuDevices()): - self.RunningAllreduceWithGPUs([i], muji.Allreduce) - - def testAllreduceWithTwoGPUs(self): - pattern = workspace.GetGpuPeerAccessPattern() - if pattern.shape[0] >= 2 and np.all(pattern[:2, :2]): - self.RunningAllreduceWithGPUs([0, 1], muji.Allreduce2) - else: - print('Skipping allreduce with 2 gpus. Not peer access ready.') - - def testAllreduceWithFourGPUs(self): - pattern = workspace.GetGpuPeerAccessPattern() - if pattern.shape[0] >= 4 and np.all(pattern[:4, :4]): - self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4) - else: - print('Skipping allreduce with 4 gpus. Not peer access ready.') - - def testAllreduceWithFourGPUsAndTwoGroups(self): - pattern = workspace.GetGpuPeerAccessPattern() - if pattern.shape[0] >= 4 and np.all(pattern[:2, :2]) and np.all(pattern[2:4, 2:4]): - self.RunningAllreduceWithGPUs([0, 1, 2, 3], muji.Allreduce4Group2) - else: - print('Skipping allreduce with 4 gpus and 2 groups. Not peer access ready.') - - def testAllreduceWithEightGPUs(self): - pattern = workspace.GetGpuPeerAccessPattern() - if ( - pattern.shape[0] >= 8 and np.all(pattern[:4, :4]) and - np.all(pattern[4:, 4:]) - ): - self.RunningAllreduceWithGPUs( - list(range(8)), muji.Allreduce8) - else: - print('Skipping allreduce with 8 gpus. 
Not peer access ready.')
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/caffe2/python/net_builder.py b/caffe2/python/net_builder.py
deleted file mode 100644
index a6e57f4dd972..000000000000
--- a/caffe2/python/net_builder.py
+++ /dev/null
@@ -1,743 +0,0 @@
-## @package net_builder
-# Module caffe2.python.net_builder
-
-
-
-
-
-from caffe2.python import core, context
-from caffe2.python.task import Task, TaskGroup
-from caffe2.python.control_ops_util import add_if_op, add_while_op
-
-
-class NetBuilder(context.Managed):
-    """
-    Scope-driven mechanism for building nets, loops and conditional blocks.
-    Args:
-        name: NetBuilder's name
-        initial_scope: list of blobs that are available for reading/writing
-    Example:
-        from caffe2.python.net_builder import NetBuilder, ops
-        with NetBuilder() as nb:
-            c = ops.Const(5)
-            d = ops.Const(0)
-            with ops.loop():
-                ops.stop_if(ops.LE([c, ops.Const(0)]))
-                ops.Add([c, ops.Const(-1)], [c])
-                with ops.If(ops.GE([c, ops.Const(3)])):
-                    ops.Add([d, ops.Const(10)], [d])
-            ops.Print(c, [])
-            ops.Print(d, [])
-        step = core.to_execution_step(nb)
-    """
-    def __init__(self, name=None, initial_scope=None, _stop_blob_required=False,
-                 _stop_blob=None, _fullname=None, _use_control_ops=False):
-        parent = NetBuilder.current(required=False)
-        assert not _fullname or not name, 'Cannot set both _fullname and name'
-        assert not _use_control_ops or \
-            (not _stop_blob_required and not _stop_blob), \
-            'Stop blobs are not used with control operators'
-        self.name = _fullname or '/'.join(
-            n for n in (parent.name if parent else None, name) if n
-        )
-        self._frozen = False
-        self._current_net = None
-        self._children = []
-        if parent:
-            # make sure parent has an up-to-date lexical scope computed
-            parent._update_lexical_scope()
-        self._init_lexical_scope = set(parent._lexical_scope) if parent else set()
-        if initial_scope:
-            self._init_lexical_scope |= set([str(b) for b in initial_scope])
-        self._lexical_scope = set(self._init_lexical_scope)
-        self._stop_blob = _stop_blob
-        self._stop_blob_required = _stop_blob_required
-        self._use_control_ops = _use_control_ops
-
-    def stop_blob(self):
-        """
-        Returns the BlobReference to the stop_blob of this NetBuilder.
-        If one is not yet available, creates one.
-        This function assumes that the stop_blob() will be used immediately
-        in the current net, so it doesn't initialize it if the current net is
-        the first of the builder.
-        """
-        assert not self._use_control_ops, \
-            'Stop blobs are not used with control operators'
-        if self._stop_blob is None:
-            net = self.current_net()
-            self._stop_blob = core.BlobReference(
-                net.NextName('stop_blob'), net=net)
-            net.Const(False, blob_out=self._stop_blob)
-            if self._current_net != self._children[0]:
-                self._children.insert(0, core.Net('stop_blob_init'))
-                self._children[0].Const(False, blob_out=self._stop_blob)
-        return self._stop_blob
-
-    def stop_if(self, blob):
-        assert not self._use_control_ops, \
-            'Stop blobs are not used with control operators'
-        stop_blob = self.stop_blob()
-        ops.Or([stop_blob, blob], [stop_blob])
-        self._current_net = None
-
-    def _assert_mutable(self):
-        assert not self._frozen, (
-            'This NetBuilder (%s) has been built already.' % self.name)
-
-    def _update_lexical_scope(self):
-        """
-        Updates lexical scope based on the current list of children.
-        Lexical scope contains names of blobs that are currently available
-        and were introduced in the net builder
-        """
-        self._lexical_scope = set(self._init_lexical_scope)
-        for child in self._children:
-            if isinstance(child, core.Net):
-                self._lexical_scope |= child.UsedBlobNames()
-            elif isinstance(child, NetBuilder) and child._use_control_ops:
-                self._lexical_scope |= child._lexical_scope
-
-    def _reset_children(self):
-        self._current_net = None
-        self._children = []
-        self._lexical_scope = set(self._init_lexical_scope)
-
-    def add(self, child):
-        self._assert_mutable()
-
-        if self._use_control_ops:
-            assert isinstance(child, core.Net) or (
-                isinstance(child, NetBuilder) and child._use_control_ops), \
-                "Expected Net or NetBuilder with control ops"
-
-        self._current_net = None
-        self._children.append(child)
-        # TODO: check that it's not a DAG net
-        if isinstance(child, core.Net):
-            self._current_net = child
-        self._update_lexical_scope()
-        return child
-
-    def current_net(self, name=None):
-        self._assert_mutable()
-        if self._current_net is None or name is not None:
-            self.add(core.Net(name))
-        return self._current_net
-
-    def freeze(self):
-        for child in self._children:
-            if hasattr(child, 'freeze'):
-                child.freeze()
-        self._current_net = None
-        self._frozen = True
-
-    def get(self):
-        self.freeze()
-        return self._children
-
-    def __exit__(self, etype, *args):
-        super().__exit__(etype, *args)
-
-        if self._use_control_ops and len(self._children) > 0:
-            _children = self._children
-            self._reset_children()
-            merged_net = NetBuilder.merge_nets(
-                _children, self._lexical_scope)
-            assert merged_net, "Expected a non-empty merge of children"
-            self._children = [merged_net]
-
-        self.freeze()
-        if etype is not None:
-            return
-        assert (not self._stop_blob_required) or self._stop_blob is not None, (
-            'This NetBuilder (%s) requires a stop condition ' % self.name +
-            'to be set with `stop` or `stop_if`')
-
-    @staticmethod
-    def merge_nets(nets_or_builders, outer_blob_names):
-        # Only nets or builders with control ops are allowed.
-        # Need to pay attention to external outputs, e.g.
-        #   ...
-        #   IfNet1 (cond_blob):
-        #     (Net1)
-        #       X = 1
-        #     IfNet2 (...):
-        #       X = X + 1
-        #   ...
-        # In this example there are two children in the then-branch of IfNet1:
-        # a subnet Net1 that creates blob X and sets its value to one, and
-        # a net builder IfNet2 that (conditionally) increments X.
-        # From IfNet2's point of view X is an external input
-        # and output blob; it will be put into IfNet2 net's external_output.
-        # At the same time, from the point of view of IfNet1 X is purely local.
-        # Net.AppendNet just merges external outputs of the networks, so
-        # without checking this the result of Net1.AppendNet(IfNet2's net)
-        # would have blob X in external_output.
-
-        net = None
-        for n in nets_or_builders:
-            cur = None
-            if isinstance(n, NetBuilder):
-                assert n._use_control_ops, \
-                    "Merging of NetBuilder supported only for control ops"
-                nets = n.get()
-                assert len(nets) == 1 and isinstance(nets[0], core.Net), \
-                    "Invalid control op net builder"
-                cur = nets[0]
-            else:
-                assert isinstance(n, core.Net)
-                cur = n
-            if net:
-                net.AppendNet(cur)
-            else:
-                net = cur
-        if net:
-            # correct external output
-            external_outputs = [o for o in net.Proto().external_output
-                                if o in outer_blob_names]
-            net.Proto().external_output[:] = external_outputs
-        return net
-
-    def __str__(self):
-        return self.name or 'Un-named NetBuilder'
-
-
-class Operations:
-    """
-    Operations to be used in the context of a NetBuilder.
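-
-    These methods are reached through the module-level singleton `ops` defined
-    at the bottom of this file; a small usage sketch:
-
-        from caffe2.python.net_builder import NetBuilder, ops
-        with NetBuilder() as nb:
-            x = ops.Const(5)
-            ops.Print(x, [])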
- """ - def net(self, net=None, name=None): - """ - Retrieves the current net, or add a new net to the builder. - Args: - net: If provided, add the given net to the active builder. - Else, returns the current Net or creates a new one as needed. - name: if provided, creates a new Net with given name and makes - it the new current net of the active builder. Cannot - be provided if net is provided. - """ - assert name is None or net is None, ( - 'Cannot provide both `net` and `name`.') - if net is not None: - NetBuilder.current().add(net) - return net - return NetBuilder.current().current_net(name=name) - - def __getattr__(self, op_type): - """ - Adds an operator call to the currently active Net. - """ - if op_type.startswith('__'): - raise AttributeError() - # We want hasattr to work properly even if no context is active. - if NetBuilder.current(required=False) is None: - raise AttributeError('No active NetBuilder.') - return getattr(self.net(), op_type) - - def task_group(self): - """ - Creates a local task group which will execute as the next step of - the current NetBuilder. - """ - from caffe2.python import task - group = NetBuilder.current() - with task.Cluster(): - with task.Node('local'): - tg = task.TaskGroup() - group.add(tg) - return tg - - def stop(self): - """ - Stop execution of the current execution step. - Example: - ops.Print(a, 0) - ops.stop() - ops.Print(b, 0) - In the example, 'b' will never be printed. - """ - return self.stop_if(ops.Const(True)) - - def stop_if(self, blob): - """ - Stop execution of the current execution step if the - condition `blob` is met. - Example: - ops.Print(a, 0) - ops.stop_if(ops.LE([x, ops.Const(0)])) - ops.Print(b, 0) - In the example, 'b' will only be printed if the value of scalar - tensor 'x' is greater than 0. - """ - return NetBuilder.current().stop_if(blob) - - def loop(self, iters=None, name=None): - """ - Creates a NetBuilder that will execute in a loop as the next step of - the current NetBuilder. If `iters` is provided, the loop will execute - for `iters` iterations and then stop. `iters` can be a constant or a - BlobReference. If `iters` is not provided, the loop will execute - until `ops.stop` or `ops.stop_if` is called. - Examples: - a = ops.Const(5) - with ops.loop(): - ops.stop_if(ops.LE([a, ops.Const(0)])) - ops.Print(a, 0) - ops.Add([a, ops.Const(-1)], [a]) - Above, 'a' will be printed 5 times, with values 5 to 1. - - with ops.loop(10) as loop: - ops.LogInfo(loop.iter()) - This will print the numbers from 0 to 9. - - x = ops.Add([ops.Const(10), ops.Const(10)]) - with ops.loop(x) as loop: - ops.LogInfo(loop.iter()) - This will print the numbers from 0 to 19. - """ - return NetBuilder.current().add(_Loop(iters, name=name)) - - def stop_guard(self, has_stopped_blob=None, name=None): - """ - Creates a NetBuilder that will execute once as the next step of the - current NetBuilder. After execution, a bool tensor will indicate - whether the inner execution was halted with `stop` or `stop_if`. - Example: - a = ops.Const(True) - with ops.stop_guard() as sg1: - ops.stop_if(a) - ops.Print(ops.Const('did not stop')) - b = ops.Const(False) - with ops.stop_guard() as sg2: - ops.stop_if(b) - ops.Print(ops.Const('did not stop')) - ops.Print(sg1.has_stopped(), []) - ops.Print(sg2.has_stopped(), []) - In the example, 'did not stop' will be printed once, - followed by True and False. 
- """ - return NetBuilder.current().add( - _StopGuard(has_stopped_blob=has_stopped_blob, name=name)) - - def If(self, cond, name=None): - """ - Creates a NetBuilder that will execute once as the next step of the - current NetBuilder if the blob `cond` is True. - Example: - with ops.If(ops.Const(True)): - ops.Print(ops.Const('Will print')) - with ops.If(ops.Const(False)): - ops.Print(ops.Const('Wont print')) - The example will print 'Will print' once. - """ - return NetBuilder.current().add(_RunIf(cond, name=name)) - - def IfNet(self, cond, name=None): - """ - Same as If, but uses 'If' operator instead of execution step logic - """ - return NetBuilder.current().add(_RunIfNet(cond, name=name)) - - def Else(self, name=None): - """ - Else branch of IfNet, has to be specified immediately after IfNet. - Example: - with ops.IfNet(ops.LT([x, y])): - ... - with ops.Else(): - ... - """ - return _RunElseNet(name=name) - - def WhileNet(self, name=None): - """ - NetBuilder for 'While' control operator - """ - return NetBuilder.current().add(_RunWhileNet(name=name)) - - def Condition(self, name=None): - """ - Loop's condition, executed within WhileNet context - """ - assert isinstance(NetBuilder.current(), _RunWhileNet), \ - "Use of Condition outside of WhileNet" - return _RunWhileCondition(name=name) - - def task_init(self): - """ - Defines operations that will be executed once at task startup. - Useful when implementing processors, that don't have access to the Task - top-level structure. - - This setup will be run only once, even if multiple instances of the task - will run in parallel. For instance-local initialization, use - `task_instance_init` instead. - - Example: - def my_processor(rec): - with ops.task_init(): - one = ops.Const(1) - two = ops.Const(1) - return Tuple( - ops.Add(rec[0](), zero), ops.Add(rec[1](), two)) - """ - setup = _SetupBuilder(_SetupBuilder.INIT) - self.net().add_attribute(Task.TASK_SETUP, setup) - return setup - - def task_exit(self): - """ - Define operations to be executed once at task shutdown. - Useful when implementing processors, that don't have access to the Task - top-level structure. - - This shutdown will be run only once, after all concurrent instances of - the task have already finished. For instance-local shutdown, - use `task_instance_exit` instead. - - Example: - def read_queue(queue): - with ops.task_exit(): - queue.close(ops.net()) - return queue.read(ops.net()) - """ - setup = _SetupBuilder(_SetupBuilder.EXIT) - self.net().add_attribute(Task.TASK_SETUP, setup) - return setup - - def task_instance_init(self): - """ - Defines operations that will be executed once at startup of each - instance of a task. This can be seen as "thread_local" initialization. - It is guaranteed to run only after all `task_init` logic finishes. - - This setup will be run concurrently for each instance of a task. - For global task initialization, use `task_init` instead. - """ - setup = _SetupBuilder(_SetupBuilder.INIT) - self.net().add_attribute(Task.TASK_INSTANCE_SETUP, setup) - return setup - - def task_instance_exit(self): - """ - Defines operations that will be executed once at shutdown of each - instance of a task. This can be seen as "thread_local" finalization. - - This shutdown will be run concurrently for each instance of a task. - For global task shutdown, use `task_exit` instead. 
- """ - setup = _SetupBuilder(_SetupBuilder.EXIT) - self.net().add_attribute(Task.TASK_INSTANCE_SETUP, setup) - return setup - - def local_init(self): - """ - Similar to `task_init`, but executes at TaskGroup's startup instead, - before any task of the group starts executing. This will run only - once on each node, before initialization of any task, so it can be - used e.g. to initialize blobs shared across tasks. - """ - setup = _SetupBuilder(_SetupBuilder.INIT) - self.net().add_attribute(TaskGroup.LOCAL_SETUP, setup) - return setup - - def local_exit(self, name=None): - """ - Similar to `task_exit`, but executes at TaskGroup's exit instead, - after all tasks of the group finished execution. - This will run only once on each node. - """ - setup = _SetupBuilder(_SetupBuilder.EXIT, name) - self.net().add_attribute(TaskGroup.LOCAL_SETUP, setup) - return setup - - def task_reporter(self, interval_ms=1000, name=None): - """ - Define operations to be executed at every time interval from - task start-up to finish. These operations are guaranteed to - execute at least once after all other operations of the task are - finished. - - Example: - with ops.task_reporter(interval_ms=10000): - ops.LogInfo('10s elapsed') - """ - return _ReporterBuilder(interval_ms, net=self.net(), name=name) - - def local_reporter(self, interval_ms=1000, name=None): - """ - Similar to task_report, but operations defined within this block - will run repeatedly for as long as any of the tasks in the current - TaskGroup have not finished. - """ - return _ReporterBuilder(interval_ms, name=name) - - -ops = Operations() - - -class _ReporterBuilder(NetBuilder): - def __init__(self, interval_ms, net=None, name=None): - NetBuilder.__init__(self, name) - self._net = net - self.interval_ms = interval_ms - - def __exit__(self, etype, *args): - if etype is None: - step = core.to_execution_step(self) - step.RunEveryMillis(self.interval_ms) - if self._net: - self._net.add_attribute(Task.REPORT_STEP, step) - else: - TaskGroup.current().report_step( - step, interval_ms=self.interval_ms) - NetBuilder.__exit__(self, etype, *args) - - -class _SetupBuilder(NetBuilder): - INIT = 'init' - EXIT = 'exit' - - def __init__(self, type, name=None): - NetBuilder.__init__(self, name) - self.type = type - - def setup(self, net): - if self.type == _SetupBuilder.INIT: - return core.to_execution_step(self) - - def exit(self, net): - if self.type == _SetupBuilder.EXIT: - return core.to_execution_step(self) - - -class _RunOnce(NetBuilder): - def __init__(self, name=None): - NetBuilder.__init__(self, name) - - def __exit__(self, etype, *args): - if etype is None and self._stop_blob is not None: - ops.stop() - NetBuilder.__exit__(self, etype, *args) - - -class _StopGuard(_RunOnce): - def __init__(self, has_stopped_blob=None, name=None): - _RunOnce.__init__(self, name) - self._stopped = has_stopped_blob - self._ran = False - - def __enter__(self): - r = _RunOnce.__enter__(self) - self._stopped = ops.Const(True, blob_out=self._stopped) - return r - - def __exit__(self, etype, *args): - if etype is None: - self._ran = True - ops.Const(False, blob_out=self._stopped) - _RunOnce.__exit__(self, etype, *args) - - def has_stopped(self): - """ - Return a blob that will be set to scalar bool `True` after - this net builder ran, iff it was halted early. - """ - assert self._ran, 'Context not used yet.' 
- return self._stopped - - -class _Loop(NetBuilder): - def __init__(self, iters=None, name=None): - NetBuilder.__init__(self, name, _stop_blob_required=True) - if iters is not None: - self._inc = ops.Const(1) - self._iter = ops.Const(0) - self._num_iters = ( - iters if isinstance(iters, core.BlobReference) - else ops.Const(iters)) - else: - self._num_iters = None - - def iter(self): - assert self._num_iters is not None, ( - 'This loop does not have a number of iterations.') - assert self._iter is not None, ( - 'iter() must be called from inside the loop context') - return self._iter - - def __enter__(self): - builder = NetBuilder.__enter__(self) - if self._num_iters is not None: - ops.stop_if(ops.GE([self._iter, self._num_iters])) - return builder - - def __exit__(self, type, *args): - if type is None and self._num_iters is not None: - self.current_net().Add([self._iter, self._inc], [self._iter]) - NetBuilder.__exit__(self, type, *args) - - -class _RunIf(_RunOnce): - def __init__(self, cond_blob=None, name=None, _already_ran=None): - _RunOnce.__init__(self, name) - assert cond_blob or _already_ran - self._is_else = cond_blob is None - if _already_ran is None: - self._else_blob = ops.Not(cond_blob) - self._already_ran = ops.Const(False) - else: - self._already_ran = _already_ran - self._else_blob = _already_ran if cond_blob is None else ( - ops.Or([_already_ran, ops.Not(cond_blob)])) - - def __enter__(self): - r = _RunOnce.__enter__(self) - ops.stop_if(self._else_blob) - ops.Const(True, blob_out=self._already_ran) - return r - - def Elif(self, cond, name=None): - assert not self._is_else, 'Else not allowed for an Else.' - return NetBuilder.current().add(_RunIf( - cond, name=name or self.name, _already_ran=self._already_ran)) - - def Else(self, name=None): - assert not self._is_else, 'Elif not allowed for an Else.' 
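# the shared _already_ran blob guarantees that at most one branch of an
# If/Elif/Else chain actually executes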
- return NetBuilder.current().add( - _RunIf(name=name or self.name, _already_ran=self._already_ran)) - - -class _RunIfNet(NetBuilder): - """ - Generates a single net that uses If operator - """ - def __init__(self, cond_blob, name=None): - NetBuilder.__init__(self, name=name, _use_control_ops=True) - assert cond_blob, 'Conditional blob is not specified for an If net' - self._cond_blob = cond_blob - self._then_net = None - self._else_net = None - - def add(self, child): - return NetBuilder.add(self, child) - - def __exit__(self, type, *args): - if type is None: - _then_nets = self._children - self._reset_children() - - self._then_net = NetBuilder.merge_nets( - _then_nets, self._lexical_scope) - if not self._then_net: - self._then_net = core.Net('empty_then_net') - - if_net = core.Net(self.name + '/if_net') - add_if_op(if_net, self._cond_blob, self._lexical_scope, - self._then_net, self._else_net) - - self._current_net = if_net - self._children = [if_net] - NetBuilder.__exit__(self, type, *args) - - -class _RunElseNet(NetBuilder): - """ - Else branch for _RunIfNet builder - """ - def __init__(self, name=None): - NetBuilder.__init__(self, name=name, _use_control_ops=True) - parent = NetBuilder.current(required=False) - assert parent and len(parent._children) > 0 and \ - isinstance(parent._children[-1], _RunIfNet), \ - 'Invalid use of Else builder' - self._if_builder = parent._children[-1] - - def __exit__(self, type, *args): - if type is None: - _else_nets = self._children - self._reset_children() - - self._if_builder._else_net = NetBuilder.merge_nets( - _else_nets, self._lexical_scope) - if self._if_builder._else_net: - if_else_net = core.Net(self.name + '/if_else_net') - add_if_op( - if_else_net, - self._if_builder._cond_blob, - self._lexical_scope, - self._if_builder._then_net, - self._if_builder._else_net) - self._if_builder._current_net = if_else_net - self._if_builder._children = [if_else_net] - NetBuilder.__exit__(self, type, *args) - - -class _RunWhileNet(NetBuilder): - """ - Generates a single net that uses While operator - """ - def __init__(self, name=None): - NetBuilder.__init__(self, name=name, _use_control_ops=True) - self._cond_builder = None - - def __exit__(self, type, *args): - if type is None: - assert self._cond_builder, \ - 'Condition builder must be specified in While op' - - _cond_blob = self._cond_builder._cond_blob - _cond_net = self._cond_builder._cond_net - - loop_body = self._children - self._reset_children() - loop_body_net = NetBuilder.merge_nets( - loop_body, self._lexical_scope) - if not loop_body_net: - loop_body_net = core.Net('empty_loop_body_net') - - while_net = core.Net(self.name + '/while_net') - add_while_op(while_net, _cond_blob, self._lexical_scope, - loop_body_net, _cond_net) - - self._current_net = while_net - self._children = [while_net] - NetBuilder.__exit__(self, type, *args) - - -class _RunWhileCondition(NetBuilder): - """ - Computes loop's condition, used in the context of WhileNet. 
- Last operator must have a single scalar boolean output that will be used - as a condition value, no other blobs created in the condition net are - visible outside of it - """ - def __init__(self, name=None): - NetBuilder.__init__(self, name=name, _use_control_ops=True) - parent = NetBuilder.current(required=False) - assert parent and isinstance(parent, _RunWhileNet), \ - 'Invalid use of loop condition builder' - assert not parent._cond_builder, \ - 'Multiple loop condition builders specified' - assert len(parent._children) == 0, \ - 'Condition definition must be specified before the loop\'s body' - parent._cond_builder = self - self._cond_blob = None - self._cond_net = None - - def __exit__(self, type, *args): - if type is None: - condition_body = self._children - self._reset_children() - self._cond_net = NetBuilder.merge_nets( - condition_body, self._lexical_scope) - assert self._cond_net, 'Invalid loop condition specified' - assert len(self._cond_net.Proto().op) > 0, 'Invalid condition net' - last_op = self._cond_net.Proto().op[-1] - assert len(last_op.output) == 1, 'Invalid condition net' - self._cond_blob = core.BlobReference(name=last_op.output[0], net=None) - - self._current_net = self._cond_net - self._children = [self._cond_net] - NetBuilder.__exit__(self, type, *args) diff --git a/caffe2/python/net_builder_test.py b/caffe2/python/net_builder_test.py deleted file mode 100644 index ed0d0fb01d04..000000000000 --- a/caffe2/python/net_builder_test.py +++ /dev/null @@ -1,332 +0,0 @@ - - - - - -from caffe2.python import workspace -from caffe2.python.core import Plan, to_execution_step, Net -from caffe2.python.task import Task, TaskGroup, final_output -from caffe2.python.net_builder import ops, NetBuilder -from caffe2.python.session import LocalSession -import unittest -import threading - - -class PythonOpStats: - lock = threading.Lock() - num_instances = 0 - num_calls = 0 - - -def python_op_builder(): - PythonOpStats.lock.acquire() - PythonOpStats.num_instances += 1 - PythonOpStats.lock.release() - - def my_op(inputs, outputs): - PythonOpStats.lock.acquire() - PythonOpStats.num_calls += 1 - PythonOpStats.lock.release() - - return my_op - - -def _test_loop(): - x = ops.Const(5) - y = ops.Const(0) - with ops.loop(): - ops.stop_if(ops.EQ([x, ops.Const(0)])) - ops.Add([x, ops.Const(-1)], [x]) - ops.Add([y, ops.Const(1)], [y]) - return y - - -def _test_inner_stop(x): - ops.stop_if(ops.LT([x, ops.Const(5)])) - - -def _test_outer(): - x = ops.Const(10) - # test stop_if(False) - with ops.stop_guard() as g1: - _test_inner_stop(x) - - # test stop_if(True) - y = ops.Const(3) - with ops.stop_guard() as g2: - _test_inner_stop(y) - - # test no stop - with ops.stop_guard() as g4: - ops.Const(0) - - # test empty clause - with ops.stop_guard() as g3: - pass - - return ( - g1.has_stopped(), g2.has_stopped(), g3.has_stopped(), g4.has_stopped()) - - -def _test_if(x): - y = ops.Const(1) - with ops.If(ops.GT([x, ops.Const(50)])): - ops.Const(2, blob_out=y) - with ops.If(ops.LT([x, ops.Const(50)])): - ops.Const(3, blob_out=y) - ops.stop() - ops.Const(4, blob_out=y) - return y - - -class TestNetBuilder(unittest.TestCase): - def test_ops(self): - with NetBuilder() as nb: - y = _test_loop() - z, w, a, b = _test_outer() - p = _test_if(ops.Const(75)) - q = _test_if(ops.Const(25)) - plan = Plan('name') - plan.AddStep(to_execution_step(nb)) - ws = workspace.C.Workspace() - ws.run(plan) - expected_results = [ - (y, 5), - (z, False), - (w, True), - (a, False), - (b, False), - (p, 2), - (q, 3), - ] - for b, expected 
in expected_results: - actual = ws.blobs[str(b)].fetch() - self.assertEqual(actual, expected) - - def _expected_loop(self): - total = 0 - total_large = 0 - total_small = 0 - total_tiny = 0 - for loop_iter in range(10): - outer = loop_iter * 10 - for inner_iter in range(loop_iter): - val = outer + inner_iter - if val >= 80: - total_large += val - elif val >= 50: - total_small += val - else: - total_tiny += val - total += val - return total, total_large, total_small, total_tiny - - def _actual_loop(self): - total = ops.Const(0) - total_large = ops.Const(0) - total_small = ops.Const(0) - total_tiny = ops.Const(0) - with ops.loop(10) as loop: - outer = ops.Mul([loop.iter(), ops.Const(10)]) - with ops.loop(loop.iter()) as inner: - val = ops.Add([outer, inner.iter()]) - with ops.If(ops.GE([val, ops.Const(80)])) as c: - ops.Add([total_large, val], [total_large]) - with c.Elif(ops.GE([val, ops.Const(50)])) as c: - ops.Add([total_small, val], [total_small]) - with c.Else(): - ops.Add([total_tiny, val], [total_tiny]) - ops.Add([total, val], total) - return [ - final_output(x) - for x in [total, total_large, total_small, total_tiny] - ] - - def test_net_multi_use(self): - with Task() as task: - total = ops.Const(0) - net = Net('my_net') - net.Add([total, net.Const(1)], [total]) - ops.net(net) - ops.net(net) - result = final_output(total) - with LocalSession() as session: - session.run(task) - self.assertEqual(2, result.fetch()) - - def test_loops(self): - with Task() as task: - out_actual = self._actual_loop() - with LocalSession() as session: - session.run(task) - expected = self._expected_loop() - actual = [o.fetch() for o in out_actual] - for e, a in zip(expected, actual): - self.assertEqual(e, a) - - def test_setup(self): - with Task() as task: - with ops.task_init(): - one = ops.Const(1) - two = ops.Add([one, one]) - with ops.task_init(): - three = ops.Const(3) - accum = ops.Add([two, three]) - # here, accum should be 5 - with ops.task_exit(): - # here, accum should be 6, since this executes after lines below - seven_1 = ops.Add([accum, one]) - six = ops.Add([accum, one]) - ops.Add([accum, one], [accum]) - seven_2 = ops.Add([accum, one]) - o6 = final_output(six) - o7_1 = final_output(seven_1) - o7_2 = final_output(seven_2) - with LocalSession() as session: - session.run(task) - self.assertEqual(o6.fetch(), 6) - self.assertEqual(o7_1.fetch(), 7) - self.assertEqual(o7_2.fetch(), 7) - - def test_multi_instance_python_op(self): - """ - When task instances are created at runtime, C++ concurrently creates - multiple instances of operators in C++, and concurrently destroys them - once the task is finished. This means that the destructor of PythonOp - will be called concurrently, so the GIL must be acquired. This - test exercises this condition. 
- """ - with Task(num_instances=64) as task: - with ops.loop(4): - ops.Python((python_op_builder, [], {}))([], []) - with LocalSession() as session: - PythonOpStats.num_instances = 0 - PythonOpStats.num_calls = 0 - session.run(task) - self.assertEqual(PythonOpStats.num_instances, 64) - self.assertEqual(PythonOpStats.num_calls, 256) - - def test_multi_instance(self): - NUM_INSTANCES = 10 - NUM_ITERS = 15 - with TaskGroup() as tg: - with Task(num_instances=NUM_INSTANCES): - with ops.task_init(): - counter1 = ops.CreateCounter([], ['global_counter']) - counter2 = ops.CreateCounter([], ['global_counter2']) - counter3 = ops.CreateCounter([], ['global_counter3']) - # both task_counter and local_counter should be thread local - with ops.task_instance_init(): - task_counter = ops.CreateCounter([], ['task_counter']) - local_counter = ops.CreateCounter([], ['local_counter']) - with ops.loop(NUM_ITERS): - ops.CountUp(counter1) - ops.CountUp(task_counter) - ops.CountUp(local_counter) - # gather sum of squares of local counters to make sure that - # each local counter counted exactly up to NUM_ITERS, and - # that there was no false sharing of counter instances. - with ops.task_instance_exit(): - count2 = ops.RetrieveCount(task_counter) - with ops.loop(ops.Mul([count2, count2])): - ops.CountUp(counter2) - # This should have the same effect as the above - count3 = ops.RetrieveCount(local_counter) - with ops.loop(ops.Mul([count3, count3])): - ops.CountUp(counter3) - # The code below will only run once - with ops.task_exit(): - total1 = final_output(ops.RetrieveCount(counter1)) - total2 = final_output(ops.RetrieveCount(counter2)) - total3 = final_output(ops.RetrieveCount(counter3)) - - with LocalSession() as session: - session.run(tg) - self.assertEqual(total1.fetch(), NUM_INSTANCES * NUM_ITERS) - self.assertEqual(total2.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2)) - self.assertEqual(total3.fetch(), NUM_INSTANCES * (NUM_ITERS ** 2)) - - def test_if_net(self): - with NetBuilder() as nb: - x0 = ops.Const(0) - x1 = ops.Const(1) - x2 = ops.Const(2) - y0 = ops.Const(0) - y1 = ops.Const(1) - y2 = ops.Const(2) - - # basic logic - first_res = ops.Const(0) - with ops.IfNet(ops.Const(True)): - ops.Const(1, blob_out=first_res) - with ops.Else(): - ops.Const(2, blob_out=first_res) - - second_res = ops.Const(0) - with ops.IfNet(ops.Const(False)): - ops.Const(1, blob_out=second_res) - with ops.Else(): - ops.Const(2, blob_out=second_res) - - # nested and sequential ifs, - # empty then/else, - # passing outer blobs into branches, - # writing into outer blobs, incl. 
into input blob - # using local blobs - with ops.IfNet(ops.LT([x0, x1])): - local_blob = ops.Const(900) - ops.Add([ops.Const(100), local_blob], [y0]) - - gt = ops.GT([x1, x2]) - with ops.IfNet(gt): - # empty then - pass - with ops.Else(): - ops.Add([y1, local_blob], [local_blob]) - ops.Add([ops.Const(100), y1], [y1]) - - with ops.IfNet(ops.EQ([local_blob, ops.Const(901)])): - ops.Const(7, blob_out=y2) - ops.Add([y1, y2], [y2]) - with ops.Else(): - # empty else - pass - - plan = Plan('if_net_test') - plan.AddStep(to_execution_step(nb)) - ws = workspace.C.Workspace() - ws.run(plan) - - first_res_value = ws.blobs[str(first_res)].fetch() - second_res_value = ws.blobs[str(second_res)].fetch() - y0_value = ws.blobs[str(y0)].fetch() - y1_value = ws.blobs[str(y1)].fetch() - y2_value = ws.blobs[str(y2)].fetch() - - self.assertEqual(first_res_value, 1) - self.assertEqual(second_res_value, 2) - self.assertEqual(y0_value, 1000) - self.assertEqual(y1_value, 101) - self.assertEqual(y2_value, 108) - self.assertTrue(str(local_blob) not in ws.blobs) - - def test_while_net(self): - with NetBuilder() as nb: - x = ops.Const(0) - y = ops.Const(0) - with ops.WhileNet(): - with ops.Condition(): - ops.Add([x, ops.Const(1)], [x]) - ops.LT([x, ops.Const(7)]) - ops.Add([x, y], [y]) - - plan = Plan('while_net_test') - plan.AddStep(to_execution_step(nb)) - ws = workspace.C.Workspace() - ws.run(plan) - - x_value = ws.blobs[str(x)].fetch() - y_value = ws.blobs[str(y)].fetch() - - self.assertEqual(x_value, 7) - self.assertEqual(y_value, 21) diff --git a/caffe2/python/net_drawer.py b/caffe2/python/net_drawer.py deleted file mode 100644 index 71c72fc5983a..000000000000 --- a/caffe2/python/net_drawer.py +++ /dev/null @@ -1,411 +0,0 @@ -## @package net_drawer -# Module caffe2.python.net_drawer - - - - -import argparse -import json -import logging -from collections import defaultdict -from caffe2.python import utils - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) - -try: - import pydot -except ImportError: - logger.info( - 'Cannot import pydot, which is required for drawing a network. This ' - 'can usually be installed in python with "pip install pydot". Also, ' - 'pydot requires graphviz to convert dot files to pdf: in ubuntu, this ' - 'can usually be installed with "sudo apt-get install graphviz".' - ) - print( - 'net_drawer will not run correctly. Please install the correct ' - 'dependencies.' 
- ) - pydot = None - -from caffe2.proto import caffe2_pb2 - -OP_STYLE = { - 'shape': 'box', - 'color': '#0F9D58', - 'style': 'filled', - 'fontcolor': '#FFFFFF' -} -BLOB_STYLE = {'shape': 'octagon'} - - -def _rectify_operator_and_name(operators_or_net, name): - """Gets the operators and name for the pydot graph.""" - if isinstance(operators_or_net, caffe2_pb2.NetDef): - operators = operators_or_net.op - if name is None: - name = operators_or_net.name - elif hasattr(operators_or_net, 'Proto'): - net = operators_or_net.Proto() - if not isinstance(net, caffe2_pb2.NetDef): - raise RuntimeError( - "Expecting NetDef, but got {}".format(type(net))) - operators = net.op - if name is None: - name = net.name - else: - operators = operators_or_net - if name is None: - name = "unnamed" - return operators, name - - -def _escape_label(name): - # json.dumps is poor man's escaping - return json.dumps(name) - - -def GetOpNodeProducer(append_output, **kwargs): - def ReallyGetOpNode(op, op_id): - if op.name: - node_name = '%s/%s (op#%d)' % (op.name, op.type, op_id) - else: - node_name = '%s (op#%d)' % (op.type, op_id) - if append_output: - for output_name in op.output: - node_name += '\n' + output_name - return pydot.Node(node_name, **kwargs) - return ReallyGetOpNode - - -def GetBlobNodeProducer(**kwargs): - def ReallyGetBlobNode(node_name, label): - return pydot.Node(node_name, label=label, **kwargs) - return ReallyGetBlobNode - -def GetPydotGraph( - operators_or_net, - name=None, - rankdir='LR', - op_node_producer=None, - blob_node_producer=None -): - if op_node_producer is None: - op_node_producer = GetOpNodeProducer(False, **OP_STYLE) - if blob_node_producer is None: - blob_node_producer = GetBlobNodeProducer(**BLOB_STYLE) - operators, name = _rectify_operator_and_name(operators_or_net, name) - graph = pydot.Dot(name, rankdir=rankdir) - pydot_nodes = {} - pydot_node_counts = defaultdict(int) - for op_id, op in enumerate(operators): - op_node = op_node_producer(op, op_id) - graph.add_node(op_node) - # print 'Op: %s' % op.name - # print 'inputs: %s' % str(op.input) - # print 'outputs: %s' % str(op.output) - for input_name in op.input: - if input_name not in pydot_nodes: - input_node = blob_node_producer( - _escape_label( - input_name + str(pydot_node_counts[input_name])), - label=_escape_label(input_name), - ) - pydot_nodes[input_name] = input_node - else: - input_node = pydot_nodes[input_name] - graph.add_node(input_node) - graph.add_edge(pydot.Edge(input_node, op_node)) - for output_name in op.output: - if output_name in pydot_nodes: - # we are overwriting an existing blob. need to update the count. - pydot_node_counts[output_name] += 1 - output_node = blob_node_producer( - _escape_label( - output_name + str(pydot_node_counts[output_name])), - label=_escape_label(output_name), - ) - pydot_nodes[output_name] = output_node - graph.add_node(output_node) - graph.add_edge(pydot.Edge(op_node, output_node)) - return graph - - -def GetPydotGraphMinimal( - operators_or_net, - name=None, - rankdir='LR', - minimal_dependency=False, - op_node_producer=None, -): - """Different from GetPydotGraph, hide all blob nodes and only show op nodes. - - If minimal_dependency is set as well, for each op, we will only draw the - edges to the minimal necessary ancestors. For example, if op c depends on - op a and b, and op b depends on a, then only the edge b->c will be drawn - because a->c will be implied. 
- """ - if op_node_producer is None: - op_node_producer = GetOpNodeProducer(False, **OP_STYLE) - operators, name = _rectify_operator_and_name(operators_or_net, name) - graph = pydot.Dot(name, rankdir=rankdir) - # blob_parents maps each blob name to its generating op. - blob_parents = {} - # op_ancestry records the ancestors of each op. - op_ancestry = defaultdict(set) - for op_id, op in enumerate(operators): - op_node = op_node_producer(op, op_id) - graph.add_node(op_node) - # Get parents, and set up op ancestry. - parents = [ - blob_parents[input_name] for input_name in op.input - if input_name in blob_parents - ] - op_ancestry[op_node].update(parents) - for node in parents: - op_ancestry[op_node].update(op_ancestry[node]) - if minimal_dependency: - # only add nodes that do not have transitive ancestry - for node in parents: - if all( - [node not in op_ancestry[other_node] - for other_node in parents] - ): - graph.add_edge(pydot.Edge(node, op_node)) - else: - # Add all parents to the graph. - for node in parents: - graph.add_edge(pydot.Edge(node, op_node)) - # Update blob_parents to reflect that this op created the blobs. - for output_name in op.output: - blob_parents[output_name] = op_node - return graph - - -def GetOperatorMapForPlan(plan_def): - operator_map = {} - for net_id, net in enumerate(plan_def.network): - if net.HasField('name'): - operator_map[plan_def.name + "_" + net.name] = net.op - else: - operator_map[plan_def.name + "_network_%d" % net_id] = net.op - return operator_map - - -def _draw_nets(nets, g): - nodes = [] - for i, net in enumerate(nets): - nodes.append(pydot.Node(_escape_label(net))) - g.add_node(nodes[-1]) - if i > 0: - g.add_edge(pydot.Edge(nodes[-2], nodes[-1])) - return nodes - - -def _draw_steps(steps, g, skip_step_edges=False): # noqa - kMaxParallelSteps = 3 - - def get_label(): - label = [step.name + '\n'] - if step.report_net: - label.append('Reporter: {}'.format(step.report_net)) - if step.should_stop_blob: - label.append('Stopper: {}'.format(step.should_stop_blob)) - if step.concurrent_substeps: - label.append('Concurrent') - if step.only_once: - label.append('Once') - return '\n'.join(label) - - def substep_edge(start, end): - return pydot.Edge(start, end, arrowhead='dot', style='dashed') - - nodes = [] - for i, step in enumerate(steps): - parallel = step.concurrent_substeps - - nodes.append(pydot.Node(_escape_label(get_label()), **OP_STYLE)) - g.add_node(nodes[-1]) - - if i > 0 and not skip_step_edges: - g.add_edge(pydot.Edge(nodes[-2], nodes[-1])) - - if step.network: - sub_nodes = _draw_nets(step.network, g) - elif step.substep: - if parallel: - sub_nodes = _draw_steps( - step.substep[:kMaxParallelSteps], g, skip_step_edges=True) - else: - sub_nodes = _draw_steps(step.substep, g) - else: - raise ValueError('invalid step') - - if parallel: - for sn in sub_nodes: - g.add_edge(substep_edge(nodes[-1], sn)) - if len(step.substep) > kMaxParallelSteps: - ellipsis = pydot.Node('{} more steps'.format( - len(step.substep) - kMaxParallelSteps), **OP_STYLE) - g.add_node(ellipsis) - g.add_edge(substep_edge(nodes[-1], ellipsis)) - else: - g.add_edge(substep_edge(nodes[-1], sub_nodes[0])) - - return nodes - - -def GetPlanGraph(plan_def, name=None, rankdir='TB'): - graph = pydot.Dot(name, rankdir=rankdir) - _draw_steps(plan_def.execution_step, graph) - return graph - - -def GetGraphInJson(operators_or_net, output_filepath): - operators, _ = _rectify_operator_and_name(operators_or_net, None) - blob_strid_to_node_id = {} - node_name_counts = defaultdict(int) - nodes 
= [] - edges = [] - for op_id, op in enumerate(operators): - op_label = op.name + '/' + op.type if op.name else op.type - op_node_id = len(nodes) - nodes.append({ - 'id': op_node_id, - 'label': op_label, - 'op_id': op_id, - 'type': 'op' - }) - for input_name in op.input: - strid = _escape_label( - input_name + str(node_name_counts[input_name])) - if strid not in blob_strid_to_node_id: - input_node = { - 'id': len(nodes), - 'label': input_name, - 'type': 'blob' - } - blob_strid_to_node_id[strid] = len(nodes) - nodes.append(input_node) - else: - input_node = nodes[blob_strid_to_node_id[strid]] - edges.append({ - 'source': blob_strid_to_node_id[strid], - 'target': op_node_id - }) - for output_name in op.output: - strid = _escape_label( - output_name + str(node_name_counts[output_name])) - if strid in blob_strid_to_node_id: - # we are overwriting an existing blob. need to update the count. - node_name_counts[output_name] += 1 - strid = _escape_label( - output_name + str(node_name_counts[output_name])) - - if strid not in blob_strid_to_node_id: - output_node = { - 'id': len(nodes), - 'label': output_name, - 'type': 'blob' - } - blob_strid_to_node_id[strid] = len(nodes) - nodes.append(output_node) - edges.append({ - 'source': op_node_id, - 'target': blob_strid_to_node_id[strid] - }) - - with open(output_filepath, 'w') as f: - json.dump({'nodes': nodes, 'edges': edges}, f) - - -# A dummy minimal PNG image used by GetGraphPngSafe as a -# placeholder when rendering fail to run. -_DummyPngImage = ( - b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00' - b'\x01\x01\x00\x00\x00\x007n\xf9$\x00\x00\x00\nIDATx\x9cc`\x00\x00' - b'\x00\x02\x00\x01H\xaf\xa4q\x00\x00\x00\x00IEND\xaeB`\x82') - - -def GetGraphPngSafe(func, *args, **kwargs): - """ - Invokes `func` (e.g. GetPydotGraph) with args. If anything fails - returns - and empty image instead of throwing Exception - """ - try: - graph = func(*args, **kwargs) - if not isinstance(graph, pydot.Dot): - raise ValueError("func is expected to return pydot.Dot") - return graph.create_png() - except Exception as e: - logger.error("Failed to draw graph: {}".format(e)) - return _DummyPngImage - - -def main(): - parser = argparse.ArgumentParser(description="Caffe2 net drawer.") - parser.add_argument( - "--input", - type=str, required=True, - help="The input protobuf file." - ) - parser.add_argument( - "--output_prefix", - type=str, default="", - help="The prefix to be added to the output filename." - ) - parser.add_argument( - "--minimal", action="store_true", - help="If set, produce a minimal visualization." - ) - parser.add_argument( - "--minimal_dependency", action="store_true", - help="If set, only draw minimal dependency." - ) - parser.add_argument( - "--append_output", action="store_true", - help="If set, append the output blobs to the operator names.") - parser.add_argument( - "--rankdir", type=str, default="LR", - help="The rank direction of the pydot graph." 
- ) - args = parser.parse_args() - with open(args.input, 'r') as fid: - content = fid.read() - graphs = utils.GetContentFromProtoString( - content, { - caffe2_pb2.PlanDef: GetOperatorMapForPlan, - caffe2_pb2.NetDef: lambda x: {x.name: x.op}, - } - ) - for key, operators in graphs.items(): - if args.minimal: - graph = GetPydotGraphMinimal( - operators, - name=key, - rankdir=args.rankdir, - node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE), - minimal_dependency=args.minimal_dependency) - else: - graph = GetPydotGraph( - operators, - name=key, - rankdir=args.rankdir, - node_producer=GetOpNodeProducer(args.append_output, **OP_STYLE)) - filename = args.output_prefix + graph.get_name() + '.dot' - graph.write(filename, format='raw') - pdf_filename = filename[:-3] + 'pdf' - try: - graph.write_pdf(pdf_filename) - except Exception: - print( - 'Error when writing out the pdf file. Pydot requires graphviz ' - 'to convert dot files to pdf, and you may not have installed ' - 'graphviz. On ubuntu this can usually be installed with "sudo ' - 'apt-get install graphviz". We have generated the .dot file ' - 'but will not be able to generate pdf file for now.' - ) - - -if __name__ == '__main__': - main() diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py deleted file mode 100644 index b7b3bba4542c..000000000000 --- a/caffe2/python/net_printer.py +++ /dev/null @@ -1,424 +0,0 @@ -## @package net_printer -# Module caffe2.python.net_printer - - - - - -from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef -from caffe2.python.checkpoint import Job -from caffe2.python.core import Net, ExecutionStep, Plan -from caffe2.python.task import Task, TaskGroup, WorkspaceType, TaskOutput -from collections import defaultdict -from contextlib import contextmanager -from copy import copy -from itertools import chain - - -class Visitor: - @classmethod - def register(cls, Type): - if not(hasattr(cls, 'visitors')): - cls.visitors = {} - else: - assert Type not in cls.visitors, \ - '{} already registered!'.format(Type) - - def _register(func): - cls.visitors[Type] = func - return func - - return _register - - def __call__(self, obj, *args, **kwargs): - if obj is None: - return - - Type = type(obj) - if Type not in self.__class__.visitors: - raise TypeError('%s: unsupported object type: %s' % ( - self.__class__.__name__, Type)) - - func = self.__class__.visitors[Type] - return func(self, obj, *args, **kwargs) - - -class Analyzer(Visitor): - PREFIXES_TO_IGNORE = {'distributed_ctx_init'} - - def __init__(self): - self.workspaces = defaultdict(lambda: defaultdict(lambda: 0)) - self.workspace_ctx = [] - - @property - def workspace(self): - return self.workspace_ctx[-1] - - @contextmanager - def set_workspace(self, node=None, ws=None, do_copy=False): - if ws is not None: - ws = ws - elif node is not None: - ws = self.workspaces[str(node)] - else: - ws = self.workspace - if do_copy: - ws = copy(ws) - self.workspace_ctx.append(ws) - try: - yield ws - finally: - del self.workspace_ctx[-1] - - def define_blob(self, blob): - self.workspace[blob] += 1 - - def need_blob(self, blob): - if any(blob.startswith(p) for p in Analyzer.PREFIXES_TO_IGNORE): - return - assert blob in self.workspace, 'Blob undefined: %s' % blob - - -@Analyzer.register(OperatorDef) -def analyze_op(analyzer, op): - for x in op.input: - analyzer.need_blob(x) - for x in op.output: - analyzer.define_blob(x) - - -@Analyzer.register(Net) -def analyze_net(analyzer, net): - for x in net.Proto().op: - analyzer(x) - - 
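[Editor's note: the Analyzer and Printer visitors defined in this file are normally reached through the module-level helpers analyze() and to_string(); a small usage sketch in the spirit of the deleted net_printer_test.py.]

from caffe2.python import net_printer
from caffe2.python.checkpoint import Job
from caffe2.python.net_builder import ops
from caffe2.python.task import Task

with Job() as job:
    with job.init_group:
        with Task():
            ops.Add([ops.Const(1), ops.Const(2)], 'out')

net_printer.analyze(job)            # flags undefined or doubly-defined blobs
print(net_printer.to_string(job))   # pretty-prints the execution steps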
-@Analyzer.register(ExecutionStep) -def analyze_step(analyzer, step): - proto = step.Proto() - with analyzer.set_workspace(do_copy=proto.create_workspace): - if proto.report_net: - with analyzer.set_workspace(do_copy=True): - analyzer(step.get_net(proto.report_net)) - all_new_blobs = set() - substeps = step.Substeps() + [step.get_net(n) for n in proto.network] - for substep in substeps: - with analyzer.set_workspace( - do_copy=proto.concurrent_substeps) as ws_in: - analyzer(substep) - if proto.should_stop_blob: - analyzer.need_blob(proto.should_stop_blob) - if proto.concurrent_substeps: - new_blobs = set(ws_in.keys()) - set(analyzer.workspace.keys()) - assert len(all_new_blobs & new_blobs) == 0, ( - 'Error: Blobs created by multiple parallel steps: %s' % ( - ', '.join(all_new_blobs & new_blobs))) - all_new_blobs |= new_blobs - for x in all_new_blobs: - analyzer.define_blob(x) - - -@Analyzer.register(Task) -def analyze_task(analyzer, task): - # check that our plan protobuf is not too large (limit of 64Mb) - step = task.get_step() - plan = Plan(task.node) - plan.AddStep(step) - proto_len = len(plan.Proto().SerializeToString()) - assert proto_len < 2 ** 26, ( - 'Due to a protobuf limitation, serialized tasks must be smaller ' - 'than 64Mb, but this task has {} bytes.' % proto_len) - - is_private = task.workspace_type() != WorkspaceType.GLOBAL - with analyzer.set_workspace(do_copy=is_private): - analyzer(step) - - -@Analyzer.register(TaskGroup) -def analyze_task_group(analyzer, tg): - for task in tg.tasks_by_node().tasks(): - with analyzer.set_workspace(node=task.node): - analyzer(task) - - -@Analyzer.register(Job) -def analyze_job(analyzer, job): - analyzer(job.init_group) - analyzer(job.epoch_group) - - -def analyze(obj): - """ - Given a Job, visits all the execution steps making sure that: - - no undefined blobs will be found during execution - - no blob with same name is defined in concurrent steps - """ - Analyzer()(obj) - - -class Text: - def __init__(self): - self._indent = 0 - self._lines_in_context = [0] - self.lines = [] - - @contextmanager - def context(self, text): - if text is not None: - self.add('with %s:' % text) - self._indent += 4 - self._lines_in_context.append(0) - try: - yield - finally: - if text is not None: - if self._lines_in_context[-1] == 0: - self.add('pass') - self._indent -= 4 - del self._lines_in_context[-1] - - def add(self, text): - self._lines_in_context[-1] += 1 - self.lines.append((' ' * self._indent) + text) - - def __str__(self): - return '\n'.join(self.lines) - - -class Printer(Visitor, Text): - def __init__(self, factor_prefixes=False, c2_syntax=True): - super(Visitor, self).__init__() - super(Text, self).__init__() - self.factor_prefixes = factor_prefixes - self.c2_syntax = c2_syntax - self.c2_net_name = None - - -def _sanitize_str(s): - if isinstance(s, str): - sanitized = s - elif isinstance(s, bytes): - sanitized = s.decode('ascii', errors='ignore') - else: - sanitized = str(s) - if len(sanitized) < 64: - return "'%s'" % sanitized - else: - return "'%s'" % sanitized[:64] + '...<+len=%d>' % (len(sanitized) - 64) - - -def _arg_val(arg): - if arg.HasField('f'): - return str(arg.f) - if arg.HasField('i'): - return str(arg.i) - if arg.HasField('s'): - return _sanitize_str(arg.s) - if arg.floats: - return str(list(arg.floats)) - if arg.ints: - return str(list(arg.ints)) - if arg.strings: - return str([_sanitize_str(s) for s in arg.strings]) - return '[]' - - -def commonprefix(m): - "Given a list of strings, returns the longest common prefix" - if not m: - 
return '' - s1 = min(m) - s2 = max(m) - for i, c in enumerate(s1): - if c != s2[i]: - return s1[:i] - return s1 - - -def format_value(val): - if isinstance(val, list): - return '[%s]' % ', '.join("'%s'" % str(v) for v in val) - else: - return str(val) - - -def factor_prefix(vals, do_it): - vals = [format_value(v) for v in vals] - prefix = commonprefix(vals) if len(vals) > 1 and do_it else '' - joined = ', '.join(v[len(prefix):] for v in vals) - return '%s[%s]' % (prefix, joined) if prefix else joined - - -def call(op, inputs=None, outputs=None, factor_prefixes=False): - if not inputs: - inputs = '' - else: - inputs_v = [a for a in inputs if not isinstance(a, tuple)] - inputs_kv = [a for a in inputs if isinstance(a, tuple)] - inputs = ', '.join( - x - for x in chain( - [factor_prefix(inputs_v, factor_prefixes)], - ('%s=%s' % kv for kv in inputs_kv), - ) - if x - ) - call = '%s(%s)' % (op, inputs) - return call if not outputs else '%s = %s' % ( - factor_prefix(outputs, factor_prefixes), call) - - -def format_device_option(dev_opt): - if not dev_opt or not ( - dev_opt.device_type or dev_opt.device_id or dev_opt.node_name): - return None - return call( - 'DeviceOption', - [dev_opt.device_type, dev_opt.device_id, "'%s'" % dev_opt.node_name]) - - -@Printer.register(OperatorDef) -def print_op(text, op): - args = [(a.name, _arg_val(a)) for a in op.arg] - dev_opt_txt = format_device_option(op.device_option) - if dev_opt_txt: - args.append(('device_option', dev_opt_txt)) - - if text.c2_net_name: - text.add(call( - text.c2_net_name + '.' + op.type, - [list(op.input), list(op.output)] + args)) - else: - text.add(call( - op.type, - list(op.input) + args, - op.output, - factor_prefixes=text.factor_prefixes)) - for arg in op.arg: - if arg.HasField('n'): - with text.context('arg: %s' % arg.name): - text(arg.n) - - -@Printer.register(NetDef) -def print_net_def(text, net_def): - if text.c2_syntax: - text.add(call('core.Net', ["'%s'" % net_def.name], [net_def.name])) - text.c2_net_name = net_def.name - else: - text.add('# net: %s' % net_def.name) - for op in net_def.op: - text(op) - if text.c2_syntax: - text.c2_net_name = None - - -@Printer.register(Net) -def print_net(text, net): - text(net.Proto()) - - -def _get_step_context(step): - proto = step.Proto() - if proto.should_stop_blob: - return call('loop'), False - if proto.num_iter and proto.num_iter != 1: - return call('loop', [proto.num_iter]), False - if proto.num_concurrent_instances > 1: - return ( - call('parallel', - [('num_instances', proto.num_concurrent_instances)]), - len(step.Substeps()) > 1) - concurrent = proto.concurrent_substeps and len(step.Substeps()) > 1 - if concurrent: - return call('parallel'), True - if proto.report_net: - return call('run_once'), False - return None, False - - -@Printer.register(ExecutionStep) -def print_step(text, step): - proto = step.Proto() - step_ctx, do_substep = _get_step_context(step) - with text.context(step_ctx): - if proto.report_net: - with text.context(call('report_net', [proto.report_interval])): - text(step.get_net(proto.report_net)) - substeps = step.Substeps() + [step.get_net(n) for n in proto.network] - for substep in substeps: - sub_proto = ( - substep.Proto() if isinstance(substep, ExecutionStep) else None) - if sub_proto is not None and sub_proto.run_every_ms: - substep_ctx = call( - 'reporter', - [str(substep), ('interval_ms', sub_proto.run_every_ms)]) - elif do_substep: - title = ( - 'workspace' - if sub_proto is not None and sub_proto.create_workspace else - 'step') - substep_ctx = call(title, 
[str(substep)]) - else: - substep_ctx = None - with text.context(substep_ctx): - text(substep) - if proto.should_stop_blob: - text.add(call('yield stop_if', [proto.should_stop_blob])) - - -def _print_task_output(x): - assert isinstance(x, TaskOutput) - return 'Output[' + ', '.join(str(x) for x in x.names) + ']' - - -@Printer.register(Task) -def print_task(text, task): - outs = ', '.join(_print_task_output(o) for o in task.outputs()) - context = [('node', task.node), ('name', task.name), ('outputs', outs)] - with text.context(call('Task', context)): - text(task.get_step()) - - -@Printer.register(TaskGroup) -def print_task_group(text, tg, header=None): - with text.context(header or call('TaskGroup')): - for task in tg.tasks_by_node().tasks(): - text(task) - - -@Printer.register(Job) -def print_job(text, job): - text(job.init_group, 'Job.current().init_group') - text(job.epoch_group, 'Job.current().epoch_group') - with text.context('Job.current().stop_conditions'): - for out in job.stop_conditions: - text.add(_print_task_output(out)) - text(job.download_group, 'Job.current().download_group') - text(job.exit_group, 'Job.current().exit_group') - - -def to_string(obj, **kwargs): - """ - Given a Net, ExecutionStep, Task, TaskGroup or Job, produces a string - with detailed description of the execution steps. - """ - printer = Printer(**kwargs) - printer(obj) - return str(printer) - - -def debug_net(net): - """ - Given a Net, produce another net that logs info about the operator call - before each operator execution. Use for debugging purposes. - """ - assert isinstance(net, Net) - debug_net = Net(str(net)) - assert isinstance(net, Net) - for op in net.Proto().op: - text = Text() - print_op(op, text) - debug_net.LogInfo(str(text)) - debug_net.Proto().op.extend([op]) - return debug_net diff --git a/caffe2/python/net_printer_test.py b/caffe2/python/net_printer_test.py deleted file mode 100644 index e71a2b323dea..000000000000 --- a/caffe2/python/net_printer_test.py +++ /dev/null @@ -1,99 +0,0 @@ - - - - - -from caffe2.python import net_printer -from caffe2.python.checkpoint import Job -from caffe2.python.net_builder import ops -from caffe2.python.task import Task, final_output, WorkspaceType -import unittest - - -def example_loop(): - with Task(): - total = ops.Const(0) - total_large = ops.Const(0) - total_small = ops.Const(0) - total_tiny = ops.Const(0) - with ops.loop(10) as loop: - outer = ops.Mul([loop.iter(), ops.Const(10)]) - with ops.loop(loop.iter()) as inner: - val = ops.Add([outer, inner.iter()]) - with ops.If(ops.GE([val, ops.Const(80)])) as c: - ops.Add([total_large, val], [total_large]) - with c.Elif(ops.GE([val, ops.Const(50)])) as c: - ops.Add([total_small, val], [total_small]) - with c.Else(): - ops.Add([total_tiny, val], [total_tiny]) - ops.Add([total, val], total) - - -def example_task(): - with Task(): - with ops.task_init(): - one = ops.Const(1) - two = ops.Add([one, one]) - with ops.task_init(): - three = ops.Const(3) - accum = ops.Add([two, three]) - # here, accum should be 5 - with ops.task_exit(): - # here, accum should be 6, since this executes after lines below - seven_1 = ops.Add([accum, one]) - six = ops.Add([accum, one]) - ops.Add([accum, one], [accum]) - seven_2 = ops.Add([accum, one]) - o6 = final_output(six) - o7_1 = final_output(seven_1) - o7_2 = final_output(seven_2) - - with Task(num_instances=2): - with ops.task_init(): - one = ops.Const(1) - with ops.task_instance_init(): - local = ops.Const(2) - ops.Add([one, local], [one]) - ops.LogInfo('ble') - - return o6, 
o7_1, o7_2 - -def example_job(): - with Job() as job: - with job.init_group: - example_loop() - example_task() - return job - - -class TestNetPrinter(unittest.TestCase): - def test_print(self): - self.assertTrue(len(net_printer.to_string(example_job())) > 0) - - def test_valid_job(self): - job = example_job() - with job: - with Task(): - # distributed_ctx_init_* ignored by analyzer - ops.Add(['distributed_ctx_init_a', 'distributed_ctx_init_b']) - # net_printer.analyze(example_job()) - print(net_printer.to_string(example_job())) - - def test_undefined_blob(self): - job = example_job() - with job: - with Task(): - ops.Add(['a', 'b']) - with self.assertRaises(AssertionError) as e: - net_printer.analyze(job) - self.assertEqual("Blob undefined: a", str(e.exception)) - - def test_multiple_definition(self): - job = example_job() - with job: - with Task(workspace_type=WorkspaceType.GLOBAL): - ops.Add([ops.Const(0), ops.Const(1)], 'out1') - with Task(workspace_type=WorkspaceType.GLOBAL): - ops.Add([ops.Const(2), ops.Const(3)], 'out1') - with self.assertRaises(AssertionError): - net_printer.analyze(job) diff --git a/caffe2/python/nomnigraph.py b/caffe2/python/nomnigraph.py deleted file mode 100644 index 0390d8ef20c2..000000000000 --- a/caffe2/python/nomnigraph.py +++ /dev/null @@ -1,140 +0,0 @@ - - -import errno -import os -from subprocess import PIPE, Popen - -import caffe2.python._import_c_extension as C -from caffe2.proto import caffe2_pb2 -from caffe2.python import core - - -class NNModule: - def __init__(self, net=None, device_map=None): - if net is not None: - serialized_proto = None - if isinstance(net, core.Net): - serialized_proto = net.Proto().SerializeToString() - elif isinstance(net, caffe2_pb2.NetDef): - serialized_proto = net.SerializeToString() - - # Distributed - if device_map is not None: - serialized_device_map = {} - for k in device_map: - serialized_device_map[k] = device_map[k].SerializeToString() - self._NNModule = C.NNModuleFromProtobufDistributed( - serialized_proto, serialized_device_map - ) - # Default - elif serialized_proto: - self._NNModule, self._OpList = C.NNModuleFromProtobuf(serialized_proto) - else: - raise Exception( - "NNModule can be constructed with core.Net or caffe2_pb2.NetDef types" - ) - else: - self._NNModule = C.NNModule() - - @property - def dataFlow(self): - return self._NNModule.dataFlow() - - @property - def controlFlow(self): - return self._NNModule.getExecutionOrder() - - @property - def nodes(self): - return self._NNModule.dataFlow().nodes - - @property - def operators(self): - return self._NNModule.dataFlow().operators - - @property - def tensors(self): - return self._NNModule.dataFlow().tensors - - def createNode(self, val): - return self._NNModule.dataFlow().createNode(val) - - def deleteNode(self, node): - return self._NNModule.dataFlow().deleteNode(node) - - def createEdge(self, a, b): - return self._NNModule.dataFlow().createEdge(a, b) - - def deleteEdge(self, a, b=None): - if b: - self._NNModule.dataFlow().deleteEdge(a, b) - else: - self._NNModule.dataFlow().deleteEdge(a) - - def replaceNode(self, old_node, new_node): - return self._NNModule.dataFlow().replaceNode(old_node, new_node) - - def replaceProducer(self, tensor, new_producer): - C.replaceProducer(tensor, new_producer) - - def replaceAllUsesWith(self, old_tensor, new_tensor): - C.replaceAllUsesWith(old_tensor, new_tensor) - - def replaceAsConsumer(self, old_consumer, new_consumer): - C.replaceAsConsumer(old_consumer, new_consumer) - - def replaceSubgraph(self, subgraph, new_node, 
inputs, outputs): - self._NNModule.replaceSubgraph(subgraph, new_node, inputs, outputs) - - def deleteSubgraph(self, subgraph): - self._NNModule.deleteSubgraph(subgraph) - - def createUniqueDataNode(self, prefix="_unique"): - return self._NNModule.createUniqueDataNode(prefix) - - def convertToCaffe2Proto(self, old_proto=None): - if not old_proto: - old_proto = caffe2_pb2.NetDef() - output = self._NNModule.convertToCaffe2Proto(old_proto) - new_proto = caffe2_pb2.NetDef() - new_proto.ParseFromString(output) - return new_proto - - def match(self, pattern): - for n in self.dataFlow.getMutableNodes(): - m = C.matchSubgraph(n, pattern) - if m: - yield m - - -def render(s): - s = str(s) - cmd_exists = lambda x: any( - os.access(os.path.join(path, x), os.X_OK) - for path in os.getenv("PATH", "").split(os.pathsep) - ) - if cmd_exists("graph-easy"): - p = Popen("graph-easy", stdin=PIPE) - try: - p.stdin.write(s.encode("utf-8")) - except IOError as e: - if e.errno == errno.EPIPE or e.errno == errno.EINVAL: - pass - else: - # Raise any other error. - raise - - p.stdin.close() - p.wait() - else: - print(s) - - -NeuralNetOperator = C.NeuralNetOperator -Operator = C.NeuralNetOperator -NeuralNetData = C.NeuralNetData -Data = C.NeuralNetData -NNSubgraph = C.NNSubgraph -NNMatchGraph = C.NNMatchGraph -Graph = C.Graph -Annotation = C.Annotation diff --git a/caffe2/python/nomnigraph_test.py b/caffe2/python/nomnigraph_test.py deleted file mode 100644 index bd9d10fcbae1..000000000000 --- a/caffe2/python/nomnigraph_test.py +++ /dev/null @@ -1,443 +0,0 @@ - - - - - -from caffe2.python import core, test_util -from caffe2.proto import caffe2_pb2 -import caffe2.python.nomnigraph as ng - -from hypothesis import given -import hypothesis.strategies as st -import random - - -class TestBindings(test_util.TestCase): - def test_simple(self): - nn = ng.NNModule() - dfg = nn.dataFlow - dfg.createNode(ng.NeuralNetData("X")) - dfg.createNode(ng.NeuralNetOperator("FC")) - assert len(nn.dataFlow.getMutableNodes()) == 2 - - def test_core_net_simple(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net) - for node in nn.dataFlow.getMutableNodes(): - if node.isOperator(): - assert node.getName() == "FC" - elif node.isTensor(): - assert node.getName() in ["X", "W", "Y"] - - def test_core_net_controlflow(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - net.Relu(["Y"], ["Z"]) - nn = ng.NNModule(net) - assert len(nn.controlFlow) == 2 - for instr in nn.controlFlow: - assert instr.getType() == "Operator" - assert nn.controlFlow[0].getName() == "FC" - assert nn.controlFlow[1].getName() == "Relu" - - def test_core_net_nn_accessors(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - net.Relu(["Y"], ["Z"]) - nn = ng.NNModule(net) - tensors = set() - for t in nn.tensors: - tensors.add(t.name) - assert tensors == set(["X", "W", "Y", "Z"]) - ops = set() - for op in nn.operators: - ops.add(op.name) - assert ops == set(["FC", "Relu"]) - nodes = set() - for node in nn.nodes: - nodes.add(node.name) - assert nodes == (ops | tensors) - - def test_netdef_simple(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net.Proto()) - for node in nn.dataFlow.getMutableNodes(): - if node.isOperator(): - assert node.getOperator().getName() == "FC" - elif node.isTensor(): - assert node.getTensor().getName() in ["X", "W", "Y"] - - def test_operatordef_simple(self): - nn = ng.NNModule() - dfg = nn.dataFlow - op = core.CreateOperator("Ceil", ["X"], ["Y"], engine="CUDNN") - 
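[Editor's note: a compact round-trip sketch of the NNModule wrapper these bindings and tests cover — core.Net in, transformed caffe2_pb2.NetDef out.]

from caffe2.python import core
import caffe2.python.nomnigraph as ng

net = core.Net("example")
net.FC(["X", "W"], ["Y"])
net.Relu(["Y"], ["Z"])
nn = ng.NNModule(net)
assert [op.name for op in nn.controlFlow] == ["FC", "Relu"]
new_netdef = nn.convertToCaffe2Proto()   # back to a caffe2_pb2.NetDef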
dfg.createNode(op) - for node in dfg.getMutableNodes(): - assert node.isOperator() - assert node.getOperator().getName() == "Ceil" - - def test_invalid_node(self): - nn = ng.NNModule() - dfg = nn.dataFlow - with self.assertRaises(Exception): - dfg.createNode(7) - - def test_edges_simple(self): - nn = ng.NNModule() - dfg = nn.dataFlow - x = dfg.createNode(ng.NeuralNetData("X")) - w = dfg.createNode(ng.NeuralNetData("W")) - op = dfg.createNode(ng.NeuralNetOperator("Op")) - - with self.assertRaises(Exception): - dfg.createEdge(x, w) - dfg.createEdge(op, w) - dfg.createEdge(x, op) - - # Dot generation - assert(str(dfg).startswith("digraph G")) - - # subgraph - sg = ng.NNSubgraph() - sg.addNode(x) - sg.addNode(op) - sg.induceEdges() - assert len(sg) == 2 - - # subgraph dot generation - assert(str(sg).startswith("digraph G")) - - @given(size=st.sampled_from([10, 50])) - def test_edges_complex(self, size): - random.seed(1337) - nn = ng.NNModule() - dfg = nn.dataFlow - - data = [] - ops = [] - for _ in range(size): - data.append(dfg.createNode(ng.NeuralNetData("X"))) - for i in range(size): - ops.append(dfg.createNode(ng.NeuralNetOperator("Op" + str(i)))) - - for i in range(size): - for j in range(size): - if bool(random.getrandbits(1)): - dfg.createEdge(data[i], ops[j]) - - def test_traversal(self): - net = core.Net("test") - net.FC(["X", "W"], ["Y"]) - net.Relu(["Y"], ["Z"]) - nn = ng.NNModule(net) - fc = nn.controlFlow[0] - relu = nn.controlFlow[1] - assert not fc.inputs[0].hasProducer() - assert fc.inputs[0].name == "X" - assert fc.inputs[1].name == "W" - assert relu.outputs[0].name == "Z" - assert relu.inputs[0].name == "Y" - assert relu.inputs[0].hasProducer() - assert relu.inputs[0].producer.name == "FC" - assert fc.outputs[0].consumers[0].name == "Relu" - - def test_debug(self): - nn = ng.NNModule() - dfg = nn.dataFlow - dfg.createNode(ng.NeuralNetData("X")) - dfg.createNode(ng.NeuralNetData("W")) - dfg.createNode(ng.NeuralNetOperator("Op")) - - ng.render(nn.dataFlow) - - def test_match_graph_node(self): - mg = ng.NNMatchGraph() - mg.createNode(ng.NeuralNetOperator("test")) - nn = ng.NNModule() - test = nn.dataFlow.createNode(ng.NeuralNetOperator("test")) - x = nn.dataFlow.createNode(ng.NeuralNetData("X")) - nn.dataFlow.createEdge(x, test) - - count = 0 - for match in nn.match(mg): - assert len(match) == 1 - count += 1 - # Dot generation of subgraph - assert(str(match).startswith("digraph G")) - assert count == 1 - - def test_match_graph_node_strict(self): - mg = ng.NNMatchGraph() - mg.createNode(ng.NeuralNetOperator("test"), strict=True) - nn = ng.NNModule() - test = nn.dataFlow.createNode(ng.NeuralNetOperator("test")) - x = nn.dataFlow.createNode(ng.NeuralNetData("X")) - nn.dataFlow.createEdge(test, x) - - count = 0 - for match in nn.match(mg): - assert len(match) == 1 - count += 1 - - with self.assertRaises(Exception): - assert count == 1 - - def test_match_graph(self): - mg = ng.NNMatchGraph() - test2m = mg.createNode(ng.NeuralNetOperator("test2"), strict=True) - xm = mg.createNode(ng.NeuralNetData("X"), strict=True) - testm = mg.createNode(ng.NeuralNetOperator("test")) - mg.createEdge(test2m, xm) - mg.createEdge(xm, testm) - - nn = ng.NNModule() - test2 = nn.dataFlow.createNode(ng.NeuralNetOperator("test2")) - x = nn.dataFlow.createNode(ng.NeuralNetData("X")) - test = nn.dataFlow.createNode(ng.NeuralNetOperator("test")) - nn.dataFlow.createEdge(test2, x) - nn.dataFlow.createEdge(x, test) - - count = 0 - for match in nn.match(mg): - print(len(match)) - assert len(match) == 3 - count += 
1 - assert count == 1 - - def test_delete_subgraph(self): - mg = ng.NNMatchGraph() - test2m = mg.createNode(ng.NeuralNetOperator("test2"), strict=True) - xm = mg.createNode(ng.NeuralNetData("X"), strict=True) - testm = mg.createNode(ng.NeuralNetOperator("test")) - mg.createEdge(test2m, xm) - mg.createEdge(xm, testm) - - nn = ng.NNModule() - test2 = nn.dataFlow.createNode(ng.NeuralNetOperator("test2")) - x = nn.dataFlow.createNode(ng.NeuralNetData("X")) - test = nn.dataFlow.createNode(ng.NeuralNetOperator("test")) - nn.dataFlow.createEdge(test2, x) - nn.dataFlow.createEdge(x, test) - - for m in nn.match(mg): - match = m - nn.deleteSubgraph(match) - assert len(nn.controlFlow) == 0 - - def test_replace_subraph(self): - mg = ng.NNMatchGraph() - test2m = mg.createNode(ng.NeuralNetOperator("test2"), strict=True) - xm = mg.createNode(ng.NeuralNetData("X"), strict=True) - testm = mg.createNode(ng.NeuralNetOperator("test")) - mg.createEdge(test2m, xm) - mg.createEdge(xm, testm) - - nn = ng.NNModule() - test2 = nn.dataFlow.createNode(ng.NeuralNetOperator("test2")) - x = nn.dataFlow.createNode(ng.NeuralNetData("X")) - test = nn.dataFlow.createNode(ng.NeuralNetOperator("test")) - nn.dataFlow.createEdge(test2, x) - nn.dataFlow.createEdge(x, test) - - for m in nn.match(mg): - match = m - new_op = nn.dataFlow.createNode(ng.NeuralNetOperator("new_op")) - nn.replaceSubgraph(match, new_op, [], []) - assert len(nn.controlFlow) == 1 - assert nn.controlFlow[0].name == "new_op" - - def test_genericGraph(self): - g = ng.Graph() - n1 = g.createNode("hello1") - n2 = g.createNode("hello2") - e = g.createEdge(n1, n2) - ng.render(g) - - def test_createUniqueDataNode(self): - net = core.Net("name") - nn = ng.NNModule(net) - n1 = nn.createUniqueDataNode("a") - self.assertEqual(n1.name[0], "a") - n2 = nn.dataFlow.createNode(ng.Operator("test1")) - nn.createEdge(n1, n2) - n3 = nn.createUniqueDataNode("a") - nn.createEdge(n2, n3) - self.assertEqual(n3.name[0], "a") - self.assertNotEqual(n1.name, n3.name) - n1 = nn.createUniqueDataNode("b") - n2 = nn.createUniqueDataNode("b") - self.assertNotEqual(n1.name, n2.name) - - def test_convertToProto(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net) - new_netdef = nn.convertToCaffe2Proto() - print(new_netdef) - print(net.Proto()) - assert len(new_netdef.op) == len(net.Proto().op) - for i in range(len(new_netdef.op)): - op = net.Proto().op[i] - new_op = new_netdef.op[i] - assert op.type == new_op.type - assert len(op.input) == len(new_op.input) - assert len(op.output) == len(new_op.output) - for a, b in zip(op.input, new_op.input): - assert a == b - for a, b in zip(op.output, new_op.output): - assert a == b - for a, b in zip(new_netdef.external_input, net.Proto().external_input): - assert a == b - for a, b in zip(new_netdef.external_output, net.Proto().external_output): - assert a == b - - def test_node_interactions(self): - nn = ng.NNModule() - dfg = nn.dataFlow - test1 = dfg.createNode(ng.Operator("test1")) - test2 = dfg.createNode(ng.Operator("test2")) - x = dfg.createNode(ng.Data("x")) - dfg.createEdge(test1, x) - dfg.createEdge(x, test2) - p = test2.getOperatorPredecessors() - assert len(p) == 1 - assert p[0] == test1 - - # Add another node - test3 = dfg.createNode(ng.Operator("test3")) - y = dfg.createNode(ng.Data("y")) - dfg.createEdge(test3, y) - dfg.createEdge(y, test2) - p = test2.getOperatorPredecessors() - assert len(p) == 2 - assert test1 in p - assert test3 in p - - # Successors - assert len(test2.getOperatorSuccessors()) == 0 - 
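[Editor's note: the match-and-replace flow exercised in the surrounding tests, reduced to its core steps; node names here are illustrative, and matching follows the name-based pattern the tests rely on.]

import caffe2.python.nomnigraph as ng

# Graph: Producer -> X -> Consumer
nn = ng.NNModule()
prod = nn.dataFlow.createNode(ng.NeuralNetOperator("Producer"))
x = nn.dataFlow.createNode(ng.NeuralNetData("X"))
cons = nn.dataFlow.createNode(ng.NeuralNetOperator("Consumer"))
nn.createEdge(prod, x)
nn.createEdge(x, cons)

# Pattern with the same shape; strict nodes must match exactly
mg = ng.NNMatchGraph()
pm = mg.createNode(ng.NeuralNetOperator("Producer"), strict=True)
xm = mg.createNode(ng.NeuralNetData("X"), strict=True)
cm = mg.createNode(ng.NeuralNetOperator("Consumer"))
mg.createEdge(pm, xm)
mg.createEdge(xm, cm)

for match in nn.match(mg):
    fused = nn.dataFlow.createNode(ng.NeuralNetOperator("Fused"))
    nn.replaceSubgraph(match, fused, [], [])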
assert len(test1.getOperatorSuccessors()) == 1 - assert test1.getOperatorSuccessors()[0] == test2 - - # Check all the nodes are valid (pybind ownership test) - for node in [test1, test2, test3]: - assert node.isOperator() - for node in [x, y]: - assert node.isTensor() - - def test_delete_node(self): - nn = ng.NNModule() - node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - nn.dataFlow.deleteNode(node) - assert len(nn.dataFlow.getMutableNodes()) == 0 - - def test_replace_producer(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net) - fc = nn.controlFlow[0] - test_op = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - nn.replaceProducer(fc.outputs[0], test_op) - nn.deleteNode(fc) - assert len(nn.controlFlow) == 1 - assert nn.controlFlow[0].name == "TestOp" - - def test_replace_all_uses_with(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - net.FC(["X", "W2"], ["Y2"]) - nn = ng.NNModule(net) - fc = nn.controlFlow[0] - test_tensor = nn.dataFlow.createNode(ng.NeuralNetData("T")) - nn.replaceAllUsesWith(fc.inputs[0], test_tensor) - - for op in nn.controlFlow: - assert op.inputs[0].name == "T" - - def test_replace_as_consumer(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net) - fc = nn.controlFlow[0] - test_op = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - nn.replaceAsConsumer(fc, test_op) - nn.deleteNode(fc) - assert len(nn.controlFlow) == 1 - assert nn.controlFlow[0].name == "TestOp" - assert nn.controlFlow[0].inputs[0].name == "X" - assert nn.controlFlow[0].inputs[1].name == "W" - - def test_annotation_basic(self): - annot = ng.Annotation() - annot.setDevice("woot") - assert annot.getDevice() == "woot" - annot.setDeviceType(7) - assert annot.getDeviceType() == 7 - - def test_annotation_from_graph(self): - nn = ng.NNModule() - node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - annot = node.getAnnotation() - annot.setDeviceType(7) - node.setAnnotation(annot) - new_annot = node.getAnnotation() - assert new_annot.getDeviceType() == 7 - - def test_annotation_operator_def(self): - nn = ng.NNModule() - opdef = core.CreateOperator("Conv", [], [], engine="SENTINEL") - node = nn.dataFlow.createNode(opdef) - assert node.annotation.operator_def.engine == "SENTINEL" - opdef = core.CreateOperator("Conv", [], [], engine="NEW_SENTINEL") - node.annotation.operator_def = opdef - netdef = nn.convertToCaffe2Proto() - assert len(netdef.op) == 1 - assert netdef.op[0].engine == "NEW_SENTINEL" - - def test_annotation_device_option(self): - nn = ng.NNModule() - node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - d = caffe2_pb2.DeviceOption() - d.node_name = "test" - node.annotation.device_option = d - # access in a different way - d_2 = nn.controlFlow[0].annotation.device_option - assert d == d_2 - - def test_has_device_option(self): - nn = ng.NNModule() - node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - assert not node.annotation.hasDeviceOption() - d = caffe2_pb2.DeviceOption() - node.annotation.device_option = d - assert node.annotation.hasDeviceOption() - - def test_distributed_annotations(self): - nn = ng.NNModule() - key = nn.dataFlow.createNode(ng.NeuralNetData("key")) - length = nn.dataFlow.createNode(ng.NeuralNetData("length")) - node = nn.dataFlow.createNode(ng.NeuralNetOperator("TestOp")) - - annot = ng.Annotation() - annot.setKeyNode(key) - annot.setLengthNode(length) - annot.setComponentLevels(["", "test", "woot"]) - - node.setAnnotation(annot) - - new_annot = 
node.getAnnotation() - #assert new_annot.getLengthNode() == length - assert new_annot.getKeyNode() == key - assert len(new_annot.getComponentLevels()) == 3 - assert new_annot.getComponentLevels()[0] == "" - assert new_annot.getComponentLevels()[2] == "woot" - - def test_distributed_device_map(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - d = caffe2_pb2.DeviceOption() - nn = ng.NNModule(net, {"X": d, "W": d}) - - with self.assertRaises(Exception): - nn = ng.NNModule(net, {"X": d, "Fake": d}) diff --git a/caffe2/python/nomnigraph_transformations.py b/caffe2/python/nomnigraph_transformations.py deleted file mode 100644 index 570c743df152..000000000000 --- a/caffe2/python/nomnigraph_transformations.py +++ /dev/null @@ -1,88 +0,0 @@ - - -from collections import defaultdict - -import caffe2.python.nomnigraph as ng -from caffe2.python import core, utils - - -def transpose_network(nn): - """ - Convert all Convolutions operators which are in the NCHW order - to NHWC order and also transform their inputs and outputs so that the - rest of the graph is not affected. - """ - # track the incoming tensors into NHWC2NCHW operators - incoming = {} # output tensor -> input tensor - # track outgoing tensors from NCHW2NHWC operators - outgoing = defaultdict(lambda: []) # input tensor -> list of operators - dfg = nn.dataFlow - orig_nodes = [x for x in nn.nodes] - for node in orig_nodes: - if node.isOperator() and node.name == "Conv": - arg_dict = utils.ArgsToDict(node.annotation.operator_def.arg) - # a missing "order" argument implies default NCHW order - if "order" in arg_dict and arg_dict["order"] != "NCHW": - continue - inputs = [x for x in node.inputs] - assert len(inputs) >= 2, "Conv operator should have two inputs" - outputs = [x for x in node.outputs] - assert len(outputs) >= 1, "Conv operator should have an output" - for inp in inputs: - nn.deleteEdge(inp, node) - for outp in outputs: - nn.deleteEdge(node, outp) - # only the first two inputs of the Convolution the data and the - # weights need to be transformed - for idx in range(2): - new_inp = nn.createUniqueDataNode(inputs[idx].name) - transp = dfg.createNode(ng.NeuralNetOperator("NCHW2NHWC")) - nn.createEdge(inputs[idx], transp) - nn.createEdge(transp, new_inp) - outgoing[inputs[idx]].append(transp) - inputs[idx] = new_inp - for idx in range(len(outputs)): - new_outp = nn.createUniqueDataNode(outputs[idx].name) - transp = dfg.createNode(ng.NeuralNetOperator("NHWC2NCHW")) - nn.createEdge(transp, outputs[idx]) - nn.createEdge(new_outp, transp) - incoming[outputs[idx]] = new_outp - outputs[idx] = new_outp - # create a new Convolution with identical arguments as the original - # one except for the order - arg_dict["order"] = "NHWC" - new_node = nn.createNode(core.CreateOperator("Conv", [], [], - **arg_dict)) - for inp in inputs: - nn.createEdge(inp, new_node) - for outp in outputs: - nn.createEdge(new_node, outp) - - nn.deleteNode(node) - - # finally, we will compress - # case 1: - # X -> NHWC2NCHW -> Y -> NCHW2NHWC -> Z1 ; Y -> NCHW2NHWC -> Z2 - # to: - # X -> NHWC2NCHW -> Y and replace Z1 with X and replace Z2 with X - # And case 2: - # Y -> NCHW2NHWC -> Z1 ; Y -> NCHW2NHWC -> Z2 - # to: - # Y -> NCHW2NHWC -> Z1 and replace Z2 with Z1 - - # orig_tensor is one of the tensors in the original graph in NCHW order - for orig_tensor in outgoing: - # new_tensor is identical to orig_tensor except the order is NHWC - if orig_tensor in incoming: # case 1 (see above) - new_tensor = incoming[orig_tensor] - else: # case 2 (see above) - out_ops = 
outgoing[orig_tensor] - new_tensor = out_ops[0].outputs[0] - outgoing[orig_tensor] = out_ops[1:] - - for opnode in outgoing[orig_tensor]: - # there should only be one output, so this iteration is overkill - for out in opnode.outputs: - nn.replaceAllUsesWith(out, new_tensor) - nn.deleteNode(out) - nn.deleteNode(opnode) diff --git a/caffe2/python/nomnigraph_transformations_test.py b/caffe2/python/nomnigraph_transformations_test.py deleted file mode 100644 index adbfe1a4885a..000000000000 --- a/caffe2/python/nomnigraph_transformations_test.py +++ /dev/null @@ -1,144 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.python import test_util as tu -import caffe2.python.nomnigraph as ng -from caffe2.python.nomnigraph_transformations import transpose_network - -import numpy as np -from hypothesis import given -import hypothesis.strategies as st - - -class TestNomnigraphTransformations(tu.TestCase): - def test_simple_replace(self): - net = core.Net("name") - net.FC(["X", "W"], ["Y"]) - nn = ng.NNModule(net) - fc = nn.controlFlow[0] - add = nn.createNode(core.CreateOperator("Add", ["X"], ["Y"], engine="CUDNN")) - nn.replaceNode(fc, add) - nn.deleteNode(fc) - - # Test it out - new_netdef = nn.convertToCaffe2Proto() - workspace.ResetWorkspace() - workspace.FeedBlob("X", np.array([1, 2, 3])) - workspace.FeedBlob("W", np.array([1, 2, 3])) - workspace.RunNetOnce(new_netdef) - out = workspace.FetchBlob("Y") - expected_out = np.array([2, 4, 6]) - np.testing.assert_almost_equal(out, expected_out) - - def test_simple_rewire(self): - net = core.Net("name") - # Rewire this so that we get - # c = Add(a, d) - # e = Mul(c, b) - # - # if a = 1, b = 2, d = 3 - # we get 8: (1 + 3) * 2 - # as opposed to 7: 1 + (3 * 2) - net.Mul(["a", "b"], ["c"]) - net.Add(["c", "d"], ["e"]) - nn = ng.NNModule(net) - - mul = nn.controlFlow[0] - add = nn.controlFlow[1] - a = mul.inputs[0] - b = mul.inputs[1] - c = mul.outputs[0] - d = add.inputs[1] - e = add.outputs[0] - - nn.deleteEdge(a, mul) - nn.deleteEdge(b, mul) - nn.deleteEdge(mul, c) - nn.deleteEdge(c, add) - nn.deleteEdge(d, add) - nn.deleteEdge(add, e) - - nn.createEdge(a, add) - nn.createEdge(d, add) - nn.createEdge(add, c) - nn.createEdge(c, mul) - nn.createEdge(b, mul) - nn.createEdge(mul, e) - - # Test it out - new_netdef = nn.convertToCaffe2Proto() - workspace.ResetWorkspace() - workspace.FeedBlob("a", np.array([1, 1, 1])) - workspace.FeedBlob("b", np.array([2, 2, 2])) - workspace.FeedBlob("d", np.array([3, 3, 3])) - workspace.RunNetOnce(new_netdef) - out = workspace.FetchBlob("e") - expected_out = np.array([8, 8, 8]) - np.testing.assert_almost_equal(out, expected_out) - - @given( - batch_size=st.integers(16, 20), - channels=st.integers(1, 10), - height=st.integers(10, 15), - width=st.integers(10, 15), - seed=st.integers(0, 65535), - kernel=st.integers(3, 5), - ) - def test_transpose_network(self, batch_size, channels, height, width, seed, - kernel): - net = core.Net("net") - net.Conv(["X", "w1", "b1"], ["c1"], stride=1, pad=0, kernel=kernel) - net.Conv(["X", "w2", "b2"], ["c2"], stride=1, pad=0, kernel=kernel) - # c1 and c2: batch_size, 2*channels, height - kernel + 1, width - kernel + 1 - net.Conv(["c1", "w3", "b3"], ["c3"], stride=1, pad=0, kernel=kernel) - net.Conv(["c1", "w4", "b4"], ["c4"], stride=1, pad=0, kernel=kernel) - # c3 and c4: batch_size, 2*channels, height - 2*kernel + 2, width - 2*kernel + 2 - net.Flatten(["c3"], "c3f") - net.Flatten(["c4"], "c4f") - net.Flatten(["X"], "Xf") - net.Concat(["c3f", "c4f", "Xf"], ["out", 
"split_info"], axis=1, add_axis=0) - np.random.seed(seed) - workspace.ResetWorkspace() - tu.randBlobFloat32("X", batch_size, channels, height, width) - tu.randBlobsFloat32(["w1", "w2"], 2 * channels, channels, kernel, kernel) - tu.randBlobsFloat32(["b1", "b2"], 2 * channels) - tu.randBlobsFloat32(["w3", "w4"], 4 * channels, 2 * channels, kernel, kernel) - tu.randBlobsFloat32(["b3", "b4"], 4 * channels) - all_inp_names = ["X", "w1", "w2", "b1", "b2", "w3", "w4", "b3", "b4"] - all_input = workspace.FetchBlobs(all_inp_names) - workspace.RunNetOnce(net) - preTransformC1 = workspace.FetchBlob("c1") - preTransformC3 = workspace.FetchBlob("c3") - preTransformOut = workspace.FetchBlob("out") - nn = ng.NNModule(net) - preTransformNumOperators = len(nn.operators) - preTransformNumTensors = len(nn.tensors) - transpose_network(nn) - new_netdef = nn.convertToCaffe2Proto() - postTransformNumOperators = len(nn.operators) - postTransformNumTensors = len(nn.tensors) - # The minimal number of additional operators and tensors is at least one - # NCHW2NHWC operator and tensor for each channel-based input tensor - # and a NHWC2NCHW operator and tensor for the output of each convolution - # X, w1, w2, w3, w4 are channel-based inputs - # c1, c2, c3, c4 are the outputs of convolutions - # i.e. a total of 9. - self.assertEqual(postTransformNumOperators, - preTransformNumOperators + 9, - "expected 9 additional operators") - self.assertEqual(postTransformNumTensors, - preTransformNumTensors + 9, - "expected 9 additional tensors") - workspace.ResetWorkspace() - for name, val in zip(all_inp_names, all_input): - workspace.FeedBlob(name, val) - workspace.RunNetOnce(new_netdef) - postTransformC1 = workspace.FetchBlob("c1") - postTransformC3 = workspace.FetchBlob("c3") - postTransformOut = workspace.FetchBlob("out") - np.testing.assert_almost_equal(postTransformC1, preTransformC1, 1) - np.testing.assert_almost_equal(postTransformC3, preTransformC3, 1) - np.testing.assert_almost_equal(postTransformOut, preTransformOut, 1) diff --git a/caffe2/python/normalizer.py b/caffe2/python/normalizer.py deleted file mode 100644 index bc6b36b00cf8..000000000000 --- a/caffe2/python/normalizer.py +++ /dev/null @@ -1,44 +0,0 @@ -# @package optimizer -# Module caffe2.python.normalizer - - - -class Normalizer: - def __init__(self): - pass - """ - Adds normalization to train_net for given parameter. Its factor ahead of - regularization is given when initialization. - The param should be a BlobReference. 
- """ - - def __call__(self, net, param): - return self._run(net, param) - - def _run(self, net, param): - raise Exception("Not Impelemented") - - -class BatchNormalizer(Normalizer): - def __init__(self, momentum, scale_init_value=1.0): - super().__init__() - self._momentum = float(momentum) - self._scale_init_value = float(scale_init_value) - - def _run(self, layer_model, param): - return layer_model.BatchNormalization( - param, momentum=self._momentum, scale_init_value=self._scale_init_value - ) - - -class LayerNormalizer(Normalizer): - def __init__(self, epsilon, use_layer_norm_op=True, scale_init_value=1.0): - super().__init__() - self._epsilon = float(epsilon) - self._use_layer_norm_op = use_layer_norm_op - self._scale_init_value = float(scale_init_value) - - def _run(self, layer_model, param): - return layer_model.LayerNormalization( - param, epsilon=self._epsilon, use_layer_norm_op=self._use_layer_norm_op, scale_init_value=self._scale_init_value - ) diff --git a/caffe2/python/normalizer_context.py b/caffe2/python/normalizer_context.py deleted file mode 100644 index 9559024bbcd3..000000000000 --- a/caffe2/python/normalizer_context.py +++ /dev/null @@ -1,37 +0,0 @@ -# @package regularizer_context -# Module caffe2.python.normalizer_context - - - - - -from caffe2.python import context -from caffe2.python.modifier_context import ( - ModifierContext, UseModifierBase) - - -class NormalizerContext(ModifierContext, context.DefaultManaged): - """ - provide context to allow param_info to have different normalizers - """ - - def has_normalizer(self, name): - return self._has_modifier(name) - - def get_normalizer(self, name): - assert self.has_normalizer(name), ( - "{} normalizer is not provided!".format(name)) - return self._get_modifier(name) - - -class UseNormalizer(UseModifierBase): - ''' - context class to allow setting the current context. 
- Example usage with layer: - normalizers = {'norm1': norm1, 'norm2': norm2} - with UseNormalizer(normalizers): - norm = NormalizerContext.current().get_normalizer('norm1') - layer(norm=norm) - ''' - def _context_class(self): - return NormalizerContext diff --git a/caffe2/python/normalizer_test.py b/caffe2/python/normalizer_test.py deleted file mode 100644 index 6a1c2b2642ec..000000000000 --- a/caffe2/python/normalizer_test.py +++ /dev/null @@ -1,15 +0,0 @@ - - - - -from caffe2.python.normalizer_context import UseNormalizer, NormalizerContext -from caffe2.python.normalizer import BatchNormalizer -from caffe2.python.layer_test_util import LayersTestCase - - -class TestNormalizerContext(LayersTestCase): - def test_normalizer_context(self): - bn = BatchNormalizer(momentum=0.1) - with UseNormalizer({'BATCH': bn}): - normalizer = NormalizerContext.current().get_normalizer('BATCH') - self.assertEqual(bn, normalizer) diff --git a/caffe2/python/numa_benchmark.py b/caffe2/python/numa_benchmark.py deleted file mode 100644 index a840c6932123..000000000000 --- a/caffe2/python/numa_benchmark.py +++ /dev/null @@ -1,69 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 -import time - -SHAPE_LEN = 4096 -NUM_ITER = 1000 -GB = 1024 * 1024 * 1024 -NUM_REPLICAS = 48 - - -def build_net(net_name, cross_socket): - init_net = core.Net(net_name + "_init") - init_net.Proto().type = "async_scheduling" - numa_device_option = caffe2_pb2.DeviceOption() - numa_device_option.device_type = caffe2_pb2.CPU - numa_device_option.numa_node_id = 0 - for replica_id in range(NUM_REPLICAS): - init_net.XavierFill([], net_name + "/input_blob_" + str(replica_id), - shape=[SHAPE_LEN, SHAPE_LEN], device_option=numa_device_option) - - net = core.Net(net_name) - net.Proto().type = "async_scheduling" - if cross_socket: - numa_device_option.numa_node_id = 1 - for replica_id in range(NUM_REPLICAS): - net.Copy(net_name + "/input_blob_" + str(replica_id), - net_name + "/output_blob_" + str(replica_id), - device_option=numa_device_option) - return init_net, net - - -def main(): - assert workspace.IsNUMAEnabled() and workspace.GetNumNUMANodes() >= 2 - - single_init, single_net = build_net("single_net", False) - cross_init, cross_net = build_net("cross_net", True) - - workspace.CreateNet(single_init) - workspace.RunNet(single_init.Name()) - workspace.CreateNet(cross_init) - workspace.RunNet(cross_init.Name()) - - workspace.CreateNet(single_net) - workspace.CreateNet(cross_net) - - for _ in range(4): - t = time.time() - workspace.RunNet(single_net.Name(), NUM_ITER) - dt = time.time() - t - print("Single socket time:", dt) - single_bw = 4 * SHAPE_LEN * SHAPE_LEN * NUM_REPLICAS * NUM_ITER / dt / GB - print("Single socket BW: {} GB/s".format(single_bw)) - - t = time.time() - workspace.RunNet(cross_net.Name(), NUM_ITER) - dt = time.time() - t - print("Cross socket time:", dt) - cross_bw = 4 * SHAPE_LEN * SHAPE_LEN * NUM_REPLICAS * NUM_ITER / dt / GB - print("Cross socket BW: {} GB/s".format(cross_bw)) - print("Single BW / Cross BW: {}".format(single_bw / cross_bw)) - - -if __name__ == '__main__': - core.GlobalInit(["caffe2", "--caffe2_cpu_numa_enabled=1"]) - main() diff --git a/caffe2/python/numa_test.py b/caffe2/python/numa_test.py deleted file mode 100644 index aba6e420ed55..000000000000 --- a/caffe2/python/numa_test.py +++ /dev/null @@ -1,53 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 -from caffe2.python.test_util import TestCase -import unittest - 
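Both the NUMA benchmark above and the test that follows drive the same mechanism: a `caffe2_pb2.DeviceOption` with `numa_node_id` set pins an operator's output blob to a particular socket. A minimal standalone sketch of that mechanism, assuming a NUMA-enabled Caffe2 build (blob and net names are illustrative):

```python
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace

# NUMA support must be enabled at init time, as in the files above and below.
core.GlobalInit(["caffe2", "--caffe2_cpu_numa_enabled=1"])

opt = caffe2_pb2.DeviceOption()
opt.device_type = caffe2_pb2.CPU
opt.numa_node_id = 0  # allocate the output blob on NUMA node 0

net = core.Net("numa_sketch")
net.ConstantFill([], "blob", shape=[1], value=1.0, device_option=opt)
workspace.RunNetOnce(net)
# On a NUMA-enabled build this reports the node the blob landed on (0 here).
print(workspace.GetBlobNUMANode("blob"))
```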
-core.GlobalInit(["caffe2", "--caffe2_cpu_numa_enabled=1"]) - -def build_test_net(net_name): - net = core.Net(net_name) - net.Proto().type = "async_scheduling" - - numa_device_option = caffe2_pb2.DeviceOption() - numa_device_option.device_type = caffe2_pb2.CPU - numa_device_option.numa_node_id = 0 - - net.ConstantFill([], "output_blob_0", shape=[1], value=3.14, - device_option=numa_device_option) - - numa_device_option.numa_node_id = 1 - net.ConstantFill([], "output_blob_1", shape=[1], value=3.14, - device_option=numa_device_option) - - gpu_device_option = caffe2_pb2.DeviceOption() - gpu_device_option.device_type = caffe2_pb2.CUDA - gpu_device_option.device_id = 0 - - net.CopyCPUToGPU("output_blob_0", "output_blob_0_gpu", - device_option=gpu_device_option) - net.CopyCPUToGPU("output_blob_1", "output_blob_1_gpu", - device_option=gpu_device_option) - - return net - - -@unittest.skipIf(not workspace.IsNUMAEnabled(), "NUMA is not enabled") -@unittest.skipIf(workspace.GetNumNUMANodes() < 2, "Not enough NUMA nodes") -@unittest.skipIf(not workspace.has_gpu_support, "No GPU support") -class NUMATest(TestCase): - def test_numa(self): - net = build_test_net("test_numa") - - workspace.RunNetOnce(net) - - self.assertEqual(workspace.GetBlobNUMANode("output_blob_0"), 0) - self.assertEqual(workspace.GetBlobNUMANode("output_blob_1"), 1) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/observer_test.py b/caffe2/python/observer_test.py deleted file mode 100644 index cc3ca1718a5c..000000000000 --- a/caffe2/python/observer_test.py +++ /dev/null @@ -1,154 +0,0 @@ - - - - - -import numpy as np -import unittest -from hypothesis import given, settings -import hypothesis.strategies as st - -from caffe2.python import brew, core, model_helper, rnn_cell -import caffe2.python.workspace as ws - - -class TestObservers(unittest.TestCase): - def setUp(self): - core.GlobalInit(["python", "caffe2"]) - ws.ResetWorkspace() - self.model = model_helper.ModelHelper() - brew.fc(self.model, "data", "y", - dim_in=4, dim_out=2, - weight_init=('ConstantFill', dict(value=1.0)), - bias_init=('ConstantFill', dict(value=0.0)), - axis=0) - ws.FeedBlob("data", np.zeros([4], dtype='float32')) - - ws.RunNetOnce(self.model.param_init_net) - ws.CreateNet(self.model.net) - - def testObserver(self): - ob = self.model.net.AddObserver("TimeObserver") - ws.RunNet(self.model.net) - print(ob.average_time()) - num = self.model.net.NumObservers() - self.model.net.RemoveObserver(ob) - assert(self.model.net.NumObservers() + 1 == num) - - @given( - num_layers=st.integers(1, 4), - forward_only=st.booleans() - ) - @settings(deadline=1000) - def test_observer_rnn_executor(self, num_layers, forward_only): - ''' - Test that the RNN executor produces the same results as - the non-executor (i.e. running step nets as a sequence of simple nets).
- ''' - - Tseq = [2, 3, 4] - batch_size = 10 - input_dim = 3 - hidden_dim = 3 - - run_cnt = [0] * len(Tseq) - avg_time = [0] * len(Tseq) - for j in range(len(Tseq)): - T = Tseq[j] - - ws.ResetWorkspace() - ws.FeedBlob( - "seq_lengths", - np.array([T] * batch_size, dtype=np.int32) - ) - ws.FeedBlob("target", np.random.rand( - T, batch_size, hidden_dim).astype(np.float32)) - ws.FeedBlob("hidden_init", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - ws.FeedBlob("cell_init", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - - model = model_helper.ModelHelper(name="lstm") - model.net.AddExternalInputs(["input"]) - - init_blobs = [] - for i in range(num_layers): - hidden_init, cell_init = model.net.AddExternalInputs( - "hidden_init_{}".format(i), - "cell_init_{}".format(i) - ) - init_blobs.extend([hidden_init, cell_init]) - - output, last_hidden, _, last_state = rnn_cell.LSTM( - model=model, - input_blob="input", - seq_lengths="seq_lengths", - initial_states=init_blobs, - dim_in=input_dim, - dim_out=[hidden_dim] * num_layers, - drop_states=True, - forward_only=forward_only, - return_last_layer_only=True, - ) - - loss = model.AveragedLoss( - model.SquaredL2Distance([output, "target"], "dist"), - "loss" - ) - # Add gradient ops - if not forward_only: - model.AddGradientOperators([loss]) - - # init - for init_blob in init_blobs: - ws.FeedBlob(init_blob, np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - ws.RunNetOnce(model.param_init_net) - - # Run with executor - self.enable_rnn_executor(model.net, 1, forward_only) - - np.random.seed(10022015) - input_shape = [T, batch_size, input_dim] - ws.FeedBlob( - "input", - np.random.rand(*input_shape).astype(np.float32) - ) - ws.FeedBlob( - "target", - np.random.rand( - T, - batch_size, - hidden_dim - ).astype(np.float32) - ) - ws.CreateNet(model.net, overwrite=True) - - time_ob = model.net.AddObserver("TimeObserver") - run_cnt_ob = model.net.AddObserver("RunCountObserver") - ws.RunNet(model.net) - avg_time[j] = time_ob.average_time() - run_cnt[j] = int(''.join(x for x in run_cnt_ob.debug_info() if x.isdigit())) - model.net.RemoveObserver(time_ob) - model.net.RemoveObserver(run_cnt_ob) - - print(avg_time) - print(run_cnt) - self.assertTrue(run_cnt[1] > run_cnt[0] and run_cnt[2] > run_cnt[1]) - self.assertEqual(run_cnt[1] - run_cnt[0], run_cnt[2] - run_cnt[1]) - - def enable_rnn_executor(self, net, value, forward_only): - num_found = 0 - for op in net.Proto().op: - if op.type.startswith("RecurrentNetwork"): - for arg in op.arg: - if arg.name == 'enable_rnn_executor': - arg.i = value - num_found += 1 - # This sanity check is so that if someone changes the - # enable_rnn_executor parameter name, the test will - # start failing as this function will become defective. - self.assertEqual(1 if forward_only else 2, num_found) diff --git a/caffe2/python/onnx/ONNXOpCoverage.md b/caffe2/python/onnx/ONNXOpCoverage.md deleted file mode 100644 index 66cf2d692e87..000000000000 --- a/caffe2/python/onnx/ONNXOpCoverage.md +++ /dev/null @@ -1,121 +0,0 @@ -# Tracking why operators are not covered -[ONNX backend test script](https://github.com/onnx/onnx-caffe2/blob/master/tests/onnx_backend_test.py) -reports the coverage on the operators and attributes. But we have various of reasons for the missing test coverage on operators. -This doc keeps tracking why operators are not covered by the testcases. - -- 💚 The ONNX operator can map to a Caffe2 operator. 
-- 💛 The solution is not perfect/finished, for example, the operator can map to a combination of Caffe2 operators. -- 💔 Hard to find a solution with existing Caffe2 operators. - -| Operator | Test Coverage | PyTorch | Caffe2 | -|---|:--:|:---:|:---:| -|Abs|Yes|OK|💚OK| -|Acos|Yes|OK|💚OK| -|Add|Yes|OK|💚OK| -|And|Yes|Support int tensor, but no bool tensor|💚OK| -|ArgMax|||💚OK| -|ArgMin|||💚OK| -|Asin|||💚OK| -|Atan|||💚OK| -|AveragePool||OK|💚OK| -|BatchNormalization||OK|💚OK| -|Cast|Yes||💔Need extension| -|Ceil|Yes||💚OK| -|Clip|Yes|OK|💚OK| -|Concat|Yes|OK|💚OK| -|Constant|Yes|OK|💛Special handling| -|Conv|Yes|OK|💚OK| -|ConvTranspose|Yes||💚OK, under enhancement| -|Cos|Yes|OK|💚OK| -|DepthToSpace|Yes||💔No op| -|Div|Yes|OK|💚OK| -|Dropout|Yes|OK|💚OK| -|Elu|Yes|OK|💚OK| -|Equal|Yes|OK|💚OK| -|Exp|Yes|OK|💚OK| -|Flatten|Yes|OK|💚OK| -|Floor|Yes||💚OK| -|GRU|||💚| -|Gather|Yes|OK|💛C2 only support axis=0 or 1, under development| -|Gemm|Yes|OK|💛C2 use FC or MatMul + Add| -|GlobalAveragePool|Yes|No direct mapping|💚OK| -|GlobalLpPool|||💔No mapping yet| -|GlobalMaxPool|||💚OK| -|Greater|Yes||💚OK| -|HardSigmoid|Yes||💔No op| -|Hardmax|Yes||💔No op| -|InstanceNormalization|||💚OK| -|LRN||OK|💚OK| -|LSTM|||💚OK| -|LeakyRelu|Yes|OK|💚OK| -|Less|Yes||💚OK| -|Log|Yes|OK|💚OK| -|LogSoftmax||OK|💚No op, translated in onnx-caffe2| -|LpNormalization|||💔ONNX and C2 have different definition| -|LpPool|||💚Should be LpPool, no tests| -|MatMul|Yes|OK|💚OK| -|Max|Yes|OK|💚OK| -|MaxPool||OK|💚OK| -|MaxRoiPool|||💔No mapping yet| -|Mean|||💚OK, need broadcasting support| -|Min|Yes|OK|💚OK, need broadcasting support| -|Mul|Yes|OK|💚OK, need broadcasting support| -|Multinomial|Yes|OK|💔no op| -|Neg|Yes|OK|💚OK| -|Not|Yes||💚OK| -|Or|Yes||💚OK| -|PRelu|Yes|OK|💛Need to enhance C2 implementation| -|Pad|Yes|OK|💚OK| -|Pow|Yes|OK|💚OK| -|RNN|||💚OK| -|RandomNormal|||💔No op| -|RandomNormalLike|||💔No op| -|RandomUniform|||💔No op| -|RandomUniformLike|||💔No op| -|Reciprocal|Yes||💚Use Pow to implement| -|ReduceL1|||💔No op| -|ReduceL2|||💔No op| -|ReduceLogSum|||💔No op| -|ReduceLogSumExp|||💔No op| -|ReduceMax|||💚OK| -|ReduceMean|||💚OK| -|ReduceMin|||💚OK| -|ReduceProd|||💚OK| -|ReduceSum|||💚OK| -|ReduceSumSquare|||💔No op| -|Relu|Yes|OK|💚OK| -|Reshape|Yes|OK|💚OK| -|Selu|Yes|OK|💚OK| -|Sigmoid|Yes|OK|💚OK| -|Sin|Yes|OK|💚OK| -|Size|Yes|OK|💚OK| -|Slice|Yes|OK|💔ScatterAssign + Cast, very hacky implementation, Slice in C2 only supports one dimension| -|Softmax|Yes|OK|💔Axis and dim has different semantics| -|Softplus|Yes|OK|💚OK| -|Softsign|Yes||💚OK| -|SpaceToDepth|||💔No op| -|Split|Yes|OK|💚OK| -|Sqrt|Yes||💚OK| -|Squeeze|Yes||💚OK| -|Sub|Yes|OK|💚OK| -|Sum|Yes|OK|💚OK, need broadcasting support| -|Tanh|Yes|OK|💚OK| -|Tile||OK|💛OK, need some enhance| -|TopK||OK|💚OK| -|Transpose|Yes|OK|💚OK| -|Upsample|||💛No bilinear| -|Xor|Yes||💚OK| -|experimental ATen|||💚OK| -|experimental Affine|||💔No op| -|experimental ConstantFill|||💚OK| -|experimental Crop|||💔No op| -|experimental FC|||💚OK| -|experimental GRUUnit|||💚OK, no tests| -|experimental GivenTensorFill|||💚OK| -|experimental Identity|||💚OK| -|experimental ImageScaler|||💔No op| -|experimental MeanVarianceNormalization|||💔No op| -|experimental ParametricSoftplus|||💔No op| -|experimental Scale|||💚OK| -|experimental ScaledTanh|||💔No op| -|experimental ThresholdedRelu|Yes||💚OK| diff --git a/caffe2/python/onnx/README.md b/caffe2/python/onnx/README.md deleted file mode 100644 index 444951bf684a..000000000000 --- a/caffe2/python/onnx/README.md +++ /dev/null @@ -1,53 +0,0 @@ -Caffe2 implementation of Open Neural Network Exchange (ONNX) -======== - -# 
Usage - -* [ONNX to Caffe2](https://github.com/onnx/tutorials/blob/master/tutorials/OnnxCaffe2Import.ipynb) -* [Caffe2 to ONNX](https://github.com/onnx/tutorials/blob/master/tutorials/Caffe2OnnxExport.ipynb) -* [other end-to-end tutorials](https://github.com/onnx/tutorials) - -# Installation - -onnx-caffe2 is installed as a part of Caffe2. -Please follow the [instructions](https://caffe2.ai/docs/getting-started.html) to install Caffe2. - - -# Folder Structure - -- ./: the main folder that all code lies under - - frontend.py: translate from caffe2 model to onnx model - - backend.py: execution engine that runs onnx on caffe2 -- tests/: test files - -# Testing - -onnx-caffe2 uses [pytest](https://docs.pytest.org) as test driver. In order to run tests, first you need to install pytest: - - -``` -pip install pytest-cov -``` - -After installing pytest, do - -``` -pytest -``` - -to run tests. - -Testing coverage issues/status: https://github.com/caffe2/caffe2/blob/master/caffe2/python/onnx/ONNXOpCoverage.md - -# Development - -During development it's convenient to install caffe2 in development mode: - -``` -cd /path/to/caffe2 -pip install -e caffe2/ -``` - -# License - -[MIT License](LICENSE) diff --git a/caffe2/python/onnx/__init__.py b/caffe2/python/onnx/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/onnx/backend.py b/caffe2/python/onnx/backend.py deleted file mode 100644 index fa06a7e68c87..000000000000 --- a/caffe2/python/onnx/backend.py +++ /dev/null @@ -1,967 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.backend - -"""Backend for running ONNX on Caffe2 - -To run this, you will need to have Caffe2 installed as well. -""" -import collections -import sys -import zipfile -import itertools - -# When onnx is built against a version of protobuf that is older than -# that which is vendored with caffe2, onnx will crash if caffe2's -# vendored protobuf is loaded first. We can work around this by -# importing onnx first, which will cause it to go out and pick up the -# system protobuf. -import onnx.backend -from caffe2.python import core, workspace, rnn_cell, gru_cell -from caffe2.python.model_helper import ModelHelper -from caffe2.proto import caffe2_pb2 -import caffe2.python.utils -import numpy as np -import onnx -from onnx import TensorProto -import onnx.numpy_helper -import onnx.defs -import onnx.shape_inference -import onnx.utils -from onnx.backend.base import Backend, Device, DeviceType, namedtupledict - -from caffe2.python.onnx.workspace import Workspace -from caffe2.python.onnx.backend_rep import Caffe2Rep - -import caffe2.python._import_c_extension as C - -import warnings - -def force_unicode(s): - try: - return s.decode('utf-8') - except AttributeError: - return s - -def get_device_option(device): - m = {DeviceType.CPU: caffe2_pb2.CPU, - DeviceType.CUDA: workspace.GpuDeviceType} - return core.DeviceOption(m[device.type], device.device_id) - - -class OnnxAttributes(dict): - """ - This is a more convenient way to work with ONNX/Caffe2 attributes - that is not the protobuf representation. - """ - @staticmethod - def from_onnx(args): - d = OnnxAttributes() - for arg in args: - d[arg.name] = convertAttributeProto(arg) - return d - - def caffe2(self, kmap=lambda k: k): - for k, v in self.items(): - if kmap(k) != '': - yield caffe2.python.utils.MakeArgument(kmap(k), v) - -# TODO: Move this into ONNX main library -def convertAttributeProto(onnx_arg): - """ - Convert an ONNX AttributeProto into an appropriate Python object - for the type. 
- - NB: Tensor attribute gets returned as the straight proto. - """ - if onnx_arg.HasField('f'): - return onnx_arg.f - elif onnx_arg.HasField('i'): - return onnx_arg.i - elif onnx_arg.HasField('s'): - return onnx_arg.s - elif onnx_arg.HasField('t'): - return onnx_arg.t # this is a proto! - elif onnx_arg.HasField('g'): - return Caffe2Backend._graph_to_net(onnx_arg.g, Caffe2Backend._known_opset_version) - elif len(onnx_arg.floats): - return list(onnx_arg.floats) - elif len(onnx_arg.ints): - return list(onnx_arg.ints) - elif len(onnx_arg.strings): - return list(onnx_arg.strings) - elif len(onnx_arg.graphs): - retval = [] - # TODO: this doesn't work with RNN ops - for g in onnx_arg.graphs: - retval.append(Caffe2Backend._graph_to_net(g, Caffe2Backend._known_opset_version)) - return retval - else: - raise ValueError("Unsupported ONNX attribute: {}".format(onnx_arg)) - - -# TODO: Move this into ONNX main library -class OnnxNode: - """ - Reimplementation of NodeProto from ONNX, but in a form - more convenient to work with from Python. - - We may temporarily edit these nodes to get them into Caffe2 form, - before actually translating into the Caffe2 protobuf, since this - is easier than decomposing everything, and putting it back together - when we're ready. - """ - def __init__(self, node): - self.name = str(node.name) - self.op_type = str(node.op_type) - self.attrs = OnnxAttributes.from_onnx(node.attribute) - self.inputs = list(node.input) - self.outputs = list(node.output) - - -Caffe2Ops = collections.namedtuple('Caffe2Ops', ['ops', 'init_ops', 'interface_blobs']) - - -class Caffe2Backend(Backend): - - # The greatest version of the ONNX operator set which we are aware of. - # Models whose version is larger than this will cause us to emit a warning - # that we are attempting to translate on a "best effort" basis. - # - # If you increase this, make SURE you cross-reference all BC-breaking - # changes from one version to the next, and any that you did not - # implement, mark as broken in _broken_operators - _known_opset_version = 9 - - # This dictionary will record operators which are KNOWN to be - # broken, so we give a good error message rather than do something - # bogus and then fail. - _broken_operators = { - # 'BrokenOp': version_it_was_broken_in - } - - # Operators that are different between Caffe2 and - # ONNX but only in their name. - # In most cases, this should be empty - as the effort of ONNX is - # to unify the operator definitions. 
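As a concrete illustration of the rename table defined just below: the generic translator (`_common_onnx_node_to_caffe2_op`, later in this file) resolves the Caffe2 operator type with a plain dictionary lookup that falls back to the ONNX name. A hypothetical standalone sketch of that lookup, using a two-entry excerpt of the table:

```python
# Excerpt of the rename table below; unlisted ops keep their ONNX name.
_renamed_operators = {"MatMul": "BatchMatMul", "Neg": "Negative"}

def c2_type_for(onnx_op_type: str) -> str:
    # Mirrors cls._renamed_operators.get(onnx_op_type, onnx_op_type).
    return _renamed_operators.get(onnx_op_type, onnx_op_type)

assert c2_type_for("MatMul") == "BatchMatMul"  # renamed
assert c2_type_for("Relu") == "Relu"           # no rename registered
```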
- _renamed_operators = { - 'GlobalMaxPool': 'MaxPool', - 'GlobalAveragePool': 'AveragePool', - 'Pad': 'PadImage', - 'Neg': 'Negative', - 'BatchNormalization': 'SpatialBN', - 'InstanceNormalization': 'InstanceNorm', - 'MatMul': 'BatchMatMul', - 'Upsample': 'ResizeNearest', - 'Identity': 'Copy', - 'InstanceNormalization': 'InstanceNorm', - 'Equal': 'EQ', - 'Less': 'LT', - 'Greater': 'GT', - 'Unsqueeze': 'ExpandDims', - 'Loop': 'ONNXWhile', - 'Tile': 'NumpyTile', - 'RandomNormal': 'GaussianFill', - 'RandomUniform': 'UniformFill', - } - - _global_renamed_attrs = {'kernel_shape': 'kernels'} - _per_op_renamed_attrs = { - 'Squeeze': {'axes': 'dims'}, - 'Unsqueeze': {'axes': 'dims'}, - 'Transpose': {'perm': 'axes'}, - 'Upsample': {'mode': '', - 'scales': ''}, - 'ConvTranspose': {'output_padding': 'adjs'}, - 'Selu': {'gamma': 'scale'}, - 'If': {'then_branch': 'then_net', - 'else_branch': 'else_net'}, - 'RandomUniform': {'low': 'min', - 'high': 'max'} - } - - # operators whose behavior is different beyond renaming - # the value is an attribute of this class that is a - # function from ToffeIR node_def to caffe2 op_def - _special_operators = { - 'LSTM': '_create_rnn_variant', - 'GRU': '_create_rnn_variant', - 'RNN': '_create_rnn_variant', - 'Loop': '_create_loop', - 'If': '_create_if', - 'Upsample': '_create_upsample', - 'RandomNormal': '_create_gaussian_fill' - } - - # Dummy name generator - _dummy_name = C.DummyName() - - @classmethod - def dummy_name(cls): - return cls._dummy_name.new_dummy_name() - - # NB: By default, you will use the LATEST definition of the operator, - # so this interface MAY make BC-breaking changes. Specify an - # opset_version if you don't want this to version. - @classmethod - def run_node(cls, node, inputs, device='CPU', opset_version=_known_opset_version, outputs_info=None): - super(Caffe2Backend, cls).run_node(node, inputs, device=device, - outputs_info=outputs_info, opset_version=opset_version) - - value_infos = [] - device_option = get_device_option(Device(device)) - ws = Workspace() - with core.DeviceScope(device_option): # temporary! - if isinstance(inputs, dict): - for key, value in inputs.items(): - ws.FeedBlob(key, value) - value_infos.append(onnx.helper.make_tensor_value_info( - name=key, - elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], - shape=value.shape).SerializeToString()) - else: - assert len(node.input) == len(inputs), "{}: expected {} but got {}".format( - node.op_type, len(node.input), len(inputs)) - for key, value in zip(node.input, inputs): - ws.FeedBlob(key, value) - value_infos.append(onnx.helper.make_tensor_value_info( - name=key, - elem_type=onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[value.dtype], - shape=value.shape).SerializeToString()) - - ops = [] - cbackend = C.Caffe2Backend(cls._dummy_name) - ops_str = cbackend.convert_node(node.SerializeToString(), value_infos, opset_version) - for s in ops_str[0] + ops_str[1]: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op.device_option.CopyFrom(device_option) - ops.append(op) - ws.RunOperatorsOnce(ops) - output_values = [ws.FetchBlob(name) for name in node.output] - return namedtupledict('Outputs', node.output)(*output_values) - - @classmethod - def _create_tensor_filling_op(cls, onnx_tensor, name=None): - """ - Given an Onnx TensorProto, translate it into a Caffe2 operator - which produces the given tensor filling op. 
- """ - assert name or onnx_tensor.name - name = name or onnx_tensor.name - - c2_op = caffe2_pb2.OperatorDef() - - c2_values = c2_op.arg.add() - c2_values.name = "values" - - def tensor2list(onnx_tensor): - # Use the onnx.numpy_helper because the data may be raw - return onnx.numpy_helper.to_array(onnx_tensor).flatten().tolist() - - if onnx_tensor.data_type in [TensorProto.FLOAT]: - c2_op.type = 'GivenTensorFill' - c2_values.floats.extend(tensor2list(onnx_tensor)) - elif onnx_tensor.data_type in [TensorProto.DOUBLE]: - c2_op.type = 'GivenTensorDoubleFill' - c2_values.floats.extend(tensor2list(onnx_tensor)) - elif onnx_tensor.data_type in [TensorProto.INT64, - TensorProto.UINT32]: - c2_op.type = 'GivenTensorInt64Fill' - c2_values.ints.extend(tensor2list(onnx_tensor)) - elif onnx_tensor.data_type in [TensorProto.UINT8, - TensorProto.INT8, - TensorProto.UINT16, - TensorProto.INT16, - TensorProto.INT32]: - c2_op.type = 'GivenTensorIntFill' - c2_values.ints.extend(tensor2list(onnx_tensor)) - elif onnx_tensor.data_type == TensorProto.BOOL: - c2_op.type = 'GivenTensorBoolFill' - c2_values.ints.extend(tensor2list(onnx_tensor)) - elif onnx_tensor.data_type == TensorProto.STRING: - c2_op.type = 'GivenTensorStringFill' - c2_values.strings.extend(onnx_tensor.string_data) - else: - raise RuntimeError( - "unrecognized tensor type {}".format(onnx_tensor.data_type)) - - c2_shape = c2_op.arg.add() - c2_shape.name = "shape" - c2_shape.ints.extend(onnx_tensor.dims) - - c2_op.output.append(name) - - return c2_op - - @classmethod - def _rnn_reform_weights(cls, reforms, name, hidden_size, init_net, gates, reorder_indices): - for name_from, name_to, do_concat, extra_dims in reforms: - gate_blobs = ['%s/%s_%s' % (name, prefix, name_to) for prefix in gates] - for i, x in enumerate(gate_blobs): - dim0 = i * hidden_size, (i+1) * hidden_size - starts, ends = zip(dim0, *extra_dims) - init_net.Slice(name_from, x, starts=starts, ends=ends) - if do_concat: - reordered_gate_blobs = [gate_blobs[i] for i in reorder_indices] - init_net.Concat(reordered_gate_blobs, ['%s/%s' % (name, name_to), cls.dummy_name()], axis=0) - - @classmethod - def _make_rnn_direction(cls, input_blob, B, W, R, initial_states_and_names, sequence_lens, - pred_mh, init_net, - input_size, hidden_size, num_gates, direction_offset, - Bi, Br, W_, R_, - reform, make_cell, keep_outputs): - name = cls.dummy_name() - - # input and recurrence biases are squashed together in onnx - # but not in caffe2 - gates_hidden_size = num_gates * hidden_size - bias_offset = 2 * direction_offset * gates_hidden_size - weight_offset = direction_offset * gates_hidden_size - Bi = init_net.Slice(B, name + Bi, - starts=[bias_offset + 0 * gates_hidden_size], - ends =[bias_offset + 1 * gates_hidden_size]) - Br = init_net.Slice(B, name + Br, - starts=[bias_offset + 1 * gates_hidden_size], - ends =[bias_offset + 2 * gates_hidden_size]) - W_ = init_net.Slice(W, name + W_, - starts=[weight_offset + 0 * gates_hidden_size, 0], - ends =[weight_offset + 1 * gates_hidden_size,-1]) - R_ = init_net.Slice(R, name + R_, - starts=[weight_offset + 0 * gates_hidden_size, 0], - ends =[weight_offset + 1 * gates_hidden_size,-1]) - - initial_states_sliced = [] - for initial_state, name_suffix in initial_states_and_names: - initial_states_sliced.append( - pred_mh.net.Slice(initial_state, name + name_suffix, - starts=[direction_offset + 0, 0, 0], - ends =[direction_offset + 1,-1,-1])) - - if direction_offset == 1: - if sequence_lens is not None: - seq_lens_for_reverse = sequence_lens - else: - 
input_shape = pred_mh.net.Shape(input_blob, name + '/input_shape') - batch_size = pred_mh.net.Slice(input_shape, name + '/batch_size_slice', starts=[1], ends=[2]) - seq_len = pred_mh.net.Slice(input_shape, name + '/seq_len_slice', starts=[0], ends=[1]) - dummy_sequence_lens = pred_mh.net.Tile([seq_len, batch_size], name + '/dummy_sequence_lens', axis=0) - pred_mh.net.Reshape(dummy_sequence_lens, [dummy_sequence_lens, cls.dummy_name()], shape=[-1]) - seq_lens_for_reverse = pred_mh.net.Cast(dummy_sequence_lens, name + '/seq_lens_for_reverse', to=core.DataType.INT32) - reform(Bi, Br, W_, R_, name, hidden_size, init_net) - - if direction_offset == 1: - input = pred_mh.net.ReversePackedSegs( - [input_blob, seq_lens_for_reverse], name + "/input-reversed") - else: - input = input_blob - - outputs = keep_outputs(list(make_cell( - pred_mh, - input, - sequence_lens, - initial_states_sliced, - input_size, - hidden_size, - name, - drop_states=False, - forward_only=True, - ))) - - if direction_offset == 1: - outputs[0] = pred_mh.net.ReversePackedSegs( - [outputs[0], seq_lens_for_reverse], name + "/output-reversed") - - return outputs - - @classmethod - def _create_rnn_variant(cls, init_model, pred_model, n, opset_version): - assert init_model is not None, "cannot convert RNNs without access to the full model" - assert pred_model is not None, "cannot convert RNNs without access to the full model" - - attrs = dict(n.attrs) # make a copy, which is safe to mutate - hidden_size = attrs.pop('hidden_size') - direction = force_unicode(attrs.pop('direction', 'forward')) - - if n.op_type == 'RNN': - activation = force_unicode(attrs.pop('activations', ('tanh',))[0].lower()) - elif n.op_type == 'GRU': - linear_before_reset = attrs.pop('linear_before_reset', 0) - - assert not attrs, "unsupported RNN attributes: " + str(attrs.keys()) - assert direction in ['forward', 'bidirectional'], "unsupported backwards RNN/GRU/LSTM" - - if n.op_type in ['RNN', 'GRU']: - input_blob, W, R, B, sequence_lens, initial_h = n.inputs - elif n.op_type == 'LSTM': - input_blob, W, R, B, sequence_lens, initial_h, initial_c = n.inputs - - if sequence_lens == "": - sequence_lens = None - - for x in itertools.chain(init_model.graph.input, - init_model.graph.value_info, - pred_model.graph.input, - pred_model.graph.value_info): - if x.name == W: - input_size = x.type.tensor_type.shape.dim[2].dim_value - break - else: - raise RuntimeError("best-effort shape inference for RNN/GRU/LSTM failed") - - pred_mh = ModelHelper() - init_net = core.Net("init-net") - - init_net.Reshape(W, [W, cls.dummy_name()], shape=[1,-1,0]) - init_net.Squeeze(W, W, dims=[0]) - init_net.Reshape(R, [R, cls.dummy_name()], shape=[1,-1,0]) - init_net.Squeeze(R, R, dims=[0]) - init_net.Reshape(B, [B, cls.dummy_name()], shape=[1,-1]) - init_net.Squeeze(B, B, dims=[0]) - - if n.op_type == 'RNN': - def reform(*args): - pass - - def make_cell(*args, **kwargs): - return rnn_cell.BasicRNN(*args, activation=activation, **kwargs) - - def make_rnn(direction_offset): - return cls._make_rnn_direction( - input_blob, B, W, R, [(initial_h, '/initial_h')], sequence_lens, - pred_mh, init_net, input_size, hidden_size, 1, direction_offset, - "/i2h_b", "/gates_t_b", "/i2h_w", "/gates_t_w", - reform, make_cell, lambda x: x) - - elif n.op_type == 'GRU': - def reform(Bi, Br, W_, R_, name, hidden_size, init_net): - # caffe2 has a different order from onnx. 
We need to rearrange - # z r h -> r z h - reforms = ((W_, 'i2h_w', True, [(0,-1)]), - (R_, 'gate_t_w', False, [(0,-1)]), - (Bi, 'i2h_b', True, []), - (Br, 'gate_t_b', False, [])) - cls._rnn_reform_weights(reforms, name, hidden_size, init_net, - ['update', 'reset', 'output'], [1, 0, 2]) - - def make_cell(*args, **kwargs): - return gru_cell.GRU(*args, linear_before_reset=linear_before_reset, **kwargs) - - def make_rnn(direction_offset): - return cls._make_rnn_direction( - input_blob, B, W, R, [(initial_h, '/initial_h')], sequence_lens, - pred_mh, init_net, input_size, hidden_size, 3, direction_offset, - "_bias_i2h", "_bias_gates", "/i2h_w_pre", "/gates_t_w_pre", - reform, make_cell, lambda x: x) - - elif n.op_type == 'LSTM': - def reform(Bi, Br, W_, R_, name, hidden_size, init_net): - # caffe2 has a different order from onnx. We need to rearrange - # i o f c -> i f o c - reforms = ((W_, 'i2h_w', True, [(0, -1)]), - (R_, 'gates_t_w', True, [(0, -1)]), - (Bi, 'i2h_b' , True, []), - (Br, 'gates_t_b', True, [])) - cls._rnn_reform_weights(reforms, name, hidden_size, init_net, - ['input', 'output', 'forget', 'cell'], [0, 2, 1, 3]) - - def make_cell(*args, **kwargs): - return rnn_cell.LSTM(*args, **kwargs) - - def make_rnn(direction_offset): - return cls._make_rnn_direction( - input_blob, B, W, R, [(initial_h, '/initial_h'), (initial_c, '/initial_c')], sequence_lens, - pred_mh, init_net, input_size, hidden_size, 4, direction_offset, - "/i2h_b", "/gates_t_b", "/i2h_w", "/gates_t_w", - reform, make_cell, lambda x: [x[0], x[1], x[3]]) - - if direction == 'forward': - outputs = make_rnn(0) - - # in the forward case, storage is shared between the - # last outputs. We need to decouple them so that the - # VariableLengthSequencePadding only mutates - # n.outputs[0] - for i in range(1, len(outputs)): - pred_mh.net.Copy(outputs[i], n.outputs[i]) - - if sequence_lens is not None: - pred_mh.net.VariableLengthSequencePadding( - [outputs[0], sequence_lens], [outputs[0]]) - pred_mh.net.ExpandDims([outputs[0]], [n.outputs[0]], dims=[1]) - elif direction == 'bidirectional': - outputs_f = make_rnn(0) - outputs_b = make_rnn(1) - - concatted_output, _ = pred_mh.net.Concat( - [outputs_f[0], outputs_b[0]], [cls.dummy_name(), cls.dummy_name()], axis=2) - if sequence_lens is not None: - pred_mh.net.VariableLengthSequencePadding( - [concatted_output, sequence_lens], [concatted_output]) - reshaped_output, _ = pred_mh.net.Reshape(concatted_output, [cls.dummy_name(), cls.dummy_name()], shape=[0,0,-1,2]) - pred_mh.net.Transpose(reshaped_output, n.outputs[0], axes=[0,2,1,3]) - for i in range(1, len(n.outputs)): - pred_mh.net.Concat([outputs_f[i], outputs_b[i]], - [n.outputs[i], cls.dummy_name()], axis=0) - - # We want to decide whether to put all of our weight-reshaping - # operators in the init net or the predict net. We can put - # them in the init net iff the inputs to those operators are - # already available, either as graph initializers, or as the - # output of other operators in the init net. The latter case - # occurs, for example, when exporting from pytorch to onnx. - # In most production use, we expect has_initializers to be - # true. 
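Before the init-net placement logic below, a brief aside on the gate reordering that `_rnn_reform_weights` performs above: ONNX stacks LSTM gate blocks along the first axis as i, o, f, c, while Caffe2 expects i, f, o, c, which is what the reorder indices `[0, 2, 1, 3]` encode. A standalone numpy sketch with hypothetical sizes:

```python
import numpy as np

hidden_size, input_size = 2, 3
# ONNX-style stacked LSTM input weights: gate blocks i, o, f, c along axis 0.
W_onnx = np.arange(4 * hidden_size * input_size, dtype=np.float32)
W_onnx = W_onnx.reshape(4 * hidden_size, input_size)
gates = [W_onnx[g * hidden_size:(g + 1) * hidden_size] for g in range(4)]
# Reorder i, o, f, c -> i, f, o, c, matching reorder_indices [0, 2, 1, 3].
W_c2 = np.concatenate([gates[i] for i in (0, 2, 1, 3)], axis=0)
```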
- initializers = {i.name for i in init_model.graph.initializer} - outputs = {output for node in init_model.graph.node for output in node.output} - has_initializers = all(x in initializers or x in outputs for x in (W, R, B)) - - pred_ops = [] - init_ops = [] - (init_ops if has_initializers else pred_ops).extend(init_net.Proto().op) - pred_ops.extend(pred_mh.Proto().op) - - return Caffe2Ops(pred_ops, init_ops, list(pred_mh.Proto().external_input)) - - @classmethod - def _create_control_op(cls, init_model, pred_model, n, opset_version): - control_inputs = [] - if '__control_inputs' in n.attrs: - control_inputs.extend(n.attrs['__control_inputs']) - node = cls._common_onnx_node_to_caffe2_op(init_model, pred_model, n, opset_version) - node.control_input.extend(control_inputs) - return Caffe2Ops([node], [], []) - - @classmethod - def _remove_ssa(cls, net, remap_dict): - for op in net.op: - for i, name in enumerate(op.output): - if name in remap_dict: - op.output[i] = remap_dict[name] - for i, out in enumerate(net.external_output): - if out in remap_dict: - net.external_output[i] = remap_dict[out] - - @classmethod - def _create_if(cls, init_model, pred_model, n, opset_version): - ops = cls._create_control_op(init_model, pred_model, n, opset_version) - assert ops[0][0].type == 'If' - if_op = ops[0][0] - then_net = else_net = None - control_inputs = [] - for arg in if_op.arg: - if arg.name == 'then_net': - then_net = arg.n - if arg.name == 'else_net': - else_net = arg.n - if arg.name == '__control_inputs': - control_inputs = arg.strings - - assert then_net and else_net - then_net_outs = then_net.external_output - else_net_outs = else_net.external_output - op_outputs = if_op.output - assert len(then_net_outs) == len(else_net_outs) - assert len(else_net_outs) == len(op_outputs) - - for arg in if_op.arg: - if arg.name == 'then_net': - arg.n.external_input.extend(control_inputs) - if arg.name == 'else_net': - arg.n.external_input.extend(control_inputs) - - return ops - - @classmethod - def _create_loop(cls, init_model, pred_model, n, opset_version): - ops = cls._create_control_op(init_model, pred_model, n, opset_version) - assert ops[0][0].type == 'ONNXWhile' - while_op = ops[0][0] - while_op.arg.extend([caffe2.python.utils.MakeArgument('has_trip_count', True)]) - while_op.arg.extend([caffe2.python.utils.MakeArgument('has_cond', True)]) - while_op.arg.extend([caffe2.python.utils.MakeArgument('disable_scopes', True)]) - control_inputs = [] - for arg in while_op.arg: - if arg.name == '__control_inputs': - control_inputs = arg.strings - num_loop_carried_deps = 0 - for arg in while_op.arg: - if arg.name == 'body': - num_loop_carried_deps = len(arg.n.external_input) - 2 - arg.n.external_input.extend(control_inputs) - while_op.arg.extend([ - caffe2.python.utils.MakeArgument('num_loop_carried_deps', - num_loop_carried_deps) - ]) - - return ops - - @classmethod - def _substitute_raw_value(cls, tp, raw_values_dict): - if tp.HasField('raw_data') and tp.raw_data == bytes(b'__EXTERNAL'): - if tp.name not in raw_values_dict: - raise RuntimeError('TensorProto for value {} referenced raw data but it was not found!'.format(tp.name)) - else: - tp.raw_data = raw_values_dict[tp.name] - - @classmethod - def _visit_and_substitute_raw_values(cls, nodes, raw_values_dict): - for node in nodes: - for attr in node.attribute: - if attr.HasField('t'): - cls._substitute_raw_value(attr.t, raw_values_dict) - for t in attr.tensors: - cls._substitute_raw_value(t, raw_values_dict) - if attr.HasField('g'): - 
cls._visit_and_substitute_raw_values(attr.g.node, raw_values_dict) - for g in attr.graphs: - cls._visit_and_substitute_raw_values(g.node, raw_values_dict) - - @classmethod - def _external_value_resolution_pass(cls, model, raw_values_dict): - for init in model.graph.initializer: - cls._substitute_raw_value(init, raw_values_dict) - - cls._visit_and_substitute_raw_values(model.graph.node, raw_values_dict) - - - @classmethod - def _direct_initialize_parameters(cls, initializer, ws, device_option): - for tp in initializer: - ws.FeedBlob(tp.name, onnx.numpy_helper.to_array(tp), device_option) - - @classmethod - def _direct_initialize_inputs(cls, inputs, initialized, ws, device_option): - for value_info in inputs: - if value_info.name in initialized: - continue - shape = list(d.dim_value for d in value_info.type.tensor_type.shape.dim) - ws.FeedBlob( - value_info.name, - np.ones(shape, dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[value_info.type.tensor_type.elem_type]), - device_option) - - @staticmethod - def optimize_onnx(input, init=False, predict=False): - passes = ['fuse_consecutive_transposes', - 'eliminate_nop_transpose', - 'fuse_transpose_into_gemm', - 'lift_lexical_references'] - if init: - passes.append('split_init') - if predict: - passes.append('split_predict') - try: - out = onnx.optimizer.optimize(input, passes) - except AttributeError: - warnings.warn("OptimizerWarning: optimizer module not found in ONNX version {}".format(onnx.__version__)) - # ONNX does not ship onnx.optimizer since version 1.9+ - import onnxoptimizer - out = onnxoptimizer.optimize(input, passes) - return out - - @classmethod - def prepare_zip_archive(cls, file, device='CPU', **kwargs): - with zipfile.ZipFile(file, mode='r') as z: - with z.open('__MODEL_PROTO', 'r') as f: - model = onnx.load(f) - blob_names = set(z.namelist()) - {'__MODEL_PROTO'} - # TODO: make this more efficient - raw_values_dict = {} - for name in blob_names: - with z.open(name, 'r') as blob_file: - raw_values_dict[name] = blob_file.read() - - return cls.prepare(model, device, raw_values_dict=raw_values_dict, **kwargs) - - @classmethod - def prepare(cls, model, device='CPU', raw_values_dict=None, **kwargs): - ''' - For Onnx Caffe2Backend, we require that init_graph doesn't initialize the actual input of the predict_graph, - - for example, if "img" is the input blob for the predict_net, we require that in init_graph and in - initializer of the predict_graph, "img" is not initialized. We don't have a check for this, since - there is no way we can know which blob is the input of the predict_graph. - ''' - if not kwargs.pop('no_check_UNSAFE', False): - super(Caffe2Backend, cls).prepare(model, device, **kwargs) - opset_version = None - for imp in model.opset_import: - if not imp.HasField("domain") or imp.domain == "": - opset_version = imp.version - if imp.version > cls._known_opset_version: - warnings.warn("This version of onnx-caffe2 targets ONNX operator set version {}, but the model we are trying to import uses version {}.
We will try to import it anyway, but if the model uses operators which had BC-breaking changes in the intervening versions, import will fail.".format(cls._known_opset_version, imp.version)) - else: - warnings.warn("Unrecognized operator set {}".format(imp.domain)) - if opset_version is None: - if model.ir_version >= 0x00000003: - raise RuntimeError("Model with IR version >= 3 did not specify ONNX operator set version (onnx-caffe2 requires it)") - else: - opset_version = 1 - - # Prior to onnx version update to onnx-1.8.0, errors caused by failures in - # in the onnx shape inference call were being suppressed. Hence a try-catch block - # is added around the infer_shapes call to avoid these failures and preserve status - try: - model = onnx.shape_inference.infer_shapes(model) - except RuntimeError: - warnings.warn("ShapeInferenceWarning: Inferred shape and existing shape differ in rank") - - ws = Workspace() - device_option = get_device_option(Device(device)) - - init_net, predict_net = cls._onnx_model_to_caffe2_net(model, device, opset_version, False) - - if raw_values_dict: - cls._external_value_resolution_pass(model, raw_values_dict) - - # Directly load initializer data into blobs in workspace - cls._direct_initialize_parameters( - model.graph.initializer, - ws, - device_option, - ) - - initialized = {init.name for init in model.graph.initializer} - - cls._direct_initialize_inputs( - model.graph.input, - initialized, - ws, - device_option, - ) - - uninitialized = [value_info.name for value_info in model.graph.input if value_info.name not in initialized] - - retval = Caffe2Rep(init_net, predict_net, ws, uninitialized) - return retval - - - @classmethod - # TODO: This method needs a refactor for clarity - def _onnx_node_to_caffe2_op(cls, init_model, pred_model, node_def, opset_version): - cbackend = C.Caffe2Backend(cls._dummy_name) - if cbackend.support_onnx_import(node_def.op_type): - - # extract value infos from pred model (value infos of - # node's inputs that are in init model should be all - # available in pred model) - value_infos = [] - for name in node_def.input: - if pred_model is not None: - for vi in itertools.chain(pred_model.graph.input, - pred_model.graph.output, - pred_model.graph.value_info): - if vi.name == name: - value_infos.append(vi.SerializeToString()) - - op_strs = cbackend.convert_node(node_def.SerializeToString(), value_infos, opset_version) - init_ops = [] - for s in op_strs[0]: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - init_ops.append(op) - ops = [] - for s in op_strs[1]: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - ops.append(op) - return Caffe2Ops(ops, init_ops, []) - - if node_def.op_type in cls._special_operators: - translator = getattr(cls, cls._special_operators[node_def.op_type]) - else: - translator = cls._common_onnx_node_to_caffe2_op - ops = translator(init_model, pred_model, OnnxNode(node_def), opset_version) - if isinstance(ops, Caffe2Ops): - return ops - if not isinstance(ops, collections.abc.Iterable): - ops = [ops] - return Caffe2Ops(ops, [], []) - - _broadcast_operators = { - 'Add', - 'Sub', - } - - @classmethod - def _common_onnx_node_to_caffe2_op(cls, init_model, pred_model, onnx_node, opset_version): - """ - This translator performs the basic translation of ONNX nodes into - Caffe2 operators. 
Besides doing a straightforward marshalling from - one format to another, it also does these extra things: - - - Renames operators based on '_renamed_operators' - - Renames attributes based on '_global_renamed_attrs' and - '_per_op_renamed_attrs' - - If you're writing a custom translator, consider calling this first, - and then fixing things up further. - """ - c2_op = caffe2_pb2.OperatorDef() - - c2_op.input.extend(onnx_node.inputs) - c2_op.output.extend(onnx_node.outputs) - c2_op.name = onnx_node.name - - - onnx_op_type = onnx_node.op_type - broken_version = cls._broken_operators.get(onnx_op_type, float('Inf')) - if broken_version <= opset_version: - raise ValueError( - "Don't know how to translate op {} in ONNX operator set v{} (I only support prior to v{})".format(onnx_op_type, opset_version, broken_version)) - c2_op.type = cls._renamed_operators.get(onnx_op_type, onnx_op_type) - if not core.IsOperator(c2_op.type): - raise ValueError( - "Don't know how to translate op {}".format(onnx_op_type)) - - def kmap(k): - if (onnx_op_type in cls._per_op_renamed_attrs and - k in cls._per_op_renamed_attrs[onnx_op_type]): - return cls._per_op_renamed_attrs[onnx_op_type][k] - if k in cls._global_renamed_attrs: - return cls._global_renamed_attrs[k] - return k - c2_op.arg.extend(onnx_node.attrs.caffe2(kmap=kmap)) - - if opset_version < 7: - # onnx opset 7 and newest caffe2 have adopted full onnx broadcast semantics - # so we don't need this hack anymore - if c2_op.type in cls._broadcast_operators: - already_broadcast = False - for arg in c2_op.arg: - if arg.name == 'broadcast': - already_broadcast = True - if not already_broadcast: - c2_op.arg.extend([caffe2.python.utils.MakeArgument('broadcast', 1)]) - - return c2_op - - @staticmethod - def _all_names_in_graph(graph): - if graph is None: - return set() - - names = set() - names.update(value_info.name for value_info in graph.input) - names.update(value_info.name for value_info in graph.output) - for node in graph.node: - names.update(node.input) - names.update(node.output) - return names - - @classmethod - def _graph_to_net(cls, onnx_graph, opset_version): - net = caffe2_pb2.NetDef() - for node in onnx_graph.node: - try: - c2ops = cls._onnx_node_to_caffe2_op( - None, None, node, opset_version) - except Exception as e: - print('ONNX FATAL:', e) - continue - net.op.extend(c2ops.init_ops) - net.op.extend(c2ops.ops) - net.external_input.extend(c2ops.interface_blobs) - net.external_output.extend( - value_info.name for value_info in onnx_graph.output) - net.external_input.extend( - value_info.name for value_info in onnx_graph.input) - return net - - @classmethod - def _onnx_model_to_caffe2_net(cls, onnx_model, device, opset_version, include_initializers): - device_option = get_device_option(Device(device)) - - # Prior to onnx version update to onnx-1.8.0, errors caused by failures in - # in the onnx shape inference call were being suppressed. 
Hence a try-catch block - is added around the infer_shapes call to avoid these failures and preserve the previous behavior - try: - onnx_model = onnx.utils.polish_model(onnx_model) - except RuntimeError: - warnings.warn("ShapeInferenceWarning: Inferred shape and existing shape differ in rank") - except AttributeError: - warnings.warn("ShapeInferenceWarning: utils module not found in ONNX version {}".format(onnx.__version__)) - - # Optimizer module has been removed in ONNX-1.9 or later, warn caller if that is the case - try: - init_model = cls.optimize_onnx(onnx_model, init=True) - pred_model = cls.optimize_onnx(onnx_model, predict=True) - except ModuleNotFoundError: - warnings.warn("OptimizerWarning: onnxoptimizer module not installed. " - "init_model and pred_model will not be split, which can cause a runtime error") - init_model = onnx_model - pred_model = onnx_model - - init_net = caffe2_pb2.NetDef() - pred_net = caffe2_pb2.NetDef() - - init_net.name = onnx_model.graph.name + '_init' - pred_net.name = onnx_model.graph.name + '_predict' - - if include_initializers: - init_net.op.extend(cls._create_tensor_filling_op(tp) for tp in onnx_model.graph.initializer) - - cls._dummy_name.reset(cls._all_names_in_graph(init_model.graph) | cls._all_names_in_graph(pred_model.graph)) - - errors = [] - for net, model in ( (init_net, init_model), (pred_net, pred_model) ): - net.device_option.CopyFrom(device_option) - for node in model.graph.node: - try: - c2ops = cls._onnx_node_to_caffe2_op( - init_model, pred_model, node, opset_version) - except Exception as e: - msg = 'Error while processing node: {}. Exception: {}'.format(node, e) - errors.append(msg) - print('ONNX FATAL:', msg, file=sys.stderr) - continue - init_net.op.extend(c2ops.init_ops) - net.op.extend(c2ops.ops) - net.external_input.extend(c2ops.interface_blobs) - net.external_output.extend( - value_info.name for value_info in model.graph.output) - net.external_input.extend( - value_info.name for value_info in model.graph.input) - - if len(errors) > 0: - raise RuntimeError( - "ONNX conversion failed, encountered {} errors:\n\n{}".format( - len(errors), "\n\n".join(errors))) - - return init_net, pred_net - - # wrapper for backwards compatibility - @classmethod - def onnx_graph_to_caffe2_net(cls, model, device="CPU", opset_version=_known_opset_version): - return cls._onnx_model_to_caffe2_net(model, device=device, opset_version=opset_version, include_initializers=True) - - @classmethod - def supports_device(cls, device_str): - device = Device(device_str) - if device.type == DeviceType.CPU: - return True - elif core.IsGPUDeviceType(device.type): - return workspace.has_gpu_support - return False - - @classmethod - def is_compatible(cls, model, device='CPU', **kwargs): - if hasattr(super(Caffe2Backend, cls), 'is_compatible') \ - and callable(super(Caffe2Backend, cls).is_compatible): - if not super(Caffe2Backend, cls).is_compatible(model, device, **kwargs): - return False - # TODO: should have an unsupported list of operators, be optimistic for now - return True - -prepare = Caffe2Backend.prepare - -prepare_zip_archive = Caffe2Backend.prepare_zip_archive - -run_node = Caffe2Backend.run_node - -run_model = Caffe2Backend.run_model - -supports_device = Caffe2Backend.supports_device # noqa - -is_compatible = Caffe2Backend.is_compatible diff --git a/caffe2/python/onnx/backend_cpp_rep.py b/caffe2/python/onnx/backend_cpp_rep.py deleted file mode 100644 index 6092d93da2a7..000000000000 --- a/caffe2/python/onnx/backend_cpp_rep.py +++ /dev/null @@ -1,51 +0,0 @@ -## 
@package onnx -# Module caffe2.python.onnx.backend_rep_cpp - - - - - - -from onnx.backend.base import BackendRep, namedtupledict - -# This is a wrapper around C++ Caffe2BackendRep, -# mainly to handle the different input and output types for convenience of Python -class Caffe2CppRep(BackendRep): - def __init__(self, cpp_rep): - super().__init__() - self.__core = cpp_rep - self.__external_outputs = cpp_rep.external_outputs() - self.__external_inputs = cpp_rep.external_inputs() - self.__uninitialized_inputs = cpp_rep.uninitialized_inputs() - - def init_net(self): - return self.__core.init_net() - - def pred_net(self): - return self.__core.pred_net() - - def external_outputs(self): - return self.__core.external_outputs() - - def external_inputs(self): - return self.__core.external_inputs() - - def run(self, inputs): - output_values = None - if isinstance(inputs, dict): - output_values = self.__core.run(inputs) - elif isinstance(inputs, list) or isinstance(inputs, tuple): - if len(inputs) != len(self.__uninitialized_inputs): - raise RuntimeError('Expected {} values for uninitialized ' - 'graph inputs ({}), but got {}.'.format( - len(self.__uninitialized_inputs), - ', '.join(self.__uninitialized_inputs), - len(inputs))) - input_map = {} - for k, v in zip(self.__uninitialized_inputs, inputs): - input_map[k] = v - output_values = self.__core.run(input_map) - else: - # single input - output_values = self.__core.run([inputs]) - return namedtupledict('Outputs', self.__external_outputs)(*output_values) diff --git a/caffe2/python/onnx/backend_rep.py b/caffe2/python/onnx/backend_rep.py deleted file mode 100644 index e9bc9438df9b..000000000000 --- a/caffe2/python/onnx/backend_rep.py +++ /dev/null @@ -1,65 +0,0 @@ -# @package onnx -# Module caffe2.python.onnx.backend_rep - - - - - -from caffe2.python import core -from caffe2.proto import caffe2_pb2 -from onnx.backend.base import BackendRep, namedtupledict - -class Caffe2Rep(BackendRep): - def __init__(self, init_net, predict_net, workspace, uninitialized): - super().__init__() - self.init_net = init_net - self.predict_net = predict_net - self.workspace = workspace - # The list of uninitialized external_inputs in workspace, we need this to - # pair the name with given sequence inputs. 
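Both removed wrappers (Caffe2CppRep above, Caffe2Rep here) normalize run() inputs the same way: a dict keyed by blob name, a positional list or tuple matched against the uninitialized inputs, or a single array. A minimal standalone sketch of that dispatch, with hypothetical names (normalize_inputs is illustrative, not part of the deleted API):

    def normalize_inputs(inputs, uninitialized):
        # Map user-supplied inputs onto the graph's uninitialized input names.
        if isinstance(inputs, dict):
            return dict(inputs)  # caller supplied blob names explicitly
        if isinstance(inputs, (list, tuple)):
            if len(inputs) != len(uninitialized):
                raise RuntimeError(
                    'Expected {} values for uninitialized graph inputs ({}), '
                    'but got {}.'.format(len(uninitialized),
                                         ', '.join(uninitialized), len(inputs)))
            return dict(zip(uninitialized, inputs))
        # Single tensor: bind it to the sole uninitialized input.
        return {uninitialized[0]: inputs}

    # e.g. normalize_inputs([x], ['data']) == {'data': x}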
- self.uninitialized = uninitialized - self.nets_created = False - self.ran_init_net = False - - @property - def _name_scope(self): - if self.predict_net.device_option.device_type == caffe2_pb2.CUDA: - return 'gpu_{}'.format(self.predict_net.device_option.device_id) - return '' - - def run(self, inputs, **kwargs): - super().run(inputs, **kwargs) - with core.DeviceScope(self.predict_net.device_option): - if isinstance(inputs, dict): - with core.NameScope(self._name_scope): - for key, value in inputs.items(): - self.workspace.FeedBlob(key, value) - elif isinstance(inputs, list) or isinstance(inputs, tuple): - if len(self.uninitialized) != len(inputs): - raise RuntimeError('Expected {} values for uninitialized ' - 'graph inputs ({}), but got {}.'.format( - len(self.uninitialized), - ', '.join(self.uninitialized), - len(inputs))) - for i, value in enumerate(inputs): - # namescope already baked into protobuf - self.workspace.FeedBlob(self.uninitialized[i], value) - else: - # single input - self.workspace.FeedBlob(self.uninitialized[0], inputs) - if not self.nets_created: - self.workspace.CreateNet(self.init_net) - self.workspace.CreateNet(self.predict_net) - self.nets_created = True - if not self.ran_init_net: - self.workspace.RunNet(self.init_net.name) - self.ran_init_net = True - self.workspace.RunNet(self.predict_net.name) - output_values = [] - for name in self.predict_net.external_output: - try: - output_values.append(self.workspace.FetchBlob(name)) - except Exception: - output_values.append(self.workspace.FetchInt8Blob(name)) - return namedtupledict('Outputs', - self.predict_net.external_output)(*output_values) diff --git a/caffe2/python/onnx/bin/__init__.py b/caffe2/python/onnx/bin/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/onnx/bin/conversion.py b/caffe2/python/onnx/bin/conversion.py deleted file mode 100644 index 7e469e514a73..000000000000 --- a/caffe2/python/onnx/bin/conversion.py +++ /dev/null @@ -1,88 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.bin.conversion - - - - - -import json - -from caffe2.proto import caffe2_pb2 -import click -from onnx import ModelProto - -from caffe2.python.onnx.backend import Caffe2Backend as c2 -import caffe2.python.onnx.frontend as c2_onnx - - -@click.command( - help='convert caffe2 net to onnx model', - context_settings={ - 'help_option_names': ['-h', '--help'] - } -) -@click.argument('caffe2_net', type=click.File('rb')) -@click.option('--caffe2-net-name', - type=str, - help="Name of the caffe2 net") -@click.option('--caffe2-init-net', - type=click.File('rb'), - help="Path of the caffe2 init net pb file") -@click.option('--value-info', - type=str, - help='A json string providing the ' - 'type and shape information of the inputs') -@click.option('-o', '--output', required=True, - type=click.File('wb'), - help='Output path for the onnx model pb file') -def caffe2_to_onnx(caffe2_net, - caffe2_net_name, - caffe2_init_net, - value_info, - output): - c2_net_proto = caffe2_pb2.NetDef() - c2_net_proto.ParseFromString(caffe2_net.read()) - if not c2_net_proto.name and not caffe2_net_name: - raise click.BadParameter( - 'The input caffe2 net does not have name, ' - '--caffe2-net-name must be provided') - c2_net_proto.name = caffe2_net_name or c2_net_proto.name - if caffe2_init_net: - c2_init_net_proto = caffe2_pb2.NetDef() - c2_init_net_proto.ParseFromString(caffe2_init_net.read()) - c2_init_net_proto.name = '{}_init'.format(caffe2_net_name) - else: - c2_init_net_proto = None - - if value_info: - 
value_info = json.loads(value_info) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=c2_net_proto, - init_net=c2_init_net_proto, - value_info=value_info) - - output.write(onnx_model.SerializeToString()) - - -@click.command( - help='convert onnx model to caffe2 net', - context_settings={ - 'help_option_names': ['-h', '--help'] - } -) -@click.argument('onnx_model', type=click.File('rb')) -@click.option('-o', '--output', required=True, - type=click.File('wb'), - help='Output path for the caffe2 net file') -@click.option('--init-net-output', - required=True, - type=click.File('wb'), - help='Output path for the caffe2 init net file') -def onnx_to_caffe2(onnx_model, output, init_net_output): - onnx_model_proto = ModelProto() - onnx_model_proto.ParseFromString(onnx_model.read()) - - init_net, predict_net = c2.onnx_graph_to_caffe2_net(onnx_model_proto) - init_net_output.write(init_net.SerializeToString()) - output.write(predict_net.SerializeToString()) diff --git a/caffe2/python/onnx/error.py b/caffe2/python/onnx/error.py deleted file mode 100644 index 1bac8290464d..000000000000 --- a/caffe2/python/onnx/error.py +++ /dev/null @@ -1,8 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.error - - - - -class BaseException(Exception): pass -class Unsupported(BaseException): pass diff --git a/caffe2/python/onnx/frontend.py b/caffe2/python/onnx/frontend.py deleted file mode 100644 index 25a843e949ff..000000000000 --- a/caffe2/python/onnx/frontend.py +++ /dev/null @@ -1,357 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.frontend - -"""Caffe2 Protobuf to ONNX converter - -To run this, you will need to have Caffe2 installed as well. -""" - - - - - -import collections -import itertools -import logging -import re - -from caffe2.python import core as caffe2_core -from onnx import (checker, helper, numpy_helper, mapping, - GraphProto, NodeProto, TensorProto, OperatorSetIdProto) -from onnx.helper import make_tensor_value_info, make_model -import numpy as np - -from caffe2.python.onnx.helper import c2_native_run_net - -import caffe2.python._import_c_extension as C - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -class Caffe2Frontend: - # This number controls the semantics of the operators we target. Whenever - # ONNX makes a BC-breaking change to semantics of operators, having this set - # to an accurate number will prevent our models from exporting. However, - # we should strive to keep this up-to-date as much as possible. 
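For context on the pinned opset below: an exported model carries an explicit opset import, and the ONNX checker validates every node against it. A minimal sketch using only the onnx package (the Relu graph is illustrative, not taken from this module):

    import onnx
    from onnx import helper, TensorProto

    graph = helper.make_graph(
        [helper.make_node('Relu', ['X'], ['Y'])],
        'pinned-opset-example',
        [helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 4])],
        [helper.make_tensor_value_info('Y', TensorProto.FLOAT, [1, 4])])
    # '' is the default ONNX domain; 9 mirrors target_opset_version below.
    model = helper.make_model(graph, opset_imports=[helper.make_opsetid('', 9)])
    onnx.checker.check_model(model)  # raises if a node doesn't fit the pinned opset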
- target_opset_version = 9 - - _renamed_operators = { - 'SpatialBN': 'BatchNormalization', - 'Conv1D': 'Conv', - 'Conv2D': 'Conv', - 'Conv3D': 'Conv', - 'ConvTranspose1D': 'ConvTranspose', - 'ConvTranspose2D': 'ConvTranspose', - 'ConvTranspose3D': 'ConvTranspose', - 'MaxPool1D': 'MaxPool', - 'MaxPool2D': 'MaxPool', - 'MaxPool3D': 'MaxPool', - 'AveragePool1D': 'AveragePool', - 'AveragePool2D': 'AveragePool', - 'AveragePool3D': 'AveragePool', - } - - # caffe2 arguments that are completely removed in onnx - _blocklist_caffe2_args = { - 'order': {b'NCHW'}, - 'cudnn_exhaustive_search': {0, 1}, - 'exhaustive_search': {0, 1}, - 'use_cudnn': {0, 1}, - } - - _global_renamed_args = { - 'kernels': 'kernel_shape', - } - - _per_op_renamed_args = { - 'Squeeze': {'dims': 'axes'}, - 'Transpose': {'axes': 'perm'}, - } - - _special_operators = {} - - # Dummy name generator - _dummy_name = C.DummyName() - - @classmethod - def dummy_name(cls): - return cls._dummy_name.new_dummy_name() - - @classmethod - def _common_caffe2_arg_to_onnx_attr(cls, op_def, arg): - # name - op_type = op_def.type - name = cls._global_renamed_args.get(arg.name, arg.name) - if op_type in cls._per_op_renamed_args: - # Per-op attribute renames override the global attribute renames - name = cls._per_op_renamed_args[op_type].get(arg.name, name) - - # value - if arg.HasField('f'): - value = arg.f - elif arg.HasField('i'): - value = arg.i - elif arg.HasField('s'): - value = arg.s - elif arg.floats: - value = arg.floats - elif arg.ints: - value = arg.ints - elif arg.strings: - value = arg.strings - else: - raise ValueError('Could not find data field in arg: {}'.format(arg)) - - if name in cls._blocklist_caffe2_args: - assert value in cls._blocklist_caffe2_args[arg.name] - return None - - return helper.make_attribute(name, value) - - @classmethod - def caffe2_arg_to_onnx_attr(cls, op_def, arg): - return cls._common_caffe2_arg_to_onnx_attr(op_def, arg) - - @classmethod - def _common_caffe2_op_to_onnx_node(cls, op_def, shapes): - node_def = NodeProto() - node_def.name = op_def.name - - node_def.op_type = cls._renamed_operators.get(op_def.type, op_def.type) - - node_def.input.extend(op_def.input) - node_def.output.extend(op_def.output) - - attrs = filter(None, [cls.caffe2_arg_to_onnx_attr(op_def, arg) - for arg in op_def.arg]) - node_def.attribute.extend(attrs) - - return node_def - - @classmethod - def caffe2_op_to_onnx_node(cls, op_def, shapes): - if C.support_onnx_export(op_def.type): - node_strs, tensor_strs = C.export_to_onnx(cls._dummy_name, op_def.SerializeToString(), shapes) - nodes = [] - for s in node_strs: - node = NodeProto() - node.ParseFromString(s) - nodes.append(node) - const_tensors = [] - for s in tensor_strs: - tensor = TensorProto() - tensor.ParseFromString(s) - const_tensors.append(tensor) - return nodes, const_tensors - elif op_def.type in cls._special_operators: - translator = getattr(cls, cls._special_operators[op_def.type]) - else: - translator = cls._common_caffe2_op_to_onnx_node - nodes = translator(op_def, shapes) - const_tensors = [] - if isinstance(nodes, tuple): - nodes, const_tensors = nodes - if not isinstance(nodes, collections.abc.Iterable): - nodes = [nodes] - return nodes, const_tensors - - @staticmethod - def _all_names_in_net(net): - if net is None: - return set() - - names = set() - names.update(net.external_input) - names.update(net.external_output) - for op in net.op: - names.update(op.input) - names.update(op.output) - return names - - @staticmethod - def _extract_value_info(tensor): - return 
make_tensor_value_info( - name=tensor.name, - elem_type=tensor.data_type, - shape=tensor.dims) - - @classmethod - def caffe2_net_to_onnx_graph(cls, - predict_net, - init_net=None, - value_info=None): - if value_info is None: - value_info = {} - if not isinstance(value_info, dict): - raise ValueError('Please pass value_info as a ' - 'name -> (type, shape) dictionary') - - cls._filter_fake_init(init_net, value_info) - cls._ssa_rewrite(predict_net, init_net, value_info) - - if init_net: - initializer = cls.caffe2_init_net_to_initializer(init_net) - value_info.update({init.name: (init.data_type, init.dims) - for init in initializer}) - else: - initializer = [] - - # Check if value_info contains the types/shapes of all the blobs, in - # which case we don't need to infer them by running the net. - run_native_net = False - for op in predict_net.op: - for name in itertools.chain(op.input, op.output): - if name not in value_info: - run_native_net = True - break - - # Check whether we have type/shape info for all inputs - missing = (set(list(predict_net.external_input)) - - set(value_info.keys())) - if missing: - raise RuntimeError('Could not find value info of inputs: {}'.format( - ', '.join(missing))) - - ws = None - outputs = None - if run_native_net: - inputs = {} - for name in predict_net.external_input: - elem_type, shape = value_info[name] - inputs[name] = np.random.randn(*shape).astype( - mapping.TENSOR_TYPE_TO_NP_TYPE[elem_type]) - - ws, outputs = c2_native_run_net( - init_net, - predict_net, - inputs) - - for name in predict_net.external_output: - output = outputs[name] - elem_type = mapping.NP_TYPE_TO_TENSOR_TYPE[output.dtype] - shape = output.shape - value_info[name] = (elem_type, shape) - - graph_def = GraphProto() - graph_def.name = predict_net.name - graph_def.initializer.extend(initializer) - # This is a mapping from Caffe2 names to ONNX names - graph_def.input.extend( - make_tensor_value_info( - name=name, - elem_type=value_info[name][0], - shape=value_info[name][1]) - for name in predict_net.external_input) - - cls._dummy_name.reset(cls._all_names_in_net(predict_net) | cls._all_names_in_net(init_net)) - - for op in predict_net.op: - shapes = {} - for name in itertools.chain(op.input, op.output): - if ws: - blob = ws.FetchBlob(name) - if hasattr(blob, 'shape'): - shapes[name] = blob.shape - else: - shapes[name] = value_info[name][1] - nodes, const_tensors = cls.caffe2_op_to_onnx_node(op, shapes=shapes) - graph_def.node.extend(nodes) - graph_def.initializer.extend(const_tensors) - graph_def.input.extend([cls._extract_value_info(tensor) for tensor in const_tensors]) - - all_output = set(sum((list(node.output) for node in graph_def.node), - [init.name for init in graph_def.initializer])) - redundant_output = set(vi.name for vi in graph_def.output) - all_output - if redundant_output: - logger.warning( - 'There are graph outputs not produced by any node or initializer: {}' - '! 
Will drop them.'.format(', '.join(redundant_output))) - graph_def.output.extend( - make_tensor_value_info( - name=name, - elem_type=value_info[name][0], - shape=value_info[name][1]) - for name in predict_net.external_output - if name in all_output) - - return graph_def - - @classmethod - def caffe2_init_net_to_initializer(cls, init_net): - ws, _ = c2_native_run_net(init_net=None, predict_net=init_net, inputs=[]) - output_names = [] - for op in init_net.op: - output_names.extend(op.output) - initializer = [numpy_helper.from_array(ws.FetchBlob(name), name=name) - for name in sorted(set(output_names))] - return initializer - - @classmethod - def _filter_fake_init(cls, init_net, value_info): - if init_net: - fake_inits = [op for op in init_net.op - if len(op.output) == 1 and op.output[0] in value_info and - re.match('GivenTensor.*Fill|ConstantFill', op.type)] - for fake_init in fake_inits: - init_net.op.remove(fake_init) - del fake_inits[:] - del fake_inits - - @classmethod - def ssa_rewrite(cls, net, init_net, value_info): - return cls._ssa_rewrite(net, init_net, value_info) - - @classmethod - def _ssa_rewrite(cls, net, init_net, value_info): - def ssa_name(name, version, version_cnt=None): - if version == 0: - return name - if version_cnt and len(version_cnt.get(name, {})) <= 1: - return name - return '{}_{}'.format(name, version) - - if init_net: - for op in init_net.op: - assert re.match('GivenTensor.*Fill', op.type), "type is {}, \n{}".format(op.type, op) - assert len(op.output) == 1 - - ssa, blob_versions = caffe2_core.get_ssa(net) - version_cnt = {} - versioned_blobs = [] - for versioned_input, versioned_output in ssa: - versioned_blobs += versioned_input - versioned_blobs += versioned_output - - for (name, version) in versioned_blobs: - if name not in version_cnt: - version_cnt[name] = {version} - else: - version_cnt[name].add(version) - - assert len(net.op) == len(ssa) - for op, (versioned_inputs, versioned_outputs) in zip(net.op, ssa): - op.input[:] = [ssa_name(name, version, version_cnt) - for name, version in versioned_inputs] - op.output[:] = [ssa_name(name, version, version_cnt) - for name, version in versioned_outputs] - net.external_output[:] = [ssa_name(name, blob_versions[name], version_cnt) - for name in net.external_output] - - @classmethod - def caffe2_net_to_onnx_model(cls, *args, **kwargs): - opset_id = OperatorSetIdProto() - opset_id.domain = '' # ONNX default domain - opset_id.version = cls.target_opset_version - model = make_model(cls.caffe2_net_to_onnx_graph(*args, **kwargs), - opset_imports=[opset_id], # current supported opset version - producer_name='onnx-caffe2', # producer name - ) - checker.check_model(model) - return model - - -caffe2_net_to_onnx_graph = Caffe2Frontend.caffe2_net_to_onnx_graph -caffe2_net_to_onnx_model = Caffe2Frontend.caffe2_net_to_onnx_model -caffe2_init_net_to_initializer = Caffe2Frontend.caffe2_init_net_to_initializer -ssa_rewrite = Caffe2Frontend.ssa_rewrite diff --git a/caffe2/python/onnx/helper.py b/caffe2/python/onnx/helper.py deleted file mode 100644 index 6e73a5d5c95d..000000000000 --- a/caffe2/python/onnx/helper.py +++ /dev/null @@ -1,120 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.helper - - - - - -from caffe2.proto import caffe2_pb2 -from onnx.backend.base import namedtupledict - -from caffe2.python.onnx.workspace import Workspace -import logging -import time - - -log = logging.getLogger(__name__) - - -def c2_native_run_op(op_def, inputs): - ws = Workspace() - if isinstance(inputs, dict): - for key, value in 
inputs.items(): - ws.FeedBlob(key, value, op_def.device_option) - else: - assert(len(op_def.input) == len(inputs)) - for key, value in zip(op_def.input, inputs): - ws.FeedBlob(key, value, op_def.device_option) - - ws.RunOperatorOnce(op_def) - - output_names = op_def.output - output_values = [ws.FetchBlob(name) for name in output_names] - return ws, namedtupledict('Outputs', output_names)(*output_values) - - -def c2_native_run_net(init_net, predict_net, inputs, debug_arg=None): - ws = Workspace() - if init_net: - ws.RunNetOnce(init_net) - - if isinstance(inputs, dict): - for key, value in inputs.items(): - ws.FeedBlob(key, value, predict_net.device_option) - else: - uninitialized = [input_name - for input_name in predict_net.external_input - if not ws.HasBlob(input_name)] - if len(uninitialized) == len(inputs): - for key, value in zip(uninitialized, inputs): - ws.FeedBlob(key, value, predict_net.device_option) - else: - # If everything is initialized, - # we just initialize the first len(inputs) external_inputs. - # Added some extra logging to help debug sporadic sandcastle failures - if len(inputs) > len(predict_net.external_input): - print("c2_native_run_net assert. len(inputs)=", len(inputs), - "len(predict_net.external_input)=", - len(predict_net.external_input)) - print("debug_arg: ", debug_arg) - print("predict_net ", type(predict_net), ":", predict_net) - print("inputs ", type(inputs), ":", inputs) - assert(len(inputs) <= len(predict_net.external_input)) - for i in range(len(inputs)): - ws.FeedBlob(predict_net.external_input[i], inputs[i], - predict_net.device_option) - - ws.RunNetOnce(predict_net) - - output_names = predict_net.external_output - output_values = [ws.FetchBlob(name) for name in output_names] - return ws, namedtupledict('Outputs', output_names)(*output_values) - - -def load_caffe2_net(file): - net = caffe2_pb2.NetDef() - with open(file, "rb") as f: - net.ParseFromString(f.read()) - return net - - -def save_caffe2_net(net, file, output_txt=False): - with open(file, "wb") as f: - f.write(net.SerializeToString()) - if output_txt: - with open(file + "txt", "w") as f: - f.write(str(net)) - - -def benchmark_caffe2_model(init_net, predict_net, warmup_iters=3, main_iters=10, layer_details=True): - ''' - Run the benchmark net on the target model. - Return the execution time per iteration (millisecond). - ''' - ws = Workspace() - if init_net: - ws.RunNetOnce(init_net) - ws.CreateNet(predict_net) - results = ws.BenchmarkNet(predict_net.name, warmup_iters, main_iters, layer_details) - del ws - return results[0] - - -def benchmark_pytorch_model(model, inputs, training=False, warmup_iters=3, - main_iters=10, verbose=False): - ''' - Run the model several times and measure the execution time. - Return the execution time per iteration (millisecond). 
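The body that follows implements the usual warmup-then-measure pattern; the same pattern in a self-contained sketch (time_per_iter_ms is a hypothetical helper, not part of the deleted module):

    import time

    def time_per_iter_ms(fn, warmup_iters=3, main_iters=10):
        # Discard warm-up runs so one-time costs don't skew the average.
        for _ in range(warmup_iters):
            fn()
        start = time.time()
        for _ in range(main_iters):
            fn()
        return (time.time() - start) * 1000.0 / main_iters

    # e.g. time_per_iter_ms(lambda: sum(range(100000)))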
- ''' - for _i in range(warmup_iters): - model(*inputs) - total_pytorch_time = 0.0 - for _i in range(main_iters): - ts = time.time() - model(*inputs) - te = time.time() - total_pytorch_time += te - ts - log.info("The PyTorch model execution time per iter is {} milliseconds, " - "{} iters per second.".format(total_pytorch_time / main_iters * 1000, - main_iters / total_pytorch_time)) - return total_pytorch_time * 1000 / main_iters diff --git a/caffe2/python/onnx/onnxifi.py b/caffe2/python/onnx/onnxifi.py deleted file mode 100644 index 973d06378e20..000000000000 --- a/caffe2/python/onnx/onnxifi.py +++ /dev/null @@ -1,68 +0,0 @@ -## @package onnx -#Module caffe2.python.onnx.onnxifi - -""" -ONNXIFI a Caffe2 net -""" - -from caffe2.proto import caffe2_pb2 -import caffe2.python._import_c_extension as C - - -def onnxifi_set_option(option_name, option_value): - """ - Set onnxifi option - """ - return C.onnxifi_set_option(option_name, str(option_value)) - - -def onnxifi_get_option(option_name): - """ - Get onnxifi option - """ - return C.onnxifi_get_option(option_name) - -def onnxifi_caffe2_net( - pred_net, - input_shapes, - max_batch_size=1, - max_seq_size=1, - debug=False, - use_onnx=True, - merge_fp32_inputs_into_fp16=False, - adjust_batch=True, - block_list=None, - weight_names=None, - net_ssa_rewritten=False, - timeout=0): - """ - Transform the caffe2_net by collapsing ONNXIFI-runnable nodes into Onnxifi c2 ops - """ - shape_hints = caffe2_pb2.TensorBoundShapes() - if type(input_shapes) is caffe2_pb2.TensorBoundShapes: - shape_hints = input_shapes - elif type(input_shapes) is dict: - for k, v in input_shapes.items(): - tbs = caffe2_pb2.TensorBoundShape() - tbs.name = k - tbs.shape.dims.extend(v) - tbs.dim_type.extend([caffe2_pb2.TensorBoundShape.CONSTANT] * len(tbs.shape.dims)) - tbs.dim_type[0] = caffe2_pb2.TensorBoundShape.BATCH - shape_hints.shapes.extend([tbs]) - shape_hints.max_batch_size = max_batch_size - shape_hints.max_feature_len = max_seq_size - pred_net_str = C.onnxifi(pred_net.SerializeToString(), - shape_hints.SerializeToString(), - block_list if block_list else [], - weight_names if weight_names is not None else [], - max_batch_size, - max_seq_size, - timeout, - adjust_batch, - debug, - merge_fp32_inputs_into_fp16, - net_ssa_rewritten, - use_onnx) - pred_net_cut = caffe2_pb2.NetDef() - pred_net_cut.ParseFromString(pred_net_str) - return pred_net_cut diff --git a/caffe2/python/onnx/test_onnxifi.py b/caffe2/python/onnx/test_onnxifi.py deleted file mode 100644 index 73a120616985..000000000000 --- a/caffe2/python/onnx/test_onnxifi.py +++ /dev/null @@ -1,199 +0,0 @@ - - - - - -import numpy as np -import time -import unittest - -import onnx -import onnx.defs -from onnx.backend.base import namedtupledict -from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -from caffe2.python.models.download import ModelDownloader -from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase - -ONNXIFI_DATATYPE_FLOAT32 = 1 - - -def _print_net(net): - for i in net.external_input: - print("Input: {}".format(i)) - for i in net.external_output: - print("Output: {}".format(i)) - for op in net.op: - print("Op {}".format(op.type)) - for x in op.input: - print(" input: {}".format(x)) - for y in op.output: - print(" output: {}".format(y)) - - -class OnnxifiTest(TestCase): - @unittest.skip("Need ONNXIFI backend support") - def test_relu_graph(self): - 
batch_size = 1 - X = np.random.randn(batch_size, 1, 3, 2).astype(np.float32) - graph_def = make_graph( - [make_node("Relu", ["X"], ["Y"])], - name="test", - inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, - [batch_size, 1, 3, 2])], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, - [batch_size, 1, 3, 2])]) - model_def = make_model(graph_def, producer_name='relu-test') - op = core.CreateOperator( - "Onnxifi", - ["X"], - ["Y"], - onnx_model=model_def.SerializeToString(), - input_names=["X"], - output_names=["Y"], - output_shape_hint_0=[ONNXIFI_DATATYPE_FLOAT32, batch_size, 1, 3, 2]) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(op) - Y = workspace.FetchBlob("Y") - np.testing.assert_almost_equal(Y, np.maximum(X, 0)) - - @unittest.skip("Need ONNXIFI backend support") - def test_conv_graph(self): - X = np.array([[[[0., 1., 2., 3., 4.], # (1, 1, 5, 5) input tensor - [5., 6., 7., 8., 9.], - [10., 11., 12., 13., 14.], - [15., 16., 17., 18., 19.], - [20., 21., 22., 23., 24.]]]]).astype(np.float32) - W = np.array([[[[1., 1., 1.], # (1, 1, 3, 3) tensor for convolution weights - [1., 1., 1.], - [1., 1., 1.]]]]).astype(np.float32) - Y_without_padding = np.array([[[[54., 63., 72.], # (1, 1, 3, 3) output tensor - [99., 108., 117.], - [144., 153., 162.]]]]).astype(np.float32) - graph_def = make_graph( - [make_node( - 'Conv', - inputs=['X', 'W'], - outputs=['Y'], - kernel_shape=[3, 3], - # Default values for other attributes: strides=[1, 1], dilations=[1, 1], groups=1 - pads=[0, 0, 0, 0], - )], - name="test", - inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, [1, 1, 5, 5]), - make_tensor_value_info("W", onnx.TensorProto.FLOAT, [1, 1, 3, 3]), - ], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, - [1, 1, 3, 3])]) - model_def = make_model(graph_def, producer_name='conv-test') - # We intentionally rewrite the input/output names to test that the - # input/output binding of the c2 op is positional - op = core.CreateOperator( - "Onnxifi", - ["X0"], - ["Y0"], - onnx_model=model_def.SerializeToString(), - initializers=["W", "W0"], - input_names=["X"], - output_names=["Y"], - output_shape_hint_0=[ONNXIFI_DATATYPE_FLOAT32, 1, 1, 3, 3]) - workspace.FeedBlob("X0", X) - workspace.FeedBlob("W0", W) - workspace.RunOperatorOnce(op) - Y = workspace.FetchBlob("Y0") - np.testing.assert_almost_equal(Y, Y_without_padding) - - -class OnnxifiTransformTest(TestCase): - def setUp(self): - self.model_downloader = ModelDownloader() - - def _add_head_tail(self, pred_net, new_head, new_tail): - orig_head = pred_net.external_input[0] - orig_tail = pred_net.external_output[0] - - # Add head - head = caffe2_pb2.OperatorDef() - head.type = "Copy" - head.input.append(new_head) - head.output.append(orig_head) - dummy = caffe2_pb2.NetDef() - dummy.op.extend(pred_net.op) - del pred_net.op[:] - pred_net.op.extend([head]) - pred_net.op.extend(dummy.op) - pred_net.external_input[0] = new_head - - # Add tail - tail = caffe2_pb2.OperatorDef() - tail.type = "Copy" - tail.input.append(orig_tail) - tail.output.append(new_tail) - pred_net.op.extend([tail]) - pred_net.external_output[0] = new_tail - - @unittest.skip("Need ONNXIFI backend support") - def test_resnet50_core(self): - N = 1 - repeat = 1 - print("Batch size: {}, repeat inference {} times".format(N, repeat)) - init_net, pred_net, _ = self.model_downloader.get_c2_model('resnet50') - self._add_head_tail(pred_net, 'real_data', 'real_softmax') - input_blob_dims = (N, 3, 224, 224) - input_name = "real_data" - - device_option = 
core.DeviceOption(caffe2_pb2.CPU, 0) - init_net.device_option.CopyFrom(device_option) - pred_net.device_option.CopyFrom(device_option) - for op in pred_net.op: - op.device_option.CopyFrom(device_option) - net_outputs = pred_net.external_output - Y_c2 = None - data = np.random.randn(*input_blob_dims).astype(np.float32) - c2_time = 1 - workspace.SwitchWorkspace("onnxifi_test", True) - with core.DeviceScope(device_option): - workspace.FeedBlob(input_name, data) - workspace.RunNetOnce(init_net) - workspace.CreateNet(pred_net) - start = time.time() - for _ in range(repeat): - workspace.RunNet(pred_net.name) - end = time.time() - c2_time = end - start - output_values = [workspace.FetchBlob(name) for name in net_outputs] - Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values) - workspace.ResetWorkspace() - - # Fill the workspace with the weights - with core.DeviceScope(device_option): - workspace.RunNetOnce(init_net) - - # Cut the graph - start = time.time() - pred_net_cut = onnxifi_caffe2_net(pred_net, - {input_name: input_blob_dims}, - infer_shapes=True) - del init_net, pred_net - #_print_net(pred_net_cut) - - Y_trt = None - input_name = pred_net_cut.external_input[0] - print("C2 runtime: {}s".format(c2_time)) - with core.DeviceScope(device_option): - workspace.FeedBlob(input_name, data) - workspace.CreateNet(pred_net_cut) - end = time.time() - print("Conversion time: {:.2f}s".format(end - start)) - - start = time.time() - for _ in range(repeat): - workspace.RunNet(pred_net_cut.name) - end = time.time() - trt_time = end - start - print("Onnxifi runtime: {}s, improvement: {}%".format(trt_time, (c2_time - trt_time) / c2_time * 100)) - output_values = [workspace.FetchBlob(name) for name in net_outputs] - Y_trt = namedtupledict('Outputs', net_outputs)(*output_values) - np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3) diff --git a/caffe2/python/onnx/tests/__init__.py b/caffe2/python/onnx/tests/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/onnx/tests/c2_ref_test.py b/caffe2/python/onnx/tests/c2_ref_test.py deleted file mode 100644 index aab5a04a169c..000000000000 --- a/caffe2/python/onnx/tests/c2_ref_test.py +++ /dev/null @@ -1,869 +0,0 @@ -# @package onnx -# Module caffe2.python.onnx.tests.c2_ref_test - - - - - - -import os -import unittest - -from caffe2.python import core -from caffe2.proto import caffe2_pb2 - -import onnx -from onnx.helper import make_node, make_graph, make_tensor, make_tensor_value_info, make_model -from caffe2.python.onnx.helper import c2_native_run_net, c2_native_run_op - -from onnx import mapping -import caffe2.python.onnx.frontend as c2_onnx -import caffe2.python.onnx.backend as c2 - -import numpy as np -from caffe2.python.models.download import ModelDownloader - -from caffe2.python.onnx.tests.test_utils import TestCase - -import caffe2.python._import_c_extension as C - - -class TestCaffe2Basic(TestCase): - def test_dummy_name(self): - g = C.DummyName() - n1 = g.new_dummy_name() - n2 = g.new_dummy_name() - assert n1 != n2, "Got same names in different calls: {}".format(n1) - - def test_check_arguments(self): - b2 = C.Caffe2Backend() - - node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"]) - b2.convert_node(node_def.SerializeToString()) - - bad_node_def = make_node("Add", inputs=["X", "Y"], outputs=["Z"], foo=42, bar=56) - with self.assertRaisesRegex(RuntimeError, - "Don't know how to map unexpected argument (foo|bar)"): - b2.convert_node(bad_node_def.SerializeToString()) - - def 
test_dynamicslice_3inputs_graph(self): - node_def = make_node( - "DynamicSlice", ["X1", "X2", "X3"], ["Y"]) - - graph_def = make_graph( - [node_def], - name="test", - inputs=[make_tensor_value_info("X1", onnx.TensorProto.FLOAT, (2, 4)), - make_tensor_value_info("X2", onnx.TensorProto.INT32, (1, 2)), - make_tensor_value_info("X3", onnx.TensorProto.INT32, (1, 2))], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, (1, 2))]) - model_def = make_model(graph_def, producer_name='caffe2-ref-test') - - x = [[1,2,3,4],[5,6,7,8]] - start = [0, 0] - end = [-1, 4] - prepared = c2.prepare(model_def) - output = prepared.run(inputs=[np.array(x), np.array(start), np.array(end)]) - self.assertSameOutputs(output[0], np.array(x)[0:-1, 0:4]) - - def test_dynamicslice_4inputs_graph(self): - node_def = make_node( - "DynamicSlice", ["X1", "X2", "X3", "axes"], ["Y"]) - graph_def = make_graph( - [node_def], - name="test", - inputs=[make_tensor_value_info("X1", onnx.TensorProto.FLOAT, (2, 4)), - make_tensor_value_info("X2", onnx.TensorProto.INT32, (1, 2)), - make_tensor_value_info("X3", onnx.TensorProto.INT32, (1, 2)), - make_tensor_value_info("axes", onnx.TensorProto.INT32, (1, 2))], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, (1, 2))]) - model_def = make_model(graph_def, producer_name='caffe2-ref-test') - x = [[1,2,3,4],[5,6,7,8]] - start = [0, 1] - end = [4, 5] - axes = [1, 0] - prepared = c2.prepare(model_def) - output = prepared.run(inputs=[np.array(x), np.array(start), np.array(end), np.array(axes)]) - self.assertSameOutputs(output[0], np.array(x)[1:5, 0:4]) - - def test_relu_graph(self): - X = np.random.randn(3, 2).astype(np.float32) - Y_ref = np.clip(X, 0, np.inf) - - node_def = make_node( - "Relu", ["X"], ["Y"]) - output = c2.run_node( - node_def, {"X": X}) - np.testing.assert_almost_equal(output.Y, Y_ref) - - graph_def = make_graph( - [node_def], - name="test", - inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, [3, 2])], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [3, 2])]) - c2_rep = c2.prepare(make_model(graph_def, producer_name='caffe2-ref-test')) - output = c2_rep.run(X) - np.testing.assert_almost_equal(output.Y, Y_ref) - - def test_elementwiselinear(self): - X = np.random.randn(4, 2, 5, 7, 3).astype(np.float32) - W = np.random.randn(21).astype(np.float32) - B = np.random.randn(21).astype(np.float32) - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-elementwiselinear-net' - predict_net.external_input[:] = ['X', 'W', 'B'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'ElementwiseLinear', - inputs=['X', 'W', 'B'], - outputs=['Y'], - axis=3, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X, W, B]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape), - 'W': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[W.dtype], W.shape), - 'B': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[B.dtype], B.shape), - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X, W, B]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_initializer(self): - X = np.array([[1, 2], [3, 4]]).astype(np.float32) - Y = np.array([[1, 2], [3, 4]]).astype(np.float32) - weight = np.array([[1, 0], [0, 1]]) - graph_def = make_graph( - [make_node("Add", ["X", "Y"], ["Z0"]), - make_node("Cast", ["Z0"], ["Z"], to=onnx.TensorProto.FLOAT), - make_node("Mul", ["Z", "weight"], 
["W0"]), - make_node("Tanh", ["W0"], ["W1"]), - make_node("Sigmoid", ["W1"], ["W2"]), - make_node("Scale", ["W2"], ["W3"], scale=-1.0)], - name="test_initializer", - inputs=[ - make_tensor_value_info("X", onnx.TensorProto.FLOAT, (2, 2)), - make_tensor_value_info("Y", onnx.TensorProto.FLOAT, (2, 2)), - make_tensor_value_info("weight", onnx.TensorProto.FLOAT, (2, 2)), - ], - outputs=[ - make_tensor_value_info("W3", onnx.TensorProto.FLOAT, (2, 2)) - ], - initializer=[make_tensor("weight", - onnx.TensorProto.FLOAT, - [2, 2], - weight.flatten().astype(float))] - ) - - def sigmoid(x): - return 1 / (1 + np.exp(-x)) - - W_ref = -sigmoid(np.tanh((X + Y) * weight)) - c2_rep = c2.prepare(make_model(graph_def, producer_name='caffe2-ref-test')) - output = c2_rep.run({"X": X, "Y": Y}) - np.testing.assert_almost_equal(output["W3"], W_ref) - - def test_reducemean(self): - X = np.random.randn(4, 6, 10, 5, 3).astype(np.float32) - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-reducemean-net' - predict_net.external_input[:] = ['X'] - predict_net.external_output[:] = [ - 'reduce_front_mean', - 'reduce_back_mean', - 'reduce_mean_0', - 'reduce_mean_1', - ] - predict_net.op.extend([ - core.CreateOperator( - 'ReduceFrontMean', - inputs=['X'], - outputs=['reduce_front_mean'], - num_reduce_dim=2, - ), - core.CreateOperator( - 'ReduceBackMean', - inputs=['X'], - outputs=['reduce_back_mean'], - num_reduce_dim=2, - ), - core.CreateOperator( - 'ReduceMean', - inputs=['X'], - outputs=['reduce_mean_0'], - axes=[1, 3], - keepdims=0, - ), - core.CreateOperator( - 'ReduceMean', - inputs=['X'], - outputs=['reduce_mean_1'], - axes=[1, 3], - keepdims=1, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_upsample(self): - X = np.random.randn(1, 1, 2, 2).astype(np.float32) - width_scale = 2.0 - height_scale = 2.0 - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-upsample-net' - predict_net.external_input[:] = ['X'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'ResizeNearest', - inputs=['X'], - outputs=['Y'], - width_scale=width_scale, - height_scale=height_scale, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_fc(self): - X_fake = np.zeros((3, 1, 3, 1, 7), dtype=np.float32) - X = np.random.randn(5, 2, 3, 1, 7).astype(np.float32) - W = np.random.randn(11, 21).astype(np.float32) - B = np.random.randn(11).astype(np.float32) - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-fc-net' - predict_net.external_input[:] = ['X', 'W', 'B'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'FC', - inputs=['X', 'W', 'B'], - outputs=['Y'], - axis=2, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X, W, B]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 
'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X_fake.shape), - 'W': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[W.dtype], W.shape), - 'B': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[B.dtype], B.shape), - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X, W, B]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_gemm(self): - # simple - A = np.random.randn(3, 2).astype(np.float32) - B = np.random.randn(2, 4).astype(np.float32) - C = np.random.randn(3, 4).astype(np.float32) - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"]) - output = c2.run_node(node_def, [A, B, C]) - np.testing.assert_almost_equal(output["Y"], np.dot(A, B) + C) - - # transA - A = np.transpose(A) - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - transA=1) - output = c2.run_node(node_def, [A, B, C]) - np.testing.assert_almost_equal( - output["Y"], - np.dot(np.transpose(A), B) + C) - # revert A - A = np.transpose(A) - - # transB - B = np.transpose(B) - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - transB=1) - output = c2.run_node(node_def, [A, B, C]) - np.testing.assert_almost_equal( - output["Y"], - np.dot(A, np.transpose(B)) + C) - # revert B - B = np.transpose(B) - - # scale - alpha = np.random.random() - beta = np.random.random() - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta) - output = c2.run_node(node_def, [A, B, C]) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, B) + beta * C) - - # setup broadcastable C - C = np.random.randn(4).astype(np.float32) - - # broadcast for opset7 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta) - output = c2.run_node(node_def, [A, B, C], opset_version=7) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, B) + beta * C) - # broadcast for opset3 and 6 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta, - broadcast=1) - output = c2.run_node(node_def, [A, B, C], opset_version=6) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, B) + beta * C) - - # transB - B = np.transpose(B) - - # transB and broadcast for opset7 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta, - transB=1) - output = c2.run_node(node_def, [A, B, C], opset_version=7) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, np.transpose(B)) + beta * C) - # transB and broadcast for opset3 and 6 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta, - broadcast=1, - transB=1) - output = c2.run_node(node_def, [A, B, C], opset_version=6) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, np.transpose(B)) + beta * C) - - # revert B - B = np.transpose(B) - # set a scalar to C - C = np.random.randn(1).astype(np.float32) - - # scalar broadcast for opset7 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta) - output = c2.run_node(node_def, [A, B, C], opset_version=7) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, B) + beta * C) - # scalar broadcast for opset3 and 6 - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=alpha, - beta=beta, - broadcast=1) - output = c2.run_node(node_def, [A, B, C], opset_version=6) - np.testing.assert_almost_equal( - output["Y"], - alpha * np.dot(A, B) + beta * C) - - def test_gemm_conversion(self): - node_def = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=2., - beta=3.) 
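Every variant in this test exercises the same Gemm contract, Y = alpha * op(A) @ op(B) + beta * C, where op() is an optional transpose. A NumPy reference for that contract (gemm_ref is an illustrative name), handy for sanity-checking any of the conversions below:

    import numpy as np

    def gemm_ref(A, B, C, alpha=1.0, beta=1.0, transA=False, transB=False):
        # NumPy reference for ONNX Gemm: Y = alpha * op(A) @ op(B) + beta * C.
        A = A.T if transA else A
        B = B.T if transB else B
        return alpha * np.dot(A, B) + beta * C

    # e.g. gemm_ref(np.ones((3, 2)), np.ones((2, 4)), np.zeros(4), alpha=2., beta=3.)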
- node_def_broadcast = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=2., - beta=3., - broadcast=1) - node_def_transpose_b = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=2., - beta=3., - transB=1) - - node_def_transpose_b_broadcast = make_node( - 'Gemm', - ['A', 'B', 'C'], - ["Y"], - alpha=2., - beta=3., - transB=1, - broadcast=1) - - backend = C.Caffe2Backend() - - # without broadcast and without shape info, gemm will be - # converted to matmul + add - _, op_strs = backend.convert_node(node_def.SerializeToString()) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - - # opset7 - # If C is a 1d tensor, gemm will be converted to FC/FCTransposed - _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - - _, op_strs = backend.convert_node(node_def.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) - - # opset6 without broadcast(C should match A*B's dim) - # The gemm will be converted to matmul + add, since the FC requires c - # to be 1d tensor. - _, op_strs = backend.convert_node(node_def.SerializeToString( - ), [make_tensor_value_info("A", onnx.TensorProto.FLOAT, (3,2)).SerializeToString(), - make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,3)).SerializeToString(), - make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,3)).SerializeToString()], - 6) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - - # opset6 with broadcast - # If C is a 1d tensor, gemm will be converted to FC/FCTransposed - _, op_strs = backend.convert_node(node_def_transpose_b_broadcast.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], - 6) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - - _, op_strs = backend.convert_node(node_def_broadcast.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (3,)).SerializeToString()], - 6) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) - - # opset7 - # If C is a scalar and B's last dim is 1, gemm will be converted to FC/FCTransposed - _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( - ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (1,2)).SerializeToString(), - make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FC']) - - _, op_strs = backend.convert_node(node_def.SerializeToString( - ), 
[make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,1)).SerializeToString(), - make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'FCTransposed']) - # If C is a scalar and B's last dim is not 1, gemm will be converted - # to matmul + add. - _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( - ), [make_tensor_value_info("B", onnx.TensorProto.FLOAT, (2,2)).SerializeToString(), - make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - # If C is a scalar and B's shape info is not available, - # gemm will be converted to matmul + add. - _, op_strs = backend.convert_node(node_def_transpose_b.SerializeToString( - ), [make_tensor_value_info("C", onnx.TensorProto.FLOAT, (1,)).SerializeToString()], - 7) - op_names = [] - for s in op_strs: - op = caffe2_pb2.OperatorDef() - op.ParseFromString(s) - op_names.append(op.type) - self.assertEqual(op_names, ['Scale', 'Scale', 'MatMul', 'Add']) - - def test_mergedim(self): - X = np.random.randn(2, 3, 1, 5).astype(np.float32) - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-mergedim-net' - predict_net.external_input[:] = ['X'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'MergeDim', - inputs=['X'], - outputs=['Y'], - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape), - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_tensor_filling_ops(self): - for dtype in [ - onnx.TensorProto.FLOAT, - onnx.TensorProto.DOUBLE, - onnx.TensorProto.BOOL, - onnx.TensorProto.INT8, - onnx.TensorProto.INT16, - onnx.TensorProto.INT32, - onnx.TensorProto.INT64, - onnx.TensorProto.UINT8, - onnx.TensorProto.UINT16, - onnx.TensorProto.UINT32, - ]: - shape = (1, 2, 3) - vals = np.random.randn(*shape) - if dtype != onnx.TensorProto.BOOL: - vals *= 5 - vals = vals.astype( - mapping.TENSOR_TYPE_TO_NP_TYPE[dtype]) - tensor = make_tensor( - name='test-tensor-{}'.format(dtype), - data_type=dtype, - dims=[1, 2, 3], - vals=vals.flatten().tolist(), - ) - op = c2.Caffe2Backend._create_tensor_filling_op(tensor) - self.assertEqual(len(op.input), 0) - self.assertEqual(op.output, [tensor.name]) - ws, output = c2_native_run_op(op, inputs=[]) - self.assertEqual(len(output), 1) - np.testing.assert_almost_equal(output[0], vals) - np.testing.assert_almost_equal(ws.FetchBlob(op.output[0]), vals) - - def test_tensor_filling_ops_c_backend(self): - for dtype in [ - onnx.TensorProto.FLOAT, - onnx.TensorProto.DOUBLE, - onnx.TensorProto.BOOL, - onnx.TensorProto.INT8, - onnx.TensorProto.INT16, - onnx.TensorProto.INT32, - onnx.TensorProto.INT64, - onnx.TensorProto.UINT8, - onnx.TensorProto.UINT16, - onnx.TensorProto.UINT32, - ]: - shape = (1, 2, 3) - vals = np.random.randn(*shape) - if dtype != onnx.TensorProto.BOOL: - vals *= 5 - vals = vals.astype( - mapping.TENSOR_TYPE_TO_NP_TYPE[dtype]) - tensor = make_tensor( - 
name='test-tensor-{}'.format(dtype), - data_type=dtype, - dims=[1, 2, 3], - vals=vals.flatten().tolist(), - ) - b = C.Caffe2Backend() - op = caffe2_pb2.OperatorDef() - op.ParseFromString(b._build_tensor_filling_op(tensor.SerializeToString(), '')) - self.assertEqual(len(op.input), 0) - self.assertEqual(op.output, [tensor.name]) - ws, output = c2_native_run_op(op, inputs=[]) - self.assertEqual(len(output), 1) - np.testing.assert_almost_equal(output[0], vals) - np.testing.assert_almost_equal(ws.FetchBlob(op.output[0]), vals) - - def test_concat(self): - I0 = np.random.randn(20, 4).astype(np.float32) - I1 = np.random.randn(20, 4).astype(np.float32) - for i in range(2): - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-concat-net' - predict_net.external_input[:] = ['I0', 'I1'] - predict_net.external_output[:] = ['Y', 'output_dim'] - predict_net.op.extend([ - core.CreateOperator( - 'Concat', - inputs=['I0', 'I1'], - outputs=['Y', 'output_dim'], - axis=1, - add_axis=(1 if i == 0 else 0), - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[I0, I1]) - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'I0': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[I0.dtype], I0.shape), - 'I1': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[I1.dtype], I1.shape), - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[I0, I1]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_slice(self): - X = np.random.randn(1, 2, 3).astype(np.float32) - starts = np.array([0, 1, 0], dtype=np.int32) - ends = np.array([-1, 2, 3], dtype=np.int32) - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-slice-net' - predict_net.external_input[:] = ['X'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'Slice', - inputs=['X'], - outputs=['Y'], - starts=starts, - ends=ends, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - def test_cast(self): - X = np.random.randn(1, 2, 3).astype(np.float32) - - for to_type in ['INT8', caffe2_pb2.TensorProto.INT8, - 'DOUBLE', caffe2_pb2.TensorProto.DOUBLE]: - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-cast-net' - predict_net.external_input[:] = ['X'] - predict_net.external_output[:] = ['Y'] - predict_net.op.extend([ - core.CreateOperator( - 'Cast', - inputs=['X'], - outputs=['Y'], - to=to_type, - ), - ]) - ws, c2_outputs = c2_native_run_net( - init_net=None, - predict_net=predict_net, - inputs=[X]) - - onnx_model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=predict_net, - value_info={ - 'X': (onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[X.dtype], X.shape) - }) - onnx_outputs = c2.run_model(onnx_model, inputs=[X]) - self.assertSameOutputs(c2_outputs, onnx_outputs) - - -class TestCaffe2End2End(TestCase): - def setUp(self): - self.model_downloader = ModelDownloader('ONNX_MODELS') - - def _test_net(self, - net_name, - input_blob_dims=(1, 3, 224, 224), - decimal=7): - np.random.seed(seed=0) - try: - c2_init_net, c2_predict_net, value_info, debug_str = self.model_downloader.get_c2_model_dbg(net_name) - except Exception as e: - # catch IOError/OSError that is caused by FileNotFoundError and PermissionError - # This is helpful 
because sometimes we get errors due to gfs not being available - # get_c2_model_dbg wraps URLError/HTTPErrors into generic Exception - # Skip the tests if the model cannot be downloaded due to any of the above - print("\n_test_net exception: ", e) - self.skipTest(str(e)) - - # start to run the model and compare outputs - n, c, h, w = input_blob_dims - data = np.random.randn(n, c, h, w).astype(np.float32) - inputs = [data] - _, c2_outputs = c2_native_run_net(c2_init_net, c2_predict_net, inputs, debug_str) - del _ - - model = c2_onnx.caffe2_net_to_onnx_model( - predict_net=c2_predict_net, - init_net=c2_init_net, - value_info=value_info, - ) - c2_ir = c2.prepare(model) - onnx_outputs = c2_ir.run(inputs) - self.assertSameOutputs(c2_outputs, onnx_outputs, decimal=decimal) - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_alexnet(self): - self._test_net('bvlc_alexnet', decimal=4) - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_resnet50(self): - self._test_net('resnet50') - - @unittest.skipIf( - os.environ.get('JENKINS_URL') or os.environ.get('SKIP_IN_FB'), - 'Taking too long to download!') - def test_vgg16(self): - self._test_net('vgg16') - - @unittest.skipIf( - os.environ.get('JENKINS_URL') or os.environ.get('SKIP_IN_FB'), - 'Taking too long to download!') - def test_zfnet(self): - self._test_net('zfnet') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_inception_v1(self): - self._test_net('inception_v1', decimal=2) - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_inception_v2(self): - self._test_net('inception_v2') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_squeezenet(self): - self._test_net('squeezenet') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_densenet121(self): - self._test_net('densenet121') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_bvlc_googlenet(self): - self._test_net('bvlc_googlenet') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_bvlc_reference_caffenet(self): - self._test_net('bvlc_reference_caffenet') - - @unittest.skipIf( - os.environ.get('SKIP_IN_FB'), - 'Skip internally!') - def test_bvlc_reference_rcnn_ilsvrc13(self): - self._test_net('bvlc_reference_rcnn_ilsvrc13') - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/onnx/tests/conversion_test.py b/caffe2/python/onnx/tests/conversion_test.py deleted file mode 100644 index b8cf8d7479a8..000000000000 --- a/caffe2/python/onnx/tests/conversion_test.py +++ /dev/null @@ -1,364 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.tests.conversion_test - - - - - -import json -import tempfile -import textwrap -import traceback -import unittest -import zipfile - -from caffe2.proto import caffe2_pb2 -from caffe2.python import brew, core -from caffe2.python.model_helper import ModelHelper -from click.testing import CliRunner -import numpy as np -from onnx import helper, ModelProto, TensorProto -from caffe2.python.onnx.helper import c2_native_run_net - -from caffe2.python.onnx.bin.conversion import caffe2_to_onnx, onnx_to_caffe2 -import caffe2.python.onnx.backend as c2 -from caffe2.python.onnx.tests.test_utils import TestCase - - -class TestConversion(TestCase): - def _run_command(self, cmd, *args, **kwargs): - runner = CliRunner() - result = runner.invoke(cmd, *args, **kwargs) - 
self.assertEqual(result.exit_code, 0, textwrap.dedent(''' - Command exited with non-zero exit code: - output: {} - exception: {} - exc_info: {} - '''.format(result.output, - result.exception, - traceback.format_exception(*result.exc_info)))) - return result - - def test_caffe2_to_onnx(self): - caffe2_net = tempfile.NamedTemporaryFile() - caffe2_init_net = tempfile.NamedTemporaryFile() - output = tempfile.NamedTemporaryFile() - - model = ModelHelper(name='caffe2-to-onnx-test') - brew.relu(model, ["X"], "Y") - caffe2_net.write(model.net.Proto().SerializeToString()) - caffe2_net.flush() - - init_model = ModelHelper(name='caffe2-to-onnx-init-test') - init_model.net.GivenTensorFill([], 'X', shape=[2, 2], - values=np.zeros((2, 2)).flatten().astype(float)) - caffe2_init_net.write(init_model.net.Proto().SerializeToString()) - caffe2_init_net.flush() - - self._run_command( - caffe2_to_onnx, [ - caffe2_net.name, - '--caffe2-init-net', caffe2_init_net.name, - '--output', output.name, - ], - catch_exceptions=False, - ) - - onnx_model = ModelProto() - onnx_model.ParseFromString(output.read()) - self.assertEqual(len(onnx_model.graph.node), 1) - self.assertEqual(onnx_model.graph.node[0].op_type, 'Relu') - self.assertEqual(len(onnx_model.graph.initializer), 1) - self.assertEqual(onnx_model.graph.initializer[0].name, onnx_model.graph.input[0].name) - - def test_caffe2_to_onnx_value_info(self): - caffe2_net = tempfile.NamedTemporaryFile() - output = tempfile.NamedTemporaryFile() - - model = ModelHelper(name='caffe2-to-onnx-test') - brew.relu(model, ["X"], "Y") - caffe2_net.write(model.net.Proto().SerializeToString()) - caffe2_net.flush() - - args = [caffe2_net.name, '--output', output.name] - self.assertRaisesRegex(Exception, - 'value info', - self._run_command, caffe2_to_onnx, args) - - args.extend([ - '--value-info', - json.dumps({ - 'X': (TensorProto.FLOAT, (2, 2)), - })]) - self._run_command(caffe2_to_onnx, args) - - onnx_model = ModelProto() - onnx_model.ParseFromString(output.read()) - self.assertEqual(len(onnx_model.graph.node), 1) - self.assertEqual(onnx_model.graph.node[0].op_type, 'Relu') - self.assertEqual(len(onnx_model.graph.initializer), 0) - - @unittest.skip("Disabled due to onnx optimizer deprecation") - def test_onnx_to_caffe2(self): - onnx_model = tempfile.NamedTemporaryFile() - output = tempfile.NamedTemporaryFile() - init_net_output = tempfile.NamedTemporaryFile() - - node_def = helper.make_node( - "Mul", ["X", "W"], ["Y"]) - graph_def = helper.make_graph( - [node_def], - "test", - [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), - helper.make_tensor_value_info("W", TensorProto.FLOAT, (1, 3))], - [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 3))], - initializer=[helper.make_tensor("W", - TensorProto.FLOAT, - [1, 3], - np.zeros((1, 3)).flatten().astype(float))]) - model_def = helper.make_model(graph_def, producer_name='onnx-to-caffe2-test') - onnx_model.write(model_def.SerializeToString()) - onnx_model.flush() - - self._run_command( - onnx_to_caffe2, [ - onnx_model.name, - '--output', output.name, - '--init-net-output', init_net_output.name, - ]) - - caffe2_net = caffe2_pb2.NetDef() - caffe2_net.ParseFromString(output.read()) - self.assertEqual(len(caffe2_net.op), 1) - self.assertEqual(caffe2_net.op[0].type, 'Mul') - - caffe2_init_net = caffe2_pb2.NetDef() - caffe2_init_net.ParseFromString(init_net_output.read()) - self.assertEqual(len(caffe2_init_net.op), 1) - self.assertEqual(set(sum([list(init_op.output) - for init_op in caffe2_init_net.op], [])), - {'W'}) - 
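The assertion above pins down the init/predict split: ONNX initializers ('W' here) become ops in the caffe2 init net, while true runtime inputs stay in the predict net. The same partition can be read directly off an ONNX graph; a standalone sketch with illustrative names:

    import numpy as np
    from onnx import helper, numpy_helper, TensorProto

    W = np.zeros((1, 3), dtype=np.float32)
    graph = helper.make_graph(
        [helper.make_node('Mul', ['X', 'W'], ['Y'])],
        'split-example',
        [helper.make_tensor_value_info('X', TensorProto.FLOAT, (2, 3)),
         helper.make_tensor_value_info('W', TensorProto.FLOAT, (1, 3))],
        [helper.make_tensor_value_info('Y', TensorProto.FLOAT, (2, 3))],
        initializer=[numpy_helper.from_array(W, name='W')])
    init_names = {t.name for t in graph.initializer}   # {'W'} -> init net
    runtime_inputs = [i.name for i in graph.input if i.name not in init_names]
    assert runtime_inputs == ['X']                      # stays in the predict net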
- def test_onnx_to_caffe2_zipfile(self): - buf = tempfile.NamedTemporaryFile() - onnx_model = zipfile.ZipFile(buf, 'w') - - node_def = helper.make_node( - "MatMul", ["X", "W"], ["Y"]) - X = np.random.rand(2, 3).astype(np.float32) - W = np.random.rand(3, 2).flatten().astype(np.float32) - graph_def = helper.make_graph( - [node_def], - "test", - [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), - helper.make_tensor_value_info("W", TensorProto.FLOAT, (3, 2))], - [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 2))], - initializer=[helper.make_tensor("W", - TensorProto.FLOAT, - [3, 2], - W.tobytes(), - raw=True)]) - model_def = helper.make_model(graph_def, producer_name='onnx-to-caffe2-test') - onnx_model.writestr('__MODEL_PROTO', model_def.SerializeToString()) - onnx_model.writestr('W', W.tobytes()) - onnx_model.close() - - W = W.reshape((3, 2)) - Y_expect = np.matmul(X, W) - - c2_model = c2.prepare_zip_archive(buf) - Y = c2_model.run(X).Y - np.testing.assert_allclose(Y, Y_expect) - - def _make_fake_if_op(self, true_nodes, false_nodes, output_types): - true = helper.make_tensor("condition", TensorProto.BOOL, (), [True]) - true_graph = helper.make_graph(true_nodes, "true_graph", [], [ - helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 2)), - ]) - false_graph = helper.make_graph(false_nodes, "false_graph", [], [ - helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 2)), - ]) - if_inputs = ["condition"] - if_outputs = [name for _, _, name in output_types] - retval_nodes = [ - helper.make_node("Constant", [], ["condition"], value=true), - helper.make_node("If", if_inputs, if_outputs, then_branch=true_graph, - else_branch=false_graph) - ] - return retval_nodes - - def test_onnx_to_caffe2_if(self): - true_nodes = [helper.make_node( - "MatMul", ["X", "W"], ["Y"])] - false_nodes = [helper.make_node("Slice", ["X"], ["Y"], axes=[0, 1], - starts=[0, 0], ends=[2, 2])] - nodes = self._make_fake_if_op(true_nodes, false_nodes, [(TensorProto.FLOAT, (2, 2), "Y")]) - X = np.random.rand(2, 3).astype(np.float32) - W = np.random.rand(3, 2).flatten().astype(np.float32) - graph_def = helper.make_graph( - nodes, - "test", - [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 3)), - helper.make_tensor_value_info("W", TensorProto.FLOAT, (3, 2))], - [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 2))], - initializer=[helper.make_tensor("W", - TensorProto.FLOAT, - [3, 2], - W.tolist())] - ) - onnx_id = helper.make_opsetid("", 9) - model_def = helper.make_model(graph_def, producer_name='onnx-to-caffe2-test', - opset_imports=[onnx_id]) - - p = c2.prepare(model_def) - Y = np.matmul(X, W.reshape(3, 2)) - out = p.run(X) - np.testing.assert_allclose(out.Y, Y) - - # input_types and output_types are lists of triples of (name, type, shape) - def _make_fake_loop_op(self, body_nodes, input_types, output_types): - ten = helper.make_tensor("trip_count_value", TensorProto.INT64, (1,), [10]) - true = helper.make_tensor("condition", TensorProto.BOOL, (1,), [True]) - # lcd is a dummy loop-carried dependency that only exists because - # right now the schema checker is broken and assumes a variadic - # input needs at least one value. 
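# (ONNX Loop contract: the body graph receives (iteration_num, condition,
# loop-carried deps...) and must emit (condition, loop-carried deps...,
# scan outputs...); that is why "i" and "cond" are prepended to the
# user-specified input/output types below.)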
- graph_inputs = [helper.make_tensor_value_info("i", TensorProto.INT64, (1,)), - helper.make_tensor_value_info("cond", TensorProto.BOOL, (1,))] - for type, shape, name in input_types: - graph_inputs.append(helper.make_tensor_value_info("_" + name, type, shape)) - graph_outputs = [helper.make_tensor_value_info("cond", TensorProto.BOOL, (1,))] - for type, shape, name in output_types: - graph_outputs.append(helper.make_tensor_value_info("_" + name, type, shape)) - body_graph = helper.make_graph(body_nodes, "body_graph", graph_inputs, - graph_outputs) - loop_inputs = ["trip_count", "condition"] - loop_inputs.extend([name for _, _, name in input_types]) - loop_outputs = [name for _, _, name in output_types] - retval_nodes = [ - helper.make_node("Constant", [], ["trip_count"], value=ten), - helper.make_node("Constant", [], ["condition"], value=true), - helper.make_node("Loop", loop_inputs, loop_outputs, body=body_graph) - ] - return retval_nodes - - @unittest.skip("Disabled due to onnx optimizer deprecation") - def test_onnx_to_caffe2_loop(self): - body_nodes = [helper.make_node( - "MatMul", ["_X", "W"], ["_Y"])] - nodes = self._make_fake_loop_op(body_nodes, - [(TensorProto.FLOAT, (2, 2), "X")], - [(TensorProto.FLOAT, (2, 2), "Y")]) - X = np.random.rand(2, 2).astype(np.float32) - W = np.random.rand(2, 2).flatten().astype(np.float32) - graph_def = helper.make_graph( - nodes, - "test", - [helper.make_tensor_value_info("X", TensorProto.FLOAT, (2, 2)), - helper.make_tensor_value_info("W", TensorProto.FLOAT, (2, 2))], - [helper.make_tensor_value_info("Y", TensorProto.FLOAT, (2, 2))], - initializer=[helper.make_tensor("W", - TensorProto.FLOAT, - [2, 2], - W.tolist())] - ) - model_def = helper.make_model(graph_def, producer_name='onnx-to-caffe2-test') - Y = X - for _ in range(10): - Y = np.matmul(Y, W.reshape(2, 2)) - p = c2.prepare(model_def) - out = p.run(X) - np.testing.assert_allclose(out.Y, Y) - - # TODO investigate why this is failing after changing Reshape - # operator from taking the new shape as attribute to as input - @unittest.skip('Start failing after Reshape op change') - def test_convert_end2end(self): - predict_net_f = tempfile.NamedTemporaryFile() - init_net_f = tempfile.NamedTemporaryFile() - onnx_model_f = tempfile.NamedTemporaryFile() - - x = 'X' - w = 'W' - b = 'b' - y = 'Y' - - predict_net = caffe2_pb2.NetDef() - predict_net.name = 'test-convert-end2end' - predict_net.external_input[:] = [x, w, b] - predict_net.external_output[:] = [y] - predict_net.op.extend([ - core.CreateOperator( - 'FC', - inputs=[x, w, b], - outputs=[y], - axis=2, - ), - ]) - predict_net_f.write(predict_net.SerializeToString()) - predict_net_f.flush() - - init_net = caffe2_pb2.NetDef() - init_net.name = 'test-convert-end2end-init' - init_net.external_output[:] = [w, b] - x_val = np.random.randn(1, 3, 2).astype(np.float32) - w_val = np.random.randn(4, 2).astype(np.float32) - b_val = np.random.randn(4).astype(np.float32) - init_net.op.extend([ - core.CreateOperator( - 'GivenTensorFill', - [], - [w], - values=w_val, - shape=w_val.shape, - ), - core.CreateOperator( - 'GivenTensorFill', - [], - [b], - values=b_val, - shape=b_val.shape, - ), - ]) - init_net_f.write(init_net.SerializeToString()) - init_net_f.flush() - - y_val = np.matmul(x_val, w_val.transpose()) + b_val - for _ in range(5): - self._run_command( - caffe2_to_onnx, [ - predict_net_f.name, - '--caffe2-init-net', init_net_f.name, - '--output', onnx_model_f.name, - '--value-info', - json.dumps({ - x: (TensorProto.FLOAT, (1, 3, 2)), - }), - ], - 
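# catch_exceptions=False makes click's CliRunner re-raise any exception
# instead of capturing it into result.exception, so failures surface here.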
catch_exceptions=False, - ) - - onnx_model_f.seek(0) - onnx_model = ModelProto() - onnx_model.ParseFromString(onnx_model_f.read()) - np.testing.assert_almost_equal( - c2.run_model( - onnx_model, {onnx_model.graph.input[0].name: x_val}), - [y_val]) - - self._run_command( - onnx_to_caffe2, [ - onnx_model_f.name, - '--output', predict_net_f.name, - '--init-net-output', init_net_f.name, - ]) - predict_net_f.seek(0) - predict_net = caffe2_pb2.NetDef() - predict_net.ParseFromString(predict_net_f.read()) - init_net_f.seek(0) - init_net = caffe2_pb2.NetDef() - init_net.ParseFromString(init_net_f.read()) - x = predict_net.external_input[0] - np.testing.assert_almost_equal(c2_native_run_net(init_net=init_net, - predict_net=predict_net, - inputs={x: x_val})[1], - [y_val]) diff --git a/caffe2/python/onnx/tests/helper_test.py b/caffe2/python/onnx/tests/helper_test.py deleted file mode 100644 index 9000ad94fd9b..000000000000 --- a/caffe2/python/onnx/tests/helper_test.py +++ /dev/null @@ -1,34 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.tests.helper_test - - - - - - -import unittest - -from caffe2.python.onnx.tests.test_utils import TestCase -import caffe2.python._import_c_extension as C - - -class TestCaffe2Basic(TestCase): - def test_dummy_name(self): - g = C.DummyName() - g.reset() - names_1 = [g.new_dummy_name() for _ in range(3)] - g.reset() - names_2 = [g.new_dummy_name() for _ in range(3)] - self.assertEqual(names_1, names_2) - - g.reset(set(names_1)) - names_3 = [g.new_dummy_name() for _ in range(3)] - self.assertFalse(set(names_1) & set(names_3)) - - g.reset(set(names_1)) - names_4 = [g.new_dummy_name() for _ in range(3)] - self.assertFalse(set(names_1) & set(names_4)) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/onnx/tests/onnx_backend_test.py b/caffe2/python/onnx/tests/onnx_backend_test.py deleted file mode 100644 index 918a701db958..000000000000 --- a/caffe2/python/onnx/tests/onnx_backend_test.py +++ /dev/null @@ -1,204 +0,0 @@ -# @package onnx -# Module caffe2.python.onnx.tests.onnx_backend_test - - - - - - -import os -import unittest - -import onnx.backend.test - -import caffe2.python.onnx.backend as c2 -from caffe2.python import core - -core.SetEnginePref({}, {}) - -# This is a pytest magic variable to load extra plugins -pytest_plugins = 'onnx.backend.test.report', - -backend_test = onnx.backend.test.BackendTest(c2, __name__) - -backend_test.exclude(r'(test_hardsigmoid' # Does not support Hardsigmoid. - '|test_hardmax' # Does not support Hardmax. - '|test_.*FLOAT16.*' # Does not support Cast on Float16. - '|test_depthtospace.*' # Does not support DepthToSpace. - '|test_reduce_l1.*' # Does not support ReduceL1. - '|test_reduce_l2.*' # Does not support ReduceL2. - '|test_reduce_log_sum.*' # Does not support ReduceLogSum. - '|test_reduce_prod.*' # Does not support ReduceProd. - '|test_reduce_sum_square.*' # Does not support ReduceSumSquare - '|test_det.*' # Does not support Det - '|test_range.*' # Does not support Range - '|test_tile.*' # Tile's Caffe2 implementation needs some tweak - '|test_lstm.*' # Seems LSTM case has some problem - '|test_simple_rnn.*' # Seems simple RNN case has some problem - '|test_gru.*' # Seems GRU case has some problem - '|test_prelu.*' # PRelu is not compliant with ONNX yet - '|test_operator_repeat.*' # Tile is not compliant with ONNX yet - '|test_.*pool_.*same.*' # Does not support pool same. - '|test_.*pool_.*ceil.*' # Does not support pool same. 
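# (The adjacent string literals above and below concatenate into a single
# regex alternation that is passed to backend_test.exclude.)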
- '|test_maxpool_with_argmax.*' # MaxPool outputs indices in different format. - '|test_maxpool.*dilation.*' # MaxPool doesn't support dilation yet. - '|test_maxpool.*uint8.*' # MaxPool doesn't support uint8 yet. - '|test_convtranspose.*' # ConvTranspose needs some more complicated translation - '|test_mvn.*' # MeanVarianceNormalization is experimental and not supported. - '|test_dynamic_slice.*' # MeanVarianceNormalization is experimental and not supported. - '|test_eyelike.*' # Needs implementation - '|test_maxunpool.*' # Needs implementation - '|test_acosh.*' # Needs implementation - '|test_asinh.*' # Needs implementation - '|test_atanh.*' # Needs implementation - '|test_onehot.*' # Needs implementation - '|test_scan.*' # Needs implementation - '|test_isnan.*' # Needs implementation - '|test_scatter.*' # Should be similar to ScatterAssign - '|test_constantofshape_int.*' # Needs implementation - '|test_shrink.*' # Needs implementation - '|test_strnorm.*' # Needs implementation - '|test_nonzero.*' # Needs implementation - '|test_tfidfvectorizer.*' # Needs implementation - '|test_top_k.*' # opset 10 is not supported yet - '|test_resize.*' # opset 10 is not supported yet - '|test_slice.*' # opset 10 is not supported yet - '|test_.*qlinear.*' # Skip quantized op test - '|test_.*quantize.*' # Skip quantized op test - '|test_.*matmulinteger.*' # Skip quantized op test - '|test_.*convinteger.*' # Skip quantized op test - '|test_isinf.*' # Needs implementation - '|test_mod.*' # Needs implementation - '|test_nonmaxsuppression.*' # Needs implementation - '|test_reversesequence.*' # Needs implementation - '|test_roialign.*' # Needs implementation - '|test_bitshift.*' # Needs implementation - '|test_round.*' # Needs implementation - '|test_cumsum.*' # Needs implementation - '|test_clip.*' # opset 11 is not supported yet - '|test_gather_elements.*' # opset 11 is not supported yet - '|test_scatter.*' # opset 11 is not supported yet - '|test_unique.*' # opset 11 is not supported yet - '|test_gathernd.*' # opset 11 is not supported yet - '|test_dropout_random.*' # opset 12 is not supported - '|test_dropout_default.*' # opset 12 is not supported - '|test_einsum.*' # opset 12 is not supported - '|test_.*training.*' # training is not supported - '|test_.*_loss.*' # training is not supported - '|test_split_zero_size.*' # unsupported case - '|test_constantofshape_int_shape_zero.*' # unsupported case - '|test_constant_pad.*' # 1d pad is not supported - '|test_edge_pad.*' # 1d pad is not supported - '|test_reflect_pad.*' # 1d pad is not supported - '|test_gemm_default_no_bias.*' # no bias is not supported - '|test_gemm_default_scalar_bias.*' # incorrect type - '|test_sequence_.*' # type sequence is not supported yet - '|test_.*negative_ax.*' # negative axis is not supported yet - '|test_.*negative_ind.*' # negative axis is not supported yet - '|test_argmax_.*select_last_index.*' # unsupported case - '|test_argmin_.*select_last_index_.*' # unsupported case - '|test_celu.*' # unsupported case - '|test_gathernd.*' # unsupported case - '|test_greater_equal.*' # unsupported case - '|test_less_equal.*' # unsupported case - '|test_max_.*' # unsupported case - '|test_min_.*' # unsupported case - '|test_.*momentum_.*' # unsupported case - '|test_sce.*' # unsupported case - '|test_nllloss.*' # unsupported case - '|test_unfoldtodepth.*' # unsupported case - '|test_.*gradient.*' # no support for gradient op in c2-onnx - '|test_.*adagrad.*' # no support for gradient op in c2-onnx - '|test_.*loss.*' # no support for loss op 
in c2-onnx - '|test_.*adam.*' # no support for adam op - '|test_.*identity.*' # no support for identity op - ')') - -# Quick patch to unbreak master CI; the underlying failures are still being debugged. -backend_test.exclude('(test_cast_.*' - '|test_compress_.*' - '|test_Conv1d_.*cuda' - '|test_Conv3d_groups_cuda' - '|test_rnn_seq_length' - '|test_operator_add.*_cuda' - '|test_operator_lstm_cuda' - '|test_operator_rnn.*_cuda' - '|test_lrn_default_cuda)') - -# Temporarily skip some ONNX backend tests with broadcasting. -backend_test.exclude('(test_pow_bcast' - '|test_pow_types.*' - ')') - -# Temporarily skip some ONNX backend tests due to updates in opset 13. -backend_test.exclude('(test_if_.*' # added support for sequence type inputs - '|test_if_seq_.*' # added support for sequence type inputs - '|test_logsoftmax_.*' # axis attr default value changed from 1 to -1 - '|test_loop11_.*' # seg fault issue - '|test_loop16_.*' # seg fault issue - '|test_loop13_seq_.*' # no support for sequence inputs for scan input - '|test_reduce_sum_.*' # axes is now an input (not attr), added noop_with_empty_axes - '|test_softmax_.*' # axis attr default value changed from 1 to -1 - '|test_split_variable_parts_.*' # axes is now an input (not attr) - '|test_squeeze_.*' # axes is now an input (not attr) - '|test_unsqueeze_.*' # axes is now an input (not attr) - '|test_MaxPool1d_stride_padding_dilation_.*' - '|test_MaxPool2d_stride_padding_dilation_.*' - ')') - -# Temporarily skip some ONNX backend tests due to updates in opset 14. -backend_test.exclude('(test_add_uint8_.*' # uint8 dtype added - '|test_div_uint8_.*' # uint8 dtype added - '|test_hardswish_.*' # new operator added - '|test_mul_uint8_.*' # uint8 dtype added - '|test_sub_uint8_.*' # uint8 dtype added - '|test_tril_.*' # new operator added - '|test_triu_.*' # new operator added - '|test_identity_sequence_.*' # new operator added - '|test_reshape_allowzero_reordered_.*' - '|test_conv_with_autopad_same_.*' - ')') - -# Unsupported ops in opset 15 -backend_test.exclude('(test_bernoulli_.*' - '|test_castlike_.*' - '|test_optional_.*' - '|test_shape_end_.*' - '|test_shape_start_.*' - '|test_identity_opt_*' - '|test_loop16_seq_none_*' - '|test_if_opt_*' - ')') - -# Unsupported ops in opset 16 -backend_test.exclude('(test_gridsample_.*' - '|test_spacetodepth_.*' - ')') - -# Unsupported ops in opset 17 -backend_test.exclude('(test_layer_normalization_.*' - '|test_blackmanwindow_.*' - '|test_dft_.*' - '|test_hammingwindow_.*' - '|test_hannwindow_.*' - '|test_melweightmatrix_.*' - '|test_stft_.*' - '|test_sequencemap_.*' - ')') - -# Unsupported ops in opset 18 -backend_test.exclude('(test_center_crop_pad_.*' - '|test_col2im*' - '|test_bitwise*)') - -# Skip vgg to speed up CI -if 'JENKINS_URL' in os.environ: - backend_test.exclude(r'(test_vgg19|test_vgg)') - -# import all test cases at global scope to make them visible to python.unittest -globals().update(backend_test - .enable_report() - .test_cases) - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/onnx/tests/ssa_test.py b/caffe2/python/onnx/tests/ssa_test.py deleted file mode 100644 index 96f954037178..000000000000 --- a/caffe2/python/onnx/tests/ssa_test.py +++ /dev/null @@ -1,134 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.tests.ssa_test - - - - - - -import copy -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -from onnx import TensorProto - -import caffe2.python.onnx.frontend as c2_onnx -from caffe2.python.onnx.helper import c2_native_run_net -from 
caffe2.python.onnx.tests.test_utils import TestCase - - -class TestFrontendSSAConversion(TestCase): - def test_ssa(self): - X = np.random.randn(4, 2).astype(np.float32) - W = np.random.randn(3, 2).astype(np.float32) - b = np.random.randn(3).astype(np.float32) - s = np.random.randn(1).astype(np.float32) - np_result = X.dot(W.transpose()) + b + s - - net = caffe2_pb2.NetDef() - net.name = 'test-ssa' - net.external_input[:] = ['W', 'X', 'b', 's'] - net.op.extend([ - core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ['Y'] - ), - core.CreateOperator( - 'Add', - ['Y', 's'], - ['Y'], - broadcast=True, - ) - ]) - net.external_output[:] = ['Y'] - - init_net = caffe2_pb2.NetDef() - init_net.name = 'test-ssa-init' - init_net.op.extend([ - core.CreateOperator( - 'GivenTensorFill', - [], - ['W'], - values=W, - shape=W.shape, - ), - core.CreateOperator( - 'GivenTensorFill', - [], - ['b'], - values=b, - shape=b.shape, - ), - core.CreateOperator( - 'GivenTensorFill', - [], - ['s'], - values=s, - shape=s.shape, - ) - ]) - init_net.external_output[:] = ['W', 'b', 's'] - - _, orig_output = c2_native_run_net( - predict_net=net, - init_net=init_net, - inputs=[X]) - - value_info = {'X': (TensorProto.FLOAT, X.shape)} - c2_onnx.Caffe2Frontend._ssa_rewrite( - net, - init_net, - value_info) - - self.assertEqual(net.external_input, ['W', 'X', 'b', 's']) - self.assertEqual(net.op[0].input, ['X', 'W', 'b']) - self.assertEqual(net.op[0].output, ['Y_1']) - self.assertEqual(net.op[1].input, ['Y_1', 's']) - self.assertEqual(net.op[1].output, ['Y_2']) - self.assertEqual(net.external_output, ['Y_2']) - - self.assertEqual(init_net.external_input, []) - self.assertEqual(init_net.op[0].input, []) - self.assertEqual(init_net.op[0].output, ['W']) - self.assertEqual(init_net.op[1].input, []) - self.assertEqual(init_net.op[1].output, ['b']) - self.assertEqual(init_net.op[2].input, []) - self.assertEqual(init_net.op[2].output, ['s']) - self.assertEqual(init_net.external_output, ['W', 'b', 's']) - self.assertEqual(value_info, {'X': (TensorProto.FLOAT, X.shape)}) - - _, ssa_output = c2_native_run_net( - predict_net=net, - init_net=init_net, - inputs=[X]) - - self.assertSameOutputs(ssa_output, orig_output) - self.assertSameOutputs(ssa_output, [np_result]) - - def test_idempotence(self): - net = caffe2_pb2.NetDef() - net.name = 'test-idempotence' - net.external_input[:] = ['W', 'X', 'b', 's'] - net.op.extend([ - core.CreateOperator( - 'FC', - ['X', 'W', 'b'], - ['Y'] - ), - core.CreateOperator( - 'Add', - ['Y', 's'], - ['Z'], - broadcast=True, - ) - ]) - net.external_output[:] = ['Z'] - - value_info = {'X': (TensorProto.FLOAT, [4, 2])} - net_copy = copy.deepcopy(net) - c2_onnx.Caffe2Frontend._ssa_rewrite( - net_copy, - None, - value_info) - self.assertEqual(net, net_copy) diff --git a/caffe2/python/onnx/tests/test_utils.py b/caffe2/python/onnx/tests/test_utils.py deleted file mode 100644 index 8585624e1af8..000000000000 --- a/caffe2/python/onnx/tests/test_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.tests.test_utils - - - - - - -import unittest - -import numpy as np - -class TestCase(unittest.TestCase): - - def setUp(self): - np.random.seed(seed=0) - - def assertSameOutputs(self, outputs1, outputs2, decimal=7): - self.assertEqual(len(outputs1), len(outputs2)) - for o1, o2 in zip(outputs1, outputs2): - self.assertEqual(o1.dtype, o2.dtype) - np.testing.assert_almost_equal(o1, o2, decimal=decimal) - - def add_test_case(self, name, test_func): - if not name.startswith('test_'): - raise 
ValueError('Test name must start with test_: {}'.format(name)) - if hasattr(self, name): - raise ValueError('Duplicated test name: {}'.format(name)) - setattr(self, name, test_func) diff --git a/caffe2/python/onnx/workspace.py b/caffe2/python/onnx/workspace.py deleted file mode 100644 index b15ef1dd9186..000000000000 --- a/caffe2/python/onnx/workspace.py +++ /dev/null @@ -1,71 +0,0 @@ -## @package onnx -# Module caffe2.python.onnx.workspace - - - - - - -import uuid - -from caffe2.python import workspace - -# Separating out the context manager part so that users won't -# (mis-)use Workspace instances as context managers -class _WorkspaceCtx: - def __init__(self, workspace_id): - self.workspace_id = workspace_id - # A stack, so that the context manager is reentrant. - self.workspace_stack = [] - - def __enter__(self): - self.workspace_stack.append(workspace.CurrentWorkspace()) - workspace.SwitchWorkspace(self.workspace_id, create_if_missing=True) - - def __exit__(self, exc_type, exc_value, traceback): - w = self.workspace_stack.pop() - # Strictly speaking, create_if_missing here is unnecessary, since a user - # is not supposed to be allowed to destruct a workspace while we're in - # it. However, empirically, it has been observed that during abnormal - # shutdown, Caffe2 deletes its default workspace fairly early in the - # final calls to destructors. In this case, we may attempt to exit - # to a default workspace which no longer exists. create_if_missing=True - # will (harmlessly) recreate the workspace before we finally quit. - workspace.SwitchWorkspace(w, create_if_missing=True) - - -class Workspace: - """ - An object representing a Caffe2 workspace. It is not itself a context - manager (see _WorkspaceCtx above), but it supports every method supported - by caffe2.python.workspace; instead of running these operations - in the global workspace, it runs them in the workspace represented - by this object. When this object goes dead, the workspace (and all - nets and blobs within it) are freed. - - Why do we need this class? Caffe2's workspace model is very "global state" - oriented, in that there is always some ambient global workspace you are - working in which holds on to all of your networks and blobs. This class - makes it possible to work with workspaces more locally, and without - forgetting to deallocate everything in the end. - """ - def __init__(self): - # Caffe2 (apparently) doesn't provide any native method of generating - # a fresh, unused workspace, so we have to fake it by generating - # a unique ID and hoping it's not used already / will not be used - # directly in the future. - self._ctx = _WorkspaceCtx(str(uuid.uuid4())) - - def __getattr__(self, attr): - def f(*args, **kwargs): - with self._ctx: - return getattr(workspace, attr)(*args, **kwargs) - return f - - def __del__(self): - # NB: This is a 'self' call because we need to switch into the workspace - # we want to reset before we actually reset it. A direct call to - # workspace.ResetWorkspace() will reset the ambient workspace, which - # is not what we want. 
- self.ResetWorkspace() diff --git a/caffe2/python/operator_fp_exceptions_test.py b/caffe2/python/operator_fp_exceptions_test.py deleted file mode 100644 index 52cf75de79fa..000000000000 --- a/caffe2/python/operator_fp_exceptions_test.py +++ /dev/null @@ -1,40 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - -import numpy as np -import unittest - - -def setThrowIfFpExceptions(enabled): - core.GlobalInit(["caffe2", "--caffe2_operator_throw_if_fp_exceptions=%d" % (1 if enabled else 0)]) - - -class OperatorFPExceptionsTest(TestCase): - def test_fp_exception_divbyzero(self): - # This test asserts the following: - # - If the flag caffe2_operator_throw_if_fp_exceptions is set, - # floating point exceptions will be thrown - # - If the flag caffe2_operator_throw_if_fp_exceptions is not set, - # floating point exceptions will not be thrown - workspace.blobs["0"] = np.array([0.0], dtype=np.float32) - workspace.blobs["1"] = np.array([1.0], dtype=np.float32) - - net = core.Net("test_fp") - net.Div(["1", "0"], "out") - - for throw_if_fp_exceptions in (True, False): - setThrowIfFpExceptions(throw_if_fp_exceptions) - exception_raised = False - try: - workspace.RunNetOnce(net) - except Exception: - exception_raised = True - self.assertEqual(exception_raised, throw_if_fp_exceptions) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/operator_test/__init__.py b/caffe2/python/operator_test/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/operator_test/_utils.py b/caffe2/python/operator_test/_utils.py deleted file mode 100644 index 3ee1def89e71..000000000000 --- a/caffe2/python/operator_test/_utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -This file only exists since `torch.testing.assert_allclose` is deprecated, but used extensively throughout the tests in -this package. The replacement `torch.testing.assert_close` doesn't support one feature that is needed here: comparison -between numpy arrays and torch tensors. See https://github.com/pytorch/pytorch/issues/61844 for the reasoning why this -was removed. 
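In practice this lets a test compare mixed types directly, e.g.
assert_allclose(np.ones(2), torch.ones(2)): the non-tensor side is coerced
with torch.tensor before delegating to torch.testing.assert_close.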
-""" - -import torch -from typing import Tuple, Any, Optional - -_DTYPE_PRECISIONS = { - torch.float16: (1e-3, 1e-3), - torch.float32: (1e-4, 1e-5), - torch.float64: (1e-5, 1e-8), -} - - -def _get_default_rtol_and_atol(actual: torch.Tensor, expected: torch.Tensor) -> Tuple[float, float]: - actual_rtol, actual_atol = _DTYPE_PRECISIONS.get(actual.dtype, (0.0, 0.0)) - expected_rtol, expected_atol = _DTYPE_PRECISIONS.get(expected.dtype, (0.0, 0.0)) - return max(actual_rtol, expected_rtol), max(actual_atol, expected_atol) - - -def assert_allclose( - actual: Any, - expected: Any, - rtol: Optional[float] = None, - atol: Optional[float] = None, - equal_nan: bool = True, - msg: str = "", -) -> None: - if not isinstance(actual, torch.Tensor): - actual = torch.tensor(actual) - if not isinstance(expected, torch.Tensor): - expected = torch.tensor(expected, dtype=actual.dtype) - - if rtol is None and atol is None: - rtol, atol = _get_default_rtol_and_atol(actual, expected) - - torch.testing.assert_close( - actual, - expected, - rtol=rtol, - atol=atol, - equal_nan=equal_nan, - check_device=True, - check_dtype=False, - check_stride=False, - msg=msg or None, - ) \ No newline at end of file diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py deleted file mode 100644 index 47216d51500c..000000000000 --- a/caffe2/python/operator_test/activation_ops_test.py +++ /dev/null @@ -1,294 +0,0 @@ - - - - - -import numpy as np - -from hypothesis import given, assume, settings -import hypothesis.strategies as st - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.mkl_test_util as mu -import caffe2.python.serialized_test.serialized_test_util as serial - -from scipy.stats import norm - -import unittest - - -class TestActivations(serial.SerializedTestCase): - @given(X=hu.tensor(), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) - @settings(deadline=10000) - def test_relu(self, X, in_place, engine, gc, dc): - if gc == mu.mkl_do: - in_place = False - - op = core.CreateOperator( - "Relu", - ["X"], - ["X"] if in_place else ["Y"], - engine=engine, - ) - - def relu_ref(X): - return [np.maximum(X, 0.0)] - - # go away from the origin point to avoid kink problems - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - - self.assertReferenceChecks(gc, op, [X], relu_ref, ensure_outputs_are_inferred=True) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @given(N=st.integers(1, 10), M=st.integers(1, 10), in_place=st.booleans(), - **hu.gcs) - def test_relu_empty_input(self, N, M, in_place, gc, dc): - op = core.CreateOperator( - "Relu", - ["X"], - ["X"] if in_place else ["Y"], - ) - - def relu_ref(X): - return [np.maximum(X, 0.0)] - - X = np.random.randn(0, N, M).astype(np.float32) - - self.assertReferenceChecks(gc, op, [X], relu_ref, ensure_outputs_are_inferred=True) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @unittest.skipIf(not workspace.has_gpu_support, - "Relu for float16 can only run on GPU now.") - @given(X=hu.tensor(dtype=np.float16), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) - def test_relu_fp16(self, X, in_place, engine, gc, dc): - # fp16 is only supported on CUDA/HIP - assume(core.IsGPUDeviceType(gc.device_type)) - op = core.CreateOperator( - "Relu", - ["X"], - ["X"] if in_place 
else ["Y"], - engine=engine, - ) - - def relu_ref(X): - return [np.maximum(X, 0.0)] - - def relu_grad_ref(g_out, outputs, fwd_inputs): - dY = g_out - [Y] = outputs - dX = dY - dX[Y == 0] = 0 - return [dX] - - # go away from the origin point to avoid kink problems - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - - self.assertReferenceChecks( - gc, - op, - [X], - relu_ref, - output_to_grad="X" if in_place else "Y", - grad_reference=relu_grad_ref) - - @serial.given(X=hu.tensor(elements=hu.floats(-3.0, 3.0)), - n=hu.floats(min_value=0.5, max_value=2.0), - in_place=st.booleans(), **hu.gcs) - def test_relu_n(self, X, n, in_place, gc, dc): - op = core.CreateOperator( - "ReluN", - ["X"], - ["X"] if in_place else ["Y"], - n=n, - ) - - def relu_n_ref(X): - return [np.minimum(np.maximum(X, 0.0), n)] - - # go away from 0 and n to avoid kink problems - X += 0.04 * np.sign(X) - X[X == 0.0] += 0.04 - X -= n - X += 0.02 * np.sign(X) - X[X == 0.0] -= 0.02 - X += n - - self.assertReferenceChecks(gc, op, [X], relu_n_ref, ensure_outputs_are_inferred=True) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.005, - ensure_outputs_are_inferred=True) - - @serial.given(X=hu.tensor(), - alpha=hu.floats(min_value=0.1, max_value=2.0), - in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_elu(self, X, alpha, in_place, engine, gc, dc): - op = core.CreateOperator( - "Elu", - ["X"], - ["X"] if in_place else ["Y"], - alpha=alpha, - engine=engine, - ) - - def elu_ref(X): - Y = X - Y[X < 0] = alpha * (np.exp(X[X < 0]) - 1.0) - return [Y] - - # go away from the origin point to avoid kink problems - X += 0.04 * np.sign(X) - X[X == 0.0] += 0.04 - - self.assertReferenceChecks(gc, op, [X], elu_ref, ensure_outputs_are_inferred=True) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(min_dim=4, max_dim=4), - alpha=hu.floats(min_value=0.1, max_value=2.0), - inplace=st.booleans(), - shared=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - seed=st.sampled_from([20, 100]), - **hu.gcs) - @settings(deadline=10000) - def test_prelu(self, X, alpha, inplace, shared, order, seed, gc, dc): - np.random.seed(seed) - W = np.random.randn( - X.shape[1] if order == "NCHW" else X.shape[3]).astype(np.float32) - - if shared: - W = np.random.randn(1).astype(np.float32) - - # go away from the origin point to avoid kink problems - X += 0.04 * np.sign(X) - X[X == 0.0] += 0.04 - - def prelu_ref(X, W): - Y = X.copy() - W = W.reshape(1, -1, 1, 1) if order == "NCHW" \ - else W.reshape(1, 1, 1, -1) - assert len(X.shape) == 4 - neg_indices = X <= 0 - assert len(neg_indices.shape) == 4 - assert X.shape == neg_indices.shape - Y[neg_indices] = (Y * W)[neg_indices] - return (Y,) - - op = core.CreateOperator( - "PRelu", ["X", "W"], ["Y" if not inplace else "X"], - alpha=alpha, order=order) - self.assertReferenceChecks(gc, op, [X, W], prelu_ref, ensure_outputs_are_inferred=True) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X, W], [0]) - - if not inplace: - # Gradient check wrt X - self.assertGradientChecks(gc, op, [X, W], 0, [0], stepsize=1e-2, ensure_outputs_are_inferred=True) - # Gradient check wrt W - self.assertGradientChecks(gc, op, [X, W], 1, [0], stepsize=1e-2, ensure_outputs_are_inferred=True) - - @serial.given(X=hu.tensor(), - alpha=hu.floats(min_value=0.1, max_value=2.0), - inplace=st.booleans(), - **hu.gcs) - def 
test_leaky_relu(self, X, alpha, inplace, gc, dc): - # go away from the origin point to avoid kink problems - X += 0.04 * np.sign(X) - X[X == 0.0] += 0.04 - - def leaky_relu_ref(X): - Y = X.copy() - neg_indices = X <= 0 - Y[neg_indices] = Y[neg_indices] * alpha - return (Y,) - - op = core.CreateOperator( - "LeakyRelu", - ["X"], ["Y" if not inplace else "X"], - alpha=alpha) - self.assertReferenceChecks(gc, op, [X], leaky_relu_ref, - ensure_outputs_are_inferred=True) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(X=hu.tensor(), - inplace=st.booleans(), - **hu.gcs) - def test_leaky_relu_default(self, X, inplace, gc, dc): - # go away from the origin point to avoid kink problems - X += 0.04 * np.sign(X) - X[X == 0.0] += 0.04 - - def leaky_relu_ref(X): - Y = X.copy() - neg_indices = X <= 0 - Y[neg_indices] = Y[neg_indices] * 0.01 - return (Y,) - - op = core.CreateOperator( - "LeakyRelu", - ["X"], ["Y" if not inplace else "X"]) - self.assertReferenceChecks(gc, op, [X], leaky_relu_ref) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(X=hu.tensor(), - fast_gelu=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_gelu(self, X, fast_gelu, gc, dc): - op = core.CreateOperator( - "Gelu", - ["X"], - ["Y"], - fast_gelu=fast_gelu, - ) - - def gelu_ref(X): - return (X * norm.cdf(X),) - - tol = 1e-3 if fast_gelu else 1e-4 - self.assertReferenceChecks(gc, op, [X], gelu_ref, threshold=tol, - ensure_outputs_are_inferred=True) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], - ensure_outputs_are_inferred=True) - - - @given(n=st.integers(0, 6), m=st.integers(4, 6), - seed=st.integers(0, 1000), **hu.gcs_cpu_only) - def test_mish(self, n, m, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m).astype(np.float32) - - def mish_ref(X): - return (X * np.tanh(np.log1p(np.exp(X))),) - - op = core.CreateOperator( - "Mish", - ["X"], - ["Y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=mish_ref, - ensure_outputs_are_inferred=True, - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/adadelta_test.py b/caffe2/python/operator_test/adadelta_test.py deleted file mode 100644 index 6c40c379697f..000000000000 --- a/caffe2/python/operator_test/adadelta_test.py +++ /dev/null @@ -1,199 +0,0 @@ - - - - - -import functools - -import hypothesis -from hypothesis import given, settings, HealthCheck -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -class TestAdadelta(serial.SerializedTestCase): - @staticmethod - def ref_adadelta(param_in, - mom_in, - mom_delta_in, - grad, lr, - epsilon, - decay, - using_fp16=False): - param_in_f32 = param_in - mom_in_f32 = mom_in - mom_delta_in_f32 = mom_delta_in - if(using_fp16): - param_in_f32 = param_in.astype(np.float32) - mom_in_f32 = mom_in.astype(np.float32) - mom_delta_in_f32 = mom_delta_in.astype(np.float32) - - mom_out = decay * mom_in_f32 + (1.0 - decay) * grad * grad - new_grad = (np.sqrt(mom_delta_in_f32 + epsilon) / - np.sqrt(mom_out + epsilon)) * grad - param_out = param_in_f32 + lr * new_grad - mom_delta_out = decay * mom_delta_in_f32 + (1.0 - decay - ) * new_grad * new_grad - if(using_fp16): - return 
(param_out.astype(np.float16), mom_out.astype(np.float16), - mom_delta_out.astype(np.float16)) - else: - return (param_out.astype(np.float32), mom_out.astype(np.float32), - mom_delta_out.astype(np.float32)) - - @given(inputs=hu.tensors(n=4), - lr=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - decay=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs) - @settings(deadline=10000) - def test_adadelta(self, inputs, lr, epsilon, decay, gc, dc): - param, moment, moment_delta, grad = inputs - moment = np.abs(moment) - moment_delta = np.abs(moment_delta) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Adadelta", - ["param", "moment", "moment_delta", "grad", "lr"], - ["param", "moment", "moment_delta"], - epsilon=epsilon, - decay=decay, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, op, - [param, moment, moment_delta, grad, lr], - functools.partial(self.ref_adadelta, epsilon=epsilon, decay=decay)) - - # Suppress filter_too_much health check. - # Likely caused by `assume` call falling through too often. - @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given(inputs=hu.tensors(n=4), - lr=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - decay=hu.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs) - def test_sparse_adadelta(self, inputs, lr, epsilon, decay, gc, dc): - param, moment, moment_delta, grad = inputs - moment = np.abs(moment) - moment_delta = np.abs(moment_delta) - lr = np.array([lr], dtype=np.float32) - - # Create an indexing array containing values that are lists of indices, - # which index into grad - indices = np.random.choice(np.arange(grad.shape[0]), - size=np.random.randint(grad.shape[0]), replace=False) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "SparseAdadelta", - ["param", "moment", "moment_delta", "indices", "grad", "lr"], - ["param", "moment", "moment_delta"], - epsilon=epsilon, - decay=decay, - device_option=gc) - - def ref_sparse(param, moment, moment_delta, indices, grad, lr, decay, - ref_using_fp16): - param_out = np.copy(param) - moment_out = np.copy(moment) - moment_delta_out = np.copy(moment_delta) - for i, index in enumerate(indices): - param_out[index], moment_out[index], moment_delta_out[ - index] = self.ref_adadelta(param[index], moment[index], - moment_delta[index], grad[i], lr, - epsilon, decay, ref_using_fp16) - return (param_out, moment_out, moment_delta_out) - - ref_using_fp16_values = [False] - if gc == hu.gpu_do: - ref_using_fp16_values.append(True) - - for ref_using_fp16 in ref_using_fp16_values: - moment_i = None - moment_delta_i = None - param_i = None - if(ref_using_fp16): - moment_i = moment.astype(np.float16) - moment_delta_i = moment_delta.astype(np.float16) - param_i = param.astype(np.float16) - else: - moment_i = moment.astype(np.float32) - moment_delta_i = moment_delta.astype(np.float32) - param_i = param.astype(np.float32) - - self.assertReferenceChecks(gc, op, [ - param_i, moment_i, moment_delta_i, indices, grad, lr, decay, - ref_using_fp16 - ], ref_sparse) - - @given(inputs=hu.tensors(n=3), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - 
epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - decay=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs) - @settings(deadline=None) - def test_sparse_adadelta_empty(self, inputs, lr, epsilon, decay, gc, dc): - param, moment, moment_delta = inputs - moment = np.abs(moment) - lr = np.array([lr], dtype=np.float32) - - grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32) - indices = np.empty(shape=(0,), dtype=np.int64) - - hypothesis.note('indices.shape: %s' % str(indices.shape)) - - op = core.CreateOperator( - "SparseAdadelta", - ["param", "moment", "moment_delta", "indices", "grad", "lr"], - ["param", "moment", "moment_delta"], - epsilon=epsilon, - decay=decay, - device_option=gc) - - def ref_sparse_empty(param, moment, moment_delta, indices, grad, lr, decay): - param_out = np.copy(param) - moment_out = np.copy(moment) - moment_delta_out = np.copy(moment_delta) - return (param_out, moment_out, moment_delta_out) - - ref_using_fp16_values = [False] - if gc == hu.gpu_do: - ref_using_fp16_values.append(True) - - for ref_using_fp16 in ref_using_fp16_values: - moment_i = None - moment_delta_i = None - param_i = None - if(ref_using_fp16): - moment_i = moment.astype(np.float16) - moment_delta_i = moment_delta.astype(np.float16) - param_i = param.astype(np.float16) - else: - moment_i = moment.astype(np.float32) - moment_delta_i = moment_delta.astype(np.float32) - param_i = param.astype(np.float32) - - self.assertReferenceChecks( - gc, - op, - [param_i, moment_i, moment_delta_i, indices, grad, lr, decay], - ref_sparse_empty - ) diff --git a/caffe2/python/operator_test/adagrad_test.py b/caffe2/python/operator_test/adagrad_test.py deleted file mode 100644 index 17ecb2617a3c..000000000000 --- a/caffe2/python/operator_test/adagrad_test.py +++ /dev/null @@ -1,241 +0,0 @@ -import functools - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core -from caffe2.python.operator_test.adagrad_test_helper import ( - adagrad_sparse_test_helper, - ref_adagrad, -) -from hypothesis import HealthCheck, given, settings - - -class TestAdagrad(serial.SerializedTestCase): - @given( - inputs=hu.tensors(n=3), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - weight_decay=st.sampled_from([0.0, 0.1]), - **hu.gcs - ) - @settings(deadline=10000) - def test_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc): - param, momentum, grad = inputs - momentum = np.abs(momentum) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Adagrad", - ["param", "momentum", "grad", "lr"], - ["param", "momentum"], - epsilon=epsilon, - weight_decay=weight_decay, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, - op, - [param, momentum, grad, lr], - functools.partial(ref_adagrad, epsilon=epsilon, weight_decay=weight_decay), - ) - - @given( - inputs=hu.tensors(n=3), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - weight_decay=st.sampled_from([0.0, 0.1]), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_adagrad_output_effective_lr( - self, inputs, lr, epsilon, 
weight_decay, gc, dc - ): - param, momentum, grad = inputs - momentum = np.abs(momentum) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Adagrad", - ["param", "momentum", "grad", "lr"], - ["param", "momentum", "effective_lr"], - epsilon=epsilon, - weight_decay=weight_decay, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, - op, - [param, momentum, grad, lr], - functools.partial( - ref_adagrad, - epsilon=epsilon, - output_effective_lr=True, - weight_decay=weight_decay, - ), - ) - - @given( - inputs=hu.tensors(n=3), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_adagrad_output_effective_lr_and_update(self, inputs, lr, epsilon, gc, dc): - param, momentum, grad = inputs - momentum = np.abs(momentum) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Adagrad", - ["param", "momentum", "grad", "lr"], - ["param", "momentum", "effective_lr", "update"], - epsilon=epsilon, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, - op, - [param, momentum, grad, lr], - functools.partial( - ref_adagrad, epsilon=epsilon, output_effective_lr_and_update=True - ), - ) - - # Suppress filter_too_much health check. - # Likely caused by `assume` call falling through too often. - @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given( - inputs=hu.tensors(n=3), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - weight_decay=st.sampled_from([0.0, 0.1]), - **hu.gcs - ) - def test_sparse_adagrad(self, inputs, lr, epsilon, weight_decay, gc, dc): - adagrad_sparse_test_helper( - self, - inputs, - lr, - epsilon, - None, - ref_adagrad, - gc, - dc, - weight_decay=weight_decay, - ) - - @given( - inputs=hu.tensors(n=2), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - **hu.gcs - ) - @settings(deadline=10000) - def test_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc): - param, momentum = inputs - grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32) - - ref_using_fp16_values = [False] - if gc == hu.gpu_do: - ref_using_fp16_values.append(True) - - for ref_using_fp16 in ref_using_fp16_values: - if ref_using_fp16: - print("test_sparse_adagrad_empty with half precision embedding") - momentum_i = momentum.astype(np.float16) - param_i = param.astype(np.float16) - else: - print("test_sparse_adagrad_empty with full precision embedding") - momentum_i = momentum.astype(np.float32) - param_i = param.astype(np.float32) - - adagrad_sparse_test_helper( - self, - [param_i, momentum_i, grad], - lr, - epsilon, - None, - ref_adagrad, - gc, - dc, - ) - - # Suppress filter_too_much health check. - # Likely caused by `assume` call falling through too often. 
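# (Hypothesis raises filter_too_much when a test rejects an excessive share
# of generated examples; the sparse index subsampling here discards many.)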
- @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given( - inputs=hu.tensors(n=3), - lr=st.sampled_from([0.01, 0.99]), - epsilon=st.sampled_from([0.01, 0.99]), - weight_decay=st.sampled_from([0.0, 0.1]), - counter_halflife=st.sampled_from([-1, 5]), - **hu.gcs - ) - def test_row_wise_sparse_adagrad( - self, inputs, lr, epsilon, weight_decay, counter_halflife, gc, dc - ): - adagrad_sparse_test_helper( - self, - inputs, - lr, - epsilon, - None, - functools.partial(ref_adagrad, row_wise=True), - gc, - dc, - row_wise=True, - weight_decay=weight_decay, - counter_halflife=counter_halflife, - ) - - @given( - inputs=hu.tensors(n=2), - lr=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - epsilon=st.floats( - min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False - ), - **hu.gcs - ) - @settings(deadline=None) - def test_row_wise_sparse_adagrad_empty(self, inputs, lr, epsilon, gc, dc): - param, momentum = inputs - grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32) - adagrad_sparse_test_helper( - self, - [param, momentum, grad], - lr, - epsilon, - None, - ref_adagrad, - gc, - dc, - row_wise=True, - ) diff --git a/caffe2/python/operator_test/adagrad_test_helper.py b/caffe2/python/operator_test/adagrad_test_helper.py deleted file mode 100644 index 1fd017c4d2ac..000000000000 --- a/caffe2/python/operator_test/adagrad_test_helper.py +++ /dev/null @@ -1,165 +0,0 @@ -from functools import partial - -import caffe2.python.hypothesis_test_util as hu -import numpy as np -from caffe2.python import core - - -def ref_adagrad( - param_in, - mom_in, - grad, - lr, - epsilon, - using_fp16=False, - output_effective_lr=False, - output_effective_lr_and_update=False, - decay=1.0, - row_wise=False, - weight_decay=0.0, - counter_halflife=-1, - count=None, # only used when counter_halflife != -1 -): - mom_in_f32 = mom_in - param_in_f32 = param_in - if using_fp16: - mom_in_f32 = mom_in.astype(np.float32) - param_in_f32 = param_in.astype(np.float32) - - if count and count > 0 and counter_halflife > 0: - weight_decay *= counter_halflife / count - grad_temp = grad + weight_decay * param_in_f32 - if row_wise: - mom_out = decay * mom_in_f32 + np.mean(np.square(grad_temp)) - else: - mom_out = decay * mom_in_f32 + np.square(grad_temp) - effective_lr = lr / (np.sqrt(mom_out) + epsilon) - grad_adj = effective_lr * grad_temp - param_out = param_in_f32 + grad_adj - - if output_effective_lr_and_update: - if using_fp16: - return ( - param_out.astype(np.float16), - mom_out.astype(np.float16), - effective_lr.astype(np.float16), - grad_adj.astype(np.float16), - ) - else: - return ( - param_out.astype(np.float32), - mom_out.astype(np.float32), - effective_lr.astype(np.float32), - grad_adj.astype(np.float32), - ) - elif output_effective_lr: - if using_fp16: - return ( - param_out.astype(np.float16), - mom_out.astype(np.float16), - effective_lr.astype(np.float16), - ) - else: - return ( - param_out.astype(np.float32), - mom_out.astype(np.float32), - effective_lr.astype(np.float32), - ) - - if using_fp16: - return (param_out.astype(np.float16), mom_out.astype(np.float16)) - else: - return (param_out.astype(np.float32), mom_out.astype(np.float32)) - - -def adagrad_sparse_test_helper( - parent_test, - inputs, - lr, - epsilon, - engine, - ref_adagrad, - gc, - dc, - row_wise=False, - weight_decay=0.0, - counter_halflife=-1, -): - param, momentum, grad = inputs - if row_wise: - # For row-wise adagrad, only take the first element of each row - momentum 
= momentum.reshape(momentum.shape[0], -1)[:, 0] - momentum = np.abs(momentum) - lr = np.array([lr], dtype=np.float32) - count = None - if counter_halflife != -1: - count = np.random.rand(param.shape[0]) - - # Create an indexing array containing values that are lists of indices, - # which index into grad - if grad.size == 0: - indices = np.empty(shape=(0,), dtype=int) - else: - indices = np.random.choice( - np.arange(grad.shape[0]), - size=np.random.randint(grad.shape[0]), - replace=False, - ) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "RowWiseSparseAdagrad" if row_wise else "SparseAdagrad", - ["param", "momentum", "indices", "grad", "lr"] if count is None else ["param", "momentum", "indices", "grad", "lr", "count"], - ["param", "momentum"], - epsilon=epsilon, - weight_decay=weight_decay, - counter_halflife=counter_halflife, - engine=engine, - device_option=gc, - ) - - def ref_sparse(param, momentum, indices, grad, lr, count=None, ref_using_fp16=False): - param_out = np.copy(param) - momentum_out = np.copy(momentum) - # Need to do this because it's possible ref_adagrad's using_fp16 could - # have been already specialized. - ref_adagrad_temp = ( - partial(ref_adagrad, using_fp16=ref_using_fp16) - if ref_using_fp16 - else ref_adagrad - ) - for i, index in enumerate(indices): - param_out[index], momentum_out[index] = ref_adagrad_temp( - param[index], - momentum[index], - grad[i], - lr, - epsilon, - weight_decay=weight_decay, - counter_halflife=counter_halflife, - count=None if count is None else count[index], - ) - return (param_out, momentum_out) - - ref_using_fp16_values = [False] - if gc == hu.gpu_do and not row_wise: - ref_using_fp16_values.append(True) - - for ref_using_fp16 in ref_using_fp16_values: - if ref_using_fp16: - print("test_sparse_adagrad with half precision embedding") - momentum_i = momentum.astype(np.float16) - param_i = param.astype(np.float16) - else: - print("test_sparse_adagrad with full precision embedding") - momentum_i = momentum.astype(np.float32) - param_i = param.astype(np.float32) - - parent_test.assertReferenceChecks( - gc, - op, - [param_i, momentum_i, indices, grad, lr, count, ref_using_fp16], - ref_sparse - ) diff --git a/caffe2/python/operator_test/adam_test.py b/caffe2/python/operator_test/adam_test.py deleted file mode 100644 index ee522b155dc8..000000000000 --- a/caffe2/python/operator_test/adam_test.py +++ /dev/null @@ -1,538 +0,0 @@ - - - - - -import functools - -import hypothesis -from hypothesis import given -import hypothesis.strategies as st -import numpy as np -import unittest - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - - -class TestAdam(hu.HypothesisTestCase): - - @staticmethod - def ref_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon, output_grad=False): - t = ITER + 1 - corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ - (1 - np.power(beta1, t)) - mom1_out = (beta1 * mom1) + (1 - beta1) * grad - mom2_out = (beta2 * mom2) + (1 - beta2) * np.square(grad) - grad_out = corrected_local_rate * mom1_out / \ - (np.sqrt(mom2_out) + epsilon) - param_out = param + LR * grad_out - if output_grad: - return param_out, mom1_out, mom2_out, grad_out - else: - return param_out, mom1_out, mom2_out - - @staticmethod - def ref_smart_decay_adam(param, mom1, mom2, last_seen, grad, LR, ITER, - beta1, beta2, epsilon): - - for name in ('param', 'mom1', 'mom2', 'last_seen', 'grad', - 'LR', 'ITER', 'beta1', 'beta2', 'epsilon'): - print("{} {} {}".format(name, 
locals()[name], type(locals()[name]))) - - - t = ITER + 1 - k = t - last_seen - k = k.flatten()[0] - - last_seen_out = t * np.ones_like(last_seen) - - # Make up for lost minibatches. - mom2_out = (beta2**k * mom2) + (1 - beta2) * np.square(grad) - param_out = param - mom1_out = mom1 - - # For catchup: apply beta1 decay once per missed step; the gradient only enters on the final step. - assert k >= 1 - for i in range(k): - mom1_out *= beta1 - if i == k - 1: - mom1_out += grad * (1 - beta1) - param_out += LR * mom1_out / (np.sqrt(mom2_out) + epsilon) - grad_out = mom1_out / (np.sqrt(mom2_out) + epsilon) - - return param_out, mom1_out, mom2_out, last_seen_out - - @staticmethod - def ref_row_wise_adam(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon, output_grad=False): - t = ITER + 1 - corrected_local_rate = np.sqrt(1 - np.power(beta2, t)) / \ - (1 - np.power(beta1, t)) - mom1_out = (beta1 * mom1) + (1 - beta1) * grad - mom2_out = (beta2 * mom2) + (1 - beta2) * np.mean(np.square(grad)) - grad_out = corrected_local_rate * mom1_out / (np.sqrt(mom2_out) + epsilon) - param_out = param + LR * grad_out - if output_grad: - return param_out, mom1_out, mom2_out, grad_out - else: - return param_out, mom1_out, mom2_out - - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs) - def test_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): - param, mom1, mom2, grad = inputs - mom2 = np.abs(mom2) - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - op = core.CreateOperator( - "Adam", - ["param", "mom1", "mom2", "grad", "lr", "iter"], - ["output_param", "output_mom1", "output_mom2"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, grad, LR, ITER], - functools.partial( - self.ref_adam, - beta1=beta1, beta2=beta2, epsilon=epsilon), - input_device_options=input_device_options) - - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - def test_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, gc, dc): - param, mom1, mom2, grad = inputs - mom2 = np.abs(mom2) - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - op = core.CreateOperator( - "Adam", - ["param", "mom1", "mom2", "grad", "lr", "iter"], - ["output_param", "output_mom1", "output_mom2", "output_grad"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, grad, LR, ITER], - functools.partial( - self.ref_adam, - beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), - input_device_options=input_device_options) - - @given(inputs=hu.tensors(n=4), - 
ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): - param, mom1, mom2, grad = inputs - mom2 = np.absolute(mom2) - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Verify that the generated indices are unique - hypothesis.assume( - np.array_equal( - np.unique(indices.flatten()), - np.sort(indices.flatten()))) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "SparseAdam", - ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_sparse(param, mom1, mom2, indices, grad, LR, ITER): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - - for i, index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index] = \ - self.ref_adam(param[index], mom1[index], mom2[index], - grad[i], LR, ITER, - beta1, beta2, epsilon) - return (param_out, mom1_out, mom2_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - ref_sparse, - input_device_options=input_device_options) - - @unittest.skipIf(not workspace.has_cuda_support, "no cuda support") - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10), - LR=st.floats(min_value=0.000001, max_value=0.1, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.0, max_value=0.99999, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.9, max_value=0.999999, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.00001, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_smart_decay_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): - param, mom1, mom2, grad = inputs - mom2 = np.absolute(mom2) - _iter, _lr = ITER, LR # Keep the scalar types for reference - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Here we will define the last_seen tensor as being randomly from 0 to ITER - # (the value of t to be tested will be ITER+1) - last_seen = data_strategy.draw( - hypothesis.extra.numpy.arrays( - dtype=np.int64, - shape=(param.shape[0],), - elements=st.integers(min_value=0, max_value=_iter), - unique=False, - ) - ) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Verify that the generated indices are unique - hypothesis.assume( - np.array_equal( - np.unique(indices.flatten()), - np.sort(indices.flatten()))) - - # Sparsify 
grad - grad = grad[indices] - - op = core.CreateOperator( - "SmartDecaySparseAdam", - ["param", "mom1", "mom2", "last_seen", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2", "last_seen"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_sparse(param, mom1, mom2, last_seen, indices, grad, LR, ITER): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - last_seen_out = np.copy(last_seen) - - for i, index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index], last_seen_out[index] = \ - self.ref_smart_decay_adam(param[index], mom1[index], mom2[index], last_seen[index], - grad[i], LR, ITER, - beta1, beta2, epsilon) - return (param_out, mom1_out, mom2_out, last_seen_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, last_seen, indices, grad, LR, ITER], - ref_sparse, - input_device_options=input_device_options) - - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): - param, mom1, mom2, grad = inputs - mom2 = np.absolute(mom2) - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Verify that the generated indices are unique - hypothesis.assume( - np.array_equal( - np.unique(indices.flatten()), - np.sort(indices.flatten()))) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "SparseAdam", - ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2", "output_grad"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - grad_out = np.copy(grad) - - for i, index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ - self.ref_adam(param[index], mom1[index], mom2[index], - grad[i], LR, ITER, - beta1, beta2, epsilon, output_grad) - return (param_out, mom1_out, mom2_out, grad_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - functools.partial( - ref_sparse_output_grad, - beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), - input_device_options=input_device_options) - - @given(inputs=hu.tensors(n=3), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - 
epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_row_wise_sparse_adam(self, inputs, ITER, LR, beta1, beta2, epsilon, - data_strategy, gc, dc): - param, mom1, grad = inputs - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Create a 1D row-wise average 2nd moment tensor. - mom2 = data_strategy.draw( - hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], - elements=hu.elements_of_type(dtype=np.float32)) - ) - mom2 = np.absolute(mom2) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment - # tensor that is strictly 1-dimensional and equal in length to the - # first dimension of the parameters, so indices must also be - # 1-dimensional. - indices = indices.flatten() - - hypothesis.note('indices.shape: %s' % str(indices.shape)) - - # Verify that the generated indices are unique - hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "RowWiseSparseAdam", - ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_row_wise_sparse(param, mom1, mom2, indices, grad, LR, ITER): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - for i, index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index] = \ - self.ref_row_wise_adam(param[index], mom1[index], mom2[index], - grad[i], LR, ITER, - beta1, beta2, epsilon) - return (param_out, mom1_out, mom2_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertDeviceChecks( - dc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - [0, 1, 2], - input_device_options=input_device_options) - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - ref_row_wise_sparse, - input_device_options=input_device_options) - - @given(inputs=hu.tensors(n=3), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - data_strategy=st.data(), - **hu.gcs) - def test_row_wise_sparse_adam_output_grad(self, inputs, ITER, LR, beta1, beta2, - epsilon, data_strategy, gc, dc): - param, mom1, grad = inputs - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - # Create a 1D row-wise average 2nd moment tensor. 
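These row-wise tests rely on RowWiseSparseAdam keeping a single second-moment scalar per parameter row, the running mean of the squared gradient, instead of one accumulator per element; the first moment and the bias-corrected step are otherwise the same as dense Adam. A minimal NumPy sketch of one row update, mirroring ref_row_wise_adam above (the standalone function form is illustrative):

import numpy as np

def row_wise_adam_step(param_row, mom1_row, mom2_scalar, grad_row,
                       lr, t, beta1, beta2, epsilon):
    # t = iteration + 1; the correction undoes the startup bias of both moments.
    correction = np.sqrt(1 - beta2**t) / (1 - beta1**t)
    mom1_out = beta1 * mom1_row + (1 - beta1) * grad_row
    mom2_out = beta2 * mom2_scalar + (1 - beta2) * np.mean(np.square(grad_row))
    param_out = param_row + lr * correction * mom1_out / (np.sqrt(mom2_out) + epsilon)
    return param_out, mom1_out, mom2_out

Keeping one float per row rather than per element is the point of the row-wise variant: for an embedding table of shape (rows, dim) it shrinks the second-moment state by a factor of dim.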
- mom2 = data_strategy.draw( - hu.tensor1d(min_len=param.shape[0], max_len=param.shape[0], - elements=hu.elements_of_type(dtype=np.float32)) - ) - mom2 = np.absolute(mom2) - - # Create an indexing array containing values which index into grad - indices = data_strategy.draw( - hu.tensor( - max_dim=1, - min_value=1, - max_value=grad.shape[0], - dtype=np.int64, - elements=st.sampled_from(np.arange(grad.shape[0])), - ), - ) - - # Note that unlike SparseAdam, RowWiseSparseAdam uses a moment - # tensor that is strictly 1-dimensional and equal in length to the - # first dimension of the parameters, so indices must also be - # 1-dimensional. - indices = indices.flatten() - - hypothesis.note('indices.shape: %s' % str(indices.shape)) - - # Verify that the generated indices are unique - hypothesis.assume(np.array_equal(np.unique(indices), np.sort(indices))) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "RowWiseSparseAdam", - ["param", "mom1", "mom2", "indices", "grad", "lr", "iter"], - ["param", "mom1", "mom2", "output_grad"], - beta1=beta1, beta2=beta2, epsilon=epsilon) - - def ref_row_wise_sparse_output_grad(param, mom1, mom2, indices, grad, LR, ITER, - beta1, beta2, epsilon, output_grad): - param_out = np.copy(param) - mom1_out = np.copy(mom1) - mom2_out = np.copy(mom2) - grad_out = np.copy(grad) - - for i, index in enumerate(indices): - param_out[index], mom1_out[index], mom2_out[index], grad_out[i] = \ - self.ref_row_wise_adam(param[index], mom1[index], mom2[index], - grad[i], LR, ITER, - beta1, beta2, epsilon, output_grad) - return (param_out, mom1_out, mom2_out, grad_out) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertDeviceChecks( - dc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - [0, 1, 2, 3], - input_device_options=input_device_options) - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, indices, grad, LR, ITER], - functools.partial( - ref_row_wise_sparse_output_grad, - beta1=beta1, beta2=beta2, epsilon=epsilon, output_grad=True), - input_device_options=input_device_options) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/affine_channel_op_test.py b/caffe2/python/operator_test/affine_channel_op_test.py deleted file mode 100644 index 76b09fdd5cd6..000000000000 --- a/caffe2/python/operator_test/affine_channel_op_test.py +++ /dev/null @@ -1,109 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -class TestAffineChannelOp(serial.SerializedTestCase): - def affine_channel_nchw_ref(self, X, scale, bias): - dims = X.shape - N = dims[0] - C = dims[1] - X = X.reshape(N, C, -1) - scale = scale.reshape(C, 1) - bias = bias.reshape(C, 1) - Y = X * scale + bias - return [Y.reshape(dims)] - - def affine_channel_nhwc_ref(self, X, scale, bias): - dims = X.shape - N = dims[0] - C = dims[-1] - X = X.reshape(N, -1, C) - Y = X * scale + bias - return [Y.reshape(dims)] - - @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), - H=st.integers(1, 5), W=st.integers(1, 5), - order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), - in_place=st.booleans(), **hu.gcs) - def test_affine_channel_2d( - self, N, C, H, W, order, is_learnable, in_place, gc, dc): - op = core.CreateOperator( - "AffineChannel", - ["X", "scale", "bias"], - ["X"] if 
in_place and not is_learnable else ["Y"], - order=order, - is_learnable=is_learnable, - ) - - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) - else: - X = np.random.randn(N, H, W, C).astype(np.float32) - scale = np.random.randn(C).astype(np.float32) - bias = np.random.randn(C).astype(np.float32) - inputs = [X, scale, bias] - - def ref_op(X, scale, bias): - if order == "NCHW": - return self.affine_channel_nchw_ref(X, scale, bias) - else: - return self.affine_channel_nhwc_ref(X, scale, bias) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - num_grad = len(inputs) if is_learnable else 1 - for i in range(num_grad): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), - H=st.integers(1, 3), W=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), is_learnable=st.booleans(), - in_place=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_affine_channel_3d( - self, N, C, T, H, W, order, is_learnable, in_place, gc, dc): - op = core.CreateOperator( - "AffineChannel", - ["X", "scale", "bias"], - ["X"] if in_place and not is_learnable else ["Y"], - order=order, - is_learnable=is_learnable, - ) - - if order == "NCHW": - X = np.random.randn(N, C, T, H, W).astype(np.float32) - else: - X = np.random.randn(N, T, H, W, C).astype(np.float32) - scale = np.random.randn(C).astype(np.float32) - bias = np.random.randn(C).astype(np.float32) - inputs = [X, scale, bias] - - def ref_op(X, scale, bias): - if order == "NCHW": - return self.affine_channel_nchw_ref(X, scale, bias) - else: - return self.affine_channel_nhwc_ref(X, scale, bias) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - num_grad = len(inputs) if is_learnable else 1 - for i in range(num_grad): - self.assertGradientChecks(gc, op, inputs, i, [0]) diff --git a/caffe2/python/operator_test/alias_with_name_test.py b/caffe2/python/operator_test/alias_with_name_test.py deleted file mode 100644 index 6d62cb691e4e..000000000000 --- a/caffe2/python/operator_test/alias_with_name_test.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, utils -from hypothesis import given - - -class TestAliasWithNameOp(hu.HypothesisTestCase): - @given( - shape=st.lists(st.integers(0, 5), min_size=1, max_size=3), - dtype=st.sampled_from([np.float32, np.int64]), - **hu.gcs - ) - def test_alias_with_name_op(self, shape, dtype, dc, gc): - test_input = (100 * np.random.random(shape)).astype(dtype) - test_inputs = [test_input] - - alias_op = core.CreateOperator( - "AliasWithName", - ["input"], - ["output"], - device_option=gc, - ) - alias_op.arg.add().CopyFrom(utils.MakeArgument("name", "whatever_name")) - - def reference_func(x): - return (x,) - - self.assertReferenceChecks(gc, alias_op, test_inputs, reference_func) diff --git a/caffe2/python/operator_test/apmeter_test.py b/caffe2/python/operator_test/apmeter_test.py deleted file mode 100644 index 2c95834d9251..000000000000 --- a/caffe2/python/operator_test/apmeter_test.py +++ /dev/null @@ -1,84 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - 
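calculate_ap below implements standard average precision: sort predictions by descending score, take the running precision at each position, and average it over the true positives. A small worked instance of the same arithmetic (values chosen for illustration):

import numpy as np

scores = np.array([0.9, 0.8, 0.7, 0.6], dtype=np.float32)
labels = np.array([1, 0, 1, 1], dtype=np.int32)
order = np.argsort(-scores, kind='mergesort')    # stable sort, like the reference
truth = labels[order]                            # [1, 0, 1, 1]
precision = np.cumsum(truth) / (np.arange(len(truth)) + 1.0)
ap = precision[truth.astype(bool)].sum() / max(1, truth.sum())
# precision at the hits is 1/1, 2/3, 3/4, so ap == (1 + 2/3 + 3/4) / 3 ≈ 0.806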
- -def calculate_ap(predictions, labels): - N, D = predictions.shape - ap = np.zeros(D) - num_range = np.arange((N), dtype=np.float32) + 1 - for k in range(D): - scores = predictions[:N, k] - label = labels[:N, k] - sortind = np.argsort(-scores, kind='mergesort') - truth = label[sortind] - precision = np.cumsum(truth) / num_range - ap[k] = precision[truth.astype(bool)].sum() / max(1, truth.sum()) - return ap - - -class TestAPMeterOps(hu.HypothesisTestCase): - @given(predictions=hu.arrays(dims=[10, 3], - elements=hu.floats(allow_nan=False, - allow_infinity=False, - min_value=0.1, - max_value=1)), - labels=hu.arrays(dims=[10, 3], - dtype=np.int32, - elements=st.integers(min_value=0, - max_value=1)), - **hu.gcs_cpu_only) - def test_average_precision(self, predictions, labels, gc, dc): - op = core.CreateOperator( - "APMeter", - ["predictions", "labels"], - ["AP"], - buffer_size=10, - ) - - def op_ref(predictions, labels): - ap = calculate_ap(predictions, labels) - return (ap, ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[predictions, labels], - reference=op_ref) - - @given(predictions=hu.arrays(dims=[10, 3], - elements=hu.floats(allow_nan=False, - allow_infinity=False, - min_value=0.1, - max_value=1)), - labels=hu.arrays(dims=[10, 3], - dtype=np.int32, - elements=st.integers(min_value=0, - max_value=1)), - **hu.gcs_cpu_only) - def test_average_precision_small_buffer(self, predictions, labels, gc, dc): - op_small_buffer = core.CreateOperator( - "APMeter", - ["predictions", "labels"], - ["AP"], - buffer_size=5, - ) - - def op_ref(predictions, labels): - # We can only hold the last 5 in the buffer - ap = calculate_ap(predictions[5:], labels[5:]) - return (ap, ) - - self.assertReferenceChecks( - device_option=gc, - op=op_small_buffer, - inputs=[predictions, labels], - reference=op_ref - ) diff --git a/caffe2/python/operator_test/arg_ops_test.py b/caffe2/python/operator_test/arg_ops_test.py deleted file mode 100644 index 330d17ed6999..000000000000 --- a/caffe2/python/operator_test/arg_ops_test.py +++ /dev/null @@ -1,63 +0,0 @@ - - - - - -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -class TestArgOps(serial.SerializedTestCase): - @given( - X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_argmax(self, X, axis, keepdims, gc, dc): - if axis >= len(X.shape): - axis %= len(X.shape) - op = core.CreateOperator( - "ArgMax", ["X"], ["Indices"], axis=axis, keepdims=keepdims, - device_option=gc) - - def argmax_ref(X): - indices = np.argmax(X, axis=axis) - if keepdims: - out_dims = list(X.shape) - out_dims[axis] = 1 - indices = indices.reshape(tuple(out_dims)) - return [indices] - - self.assertReferenceChecks(gc, op, [X], argmax_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given( - X=hu.tensor(dtype=np.float32), axis=st.integers(-1, 5), - keepdims=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_argmin(self, X, axis, keepdims, gc, dc): - if axis >= len(X.shape): - axis %= len(X.shape) - op = core.CreateOperator( - "ArgMin", ["X"], ["Indices"], axis=axis, keepdims=keepdims, - device_option=gc) - - def argmin_ref(X): - indices = np.argmin(X, axis=axis) - if keepdims: - out_dims = list(X.shape) - out_dims[axis] = 1 - indices = indices.reshape(tuple(out_dims)) - return [indices] - - 
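The keepdims handling in the two references above is just a reshape that reinstates the reduced axis with size one, which is what lets the resulting indices broadcast against (and gather from) the input. A quick shape check (array contents are arbitrary):

import numpy as np

X = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
flat = np.argmax(X, axis=1)          # shape (2, 4): axis 1 dropped
kept = flat.reshape(2, 1, 4)         # keepdims: axis 1 restored with size 1
assert flat.shape == (2, 4) and kept.shape == (2, 1, 4)
assert np.array_equal(np.take_along_axis(X, kept, axis=1).squeeze(1), X.max(axis=1))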
self.assertReferenceChecks(gc, op, [X], argmin_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/assert_test.py b/caffe2/python/operator_test/assert_test.py deleted file mode 100644 index eef33bc22bc0..000000000000 --- a/caffe2/python/operator_test/assert_test.py +++ /dev/null @@ -1,29 +0,0 @@ - - - - -import numpy as np -from hypothesis import given, settings -import hypothesis.strategies as st -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - - -class TestAssert(hu.HypothesisTestCase): - @given( - dtype=st.sampled_from(['bool_', 'int32', 'int64']), - shape=st.lists(elements=st.integers(1, 10), min_size=1, max_size=4), - **hu.gcs) - @settings(deadline=10000) - def test_assert(self, dtype, shape, gc, dc): - test_tensor = np.random.rand(*shape).astype(np.dtype(dtype)) - - op = core.CreateOperator('Assert', ['X'], []) - - def assert_ref(X): - return [] - - try: - self.assertReferenceChecks(gc, op, [test_tensor], assert_ref) - except Exception: - assert(not np.all(test_tensor)) diff --git a/caffe2/python/operator_test/async_net_barrier_test.py b/caffe2/python/operator_test/async_net_barrier_test.py deleted file mode 100644 index c12cd9a2fe53..000000000000 --- a/caffe2/python/operator_test/async_net_barrier_test.py +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env python3 - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core -from hypothesis import given - - -class TestAsyncNetBarrierOp(hu.HypothesisTestCase): - @given( - n=st.integers(1, 5), - shape=st.lists(st.integers(0, 5), min_size=1, max_size=3), - **hu.gcs - ) - def test_async_net_barrier_op(self, n, shape, dc, gc): - test_inputs = [(100 * np.random.random(shape)).astype(np.float32) for _ in range(n)] - test_input_blobs = ["x_{}".format(i) for i in range(n)] - - barrier_op = core.CreateOperator( - "AsyncNetBarrier", - test_input_blobs, - test_input_blobs, - device_option=gc, - ) - - def reference_func(*args): - self.assertEqual(len(args), n) - return args - - self.assertReferenceChecks(gc, barrier_op, test_inputs, reference_func) diff --git a/caffe2/python/operator_test/atomic_ops_test.py b/caffe2/python/operator_test/atomic_ops_test.py deleted file mode 100644 index 7f568f523bbf..000000000000 --- a/caffe2/python/operator_test/atomic_ops_test.py +++ /dev/null @@ -1,91 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - -import unittest - - -class TestAtomicOps(TestCase): - @unittest.skip("Test is flaky: https://github.com/pytorch/pytorch/issues/28179") - def test_atomic_ops(self): - """ - Test that both countdown and checksum are update atomically by having - cowntdown count from 20k to 0 from parallel the workers and updating - the checksum to the value fetched. If operations are trully atomic, - each value from 1 to 20k should be fetched exactly once from the - countdown, and fed exactly once to the checksum, such that at the end - checksum must contain the exact value of sum[i=0..20000](i). 
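The invariant both atomic tests check is that a mutex-guarded fetch-add makes every intermediate counter value observable exactly once: 20000 decrements from 20000 yield the pre-update values 20000 down to 1, whose sum is 20000 * 20001 / 2 = 200010000. A plain-Python sketch of the same protocol, with a single threading.Lock standing in for the two Caffe2 mutexes (that AtomicFetchAdd returns the pre-update value is inferred here from the checksum identity the test asserts):

import threading

lock = threading.Lock()
countdown, checksum = [20000], [0]

def fetch_add(cell, delta):
    # Classic fetch-and-add: mutate under the lock, return the prior value.
    with lock:
        old = cell[0]
        cell[0] += delta
        return old

def worker(iters=200):
    for _ in range(iters):
        fetched = fetch_add(countdown, -1)
        fetch_add(checksum, fetched)

threads = [threading.Thread(target=worker) for _ in range(100)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert checksum[0] == 20000 * 20001 // 2   # 200010000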
- """ - init_net = core.Net('init') - mutex_countdown = init_net.CreateMutex([]) - mutex_checksum = init_net.CreateMutex([]) - countdown = init_net.ConstantFill([], shape=[], value=20000, - dtype=core.DataType.INT32) - checksum = init_net.ConstantFill( - [], shape=[], value=0, dtype=core.DataType.INT32) - minus_one = init_net.ConstantFill( - [], shape=[], value=-1, dtype=core.DataType.INT32) - steps = [] - for i in range(0, 100): - net = core.Net('net:%d' % i) - _, fetched_count = net.AtomicFetchAdd( - [mutex_countdown, countdown, minus_one], - [countdown, 'fetched_count:%d' % i]) - net.AtomicFetchAdd( - [mutex_checksum, checksum, fetched_count], - [checksum, 'not_used']) - steps.append( - core.execution_step('worker:%d' % i, net, num_iter=200)) - super_step = core.execution_step( - 'parent', steps, concurrent_substeps=True) - plan = core.Plan('plan') - plan.AddStep(core.execution_step('init', init_net)) - plan.AddStep(super_step) - workspace.RunPlan(plan) - # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000 - self.assertEqual(workspace.FetchBlob(checksum), 200010000) - - @unittest.skip("Test is flaky: https://github.com/pytorch/pytorch/issues/28179") - def test_atomic64_ops(self): - """ - Test that both countdown and checksum are update atomically by having - cowntdown count from 20k to 0 from parallel the workers and updating - the checksum to the value fetched. If operations are trully atomic, - each value from 1 to 20k should be fetched exactly once from the - countdown, and fed exactly once to the checksum, such that at the end - checksum must contain the exact value of sum[i=0..20000](i). - """ - init_net = core.Net('init') - mutex_countdown = init_net.CreateMutex([]) - mutex_checksum = init_net.CreateMutex([]) - countdown = init_net.ConstantFill([], shape=[], value=20000, - dtype=core.DataType.INT64) - checksum = init_net.ConstantFill( - [], shape=[], value=0, dtype=core.DataType.INT64) - minus_one = init_net.ConstantFill( - [], shape=[], value=-1, dtype=core.DataType.INT64) - steps = [] - for i in range(0, 100): - net = core.Net('net:%d' % i) - _, fetched_count = net.AtomicFetchAdd64( - [mutex_countdown, countdown, minus_one], - [countdown, 'fetched_count:%d' % i]) - net.AtomicFetchAdd64( - [mutex_checksum, checksum, fetched_count], - [checksum, 'not_used']) - steps.append( - core.execution_step('worker:%d' % i, net, num_iter=200)) - super_step = core.execution_step( - 'parent', steps, concurrent_substeps=True) - plan = core.Plan('plan') - plan.AddStep(core.execution_step('init', init_net)) - plan.AddStep(super_step) - workspace.RunPlan(plan) - # checksum = sum[i=1..20000](i) = 20000 * 20001 / 2 = 200010000 - self.assertEqual(workspace.FetchBlob(checksum), 200010000) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/basic_rnn_test.py b/caffe2/python/operator_test/basic_rnn_test.py deleted file mode 100644 index e863289d488c..000000000000 --- a/caffe2/python/operator_test/basic_rnn_test.py +++ /dev/null @@ -1,137 +0,0 @@ - - - - - -from caffe2.python import workspace, core, rnn_cell -from caffe2.python.model_helper import ModelHelper -from caffe2.python.rnn.rnn_cell_test_util import tanh -import caffe2.python.hypothesis_test_util as hu - -from hypothesis import given -from hypothesis import settings as ht_settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -def basic_rnn_reference(input, hidden_initial, - i2h_w, i2h_b, - gate_w, gate_b, - seq_lengths, - drop_states, - use_sequence_lengths): - D = 
hidden_initial.shape[-1] - T = input.shape[0] - N = input.shape[1] - - if seq_lengths is not None: - seq_lengths = (np.ones(shape=(N, D)) * - seq_lengths.reshape(N, 1)).astype(np.int32) - - ret = [] - - hidden_prev = hidden_initial - - for t in range(T): - input_fc = np.dot(input[t], i2h_w.T) + i2h_b - recur_fc = np.dot(hidden_prev, gate_w.T) + gate_b - hidden_t = tanh(input_fc + recur_fc) - - if seq_lengths is not None: - valid = (t < seq_lengths).astype(np.int32) - assert valid.shape == (N, D), (valid.shape, (N, D)) - hidden_t = hidden_t * valid + \ - hidden_prev * (1 - valid) * (1 - drop_states) - - ret.append(hidden_t) - hidden_prev = hidden_t - return ret - - -class BasicRNNCellTest(hu.HypothesisTestCase): - @given( - seed=st.integers(0, 2**32 - 1), - seq_length=st.integers(min_value=1, max_value=5), - batch_size=st.integers(min_value=1, max_value=5), - input_size=st.integers(min_value=1, max_value=5), - hidden_size=st.integers(min_value=1, max_value=5), - drop_states=st.booleans(), - sequence_lengths=st.booleans(), - **hu.gcs - ) - @ht_settings(max_examples=15) - def test_basic_rnn(self, seed, seq_length, batch_size, input_size, hidden_size, - drop_states, sequence_lengths, gc, dc): - np.random.seed(seed) - - seq_lengths_data = np.random.randint( - 1, seq_length + 1, size=(batch_size,)).astype(np.int32) - input_blob_data = np.random.randn( - seq_length, batch_size, input_size).astype(np.float32) - initial_h_data = np.random.randn( - batch_size, hidden_size).astype(np.float32) - gates_t_w_data = np.random.randn( - hidden_size, hidden_size).astype(np.float32) - gates_t_b_data = np.random.randn( - hidden_size).astype(np.float32) - i2h_w_data = np.random.randn( - hidden_size, input_size).astype(np.float32) - i2h_b_data = np.random.randn( - hidden_size).astype(np.float32) - - with core.DeviceScope(gc): - with hu.temp_workspace(): - workspace.FeedBlob( - 'input_blob', input_blob_data, device_option=gc) - workspace.FeedBlob( - 'seq_lengths', seq_lengths_data, device_option=gc) - workspace.FeedBlob( - 'initial_h', initial_h_data, device_option=gc) - workspace.FeedBlob( - 'basic_rnn/gates_t_w', gates_t_w_data, device_option=gc) - workspace.FeedBlob( - 'basic_rnn/gates_t_b', gates_t_b_data, device_option=gc) - workspace.FeedBlob( - 'basic_rnn/i2h_w', i2h_w_data, device_option=gc) - workspace.FeedBlob( - 'basic_rnn/i2h_b', i2h_b_data, device_option=gc) - - model = ModelHelper(name='model') - hidden_t_all, _ = rnn_cell.BasicRNN( - model, - 'input_blob', - 'seq_lengths' if sequence_lengths else None, - ['initial_h'], - input_size, - hidden_size, - "basic_rnn", - activation='tanh', - forward_only=True, - drop_states=drop_states) - - workspace.RunNetOnce(model.net) - - result = workspace.FetchBlob(hidden_t_all) - - reference = basic_rnn_reference( - input_blob_data, - initial_h_data, - i2h_w_data, - i2h_b_data, - gates_t_w_data, - gates_t_b_data, - seq_lengths_data if sequence_lengths else None, - drop_states=drop_states, - use_sequence_lengths=sequence_lengths - ) - - np.testing.assert_allclose(result, reference, atol=1e-4, rtol=1e-4) - - -if __name__ == "__main__": - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - ]) - unittest.main() diff --git a/caffe2/python/operator_test/batch_box_cox_test.py b/caffe2/python/operator_test/batch_box_cox_test.py deleted file mode 100644 index c9306ce1ab07..000000000000 --- a/caffe2/python/operator_test/batch_box_cox_test.py +++ /dev/null @@ -1,141 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings - -import 
caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -# The reference implementation is susceptible to numerical cancellation when -# *lambda1* is small and *data* is near one. We leave it up to the caller to -# truncate lambda to zero or bound data away from one. Unfortunately, the C++ -# implementation may be using higher precision than the python version, which -# could cause this test to fail. We bound inputs away from the critical values. -# (Note that a tolerance of 1e-6 on _either_ parameter is typically sufficient -# to avoid catastrophic cancellation when the other is far from zero/one.) -TOLERANCE = 1e-3 - - -@st.composite -def _inputs(draw): - N = draw(st.integers(min_value=0, max_value=5)) - D = draw(st.integers(min_value=1, max_value=5)) - # N, D, data, lambda1, lambda2 - return ( - N, - D, - draw(st.lists( - min_size=N * D, - max_size=N * D, - elements=st.one_of( - st.floats(min_value=-10, max_value=1 - TOLERANCE), - st.floats(min_value=1 + TOLERANCE, max_value=10)) - )), - draw(st.lists( - elements=st.one_of( - st.floats(min_value=-2, max_value=-TOLERANCE), - st.floats(min_value=TOLERANCE, max_value=2)), - min_size=D, - max_size=D, - )), - draw(st.lists( - elements=st.floats(min_value=-2, max_value=2), - min_size=D, - max_size=D, - )), - ) - - -class TestBatchBoxCox(serial.SerializedTestCase): - @given( - inputs=_inputs(), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_batch_box_cox(self, inputs, gc, dc): - self.batch_box_cox(inputs, gc, dc) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lambda1_is_all_zero(self, gc, dc): - inputs = (1, 1, [[2]], [0], [0]) - self.batch_box_cox(inputs, gc, dc) - inputs = (2, 1, [[2], [4]], [0], [0]) - self.batch_box_cox(inputs, gc, dc) - inputs = (1, 3, [[1, 2, 3]], [0, 0, 0], [0, 0, 0]) - self.batch_box_cox(inputs, gc, dc) - inputs = (2, 3, [[1, 2, 3], [4, 5, 6]], [0, 0, 0], [0, 0, 0]) - self.batch_box_cox(inputs, gc, dc) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lambda1_is_partially_zero(self, gc, dc): - inputs = (1, 5, [[1, 2, 3, 4, 5]], - [0, -.5, 0, .5, 0], [0.1, 0.2, 0.3, 0.4, 0.5]) - self.batch_box_cox(inputs, gc, dc) - inputs = (3, 5, [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [1, 2, 3, 4, 5]], - [0, -.5, 0, .5, 0], [0.1, 0.2, 0.3, 0.4, 0.5]) - self.batch_box_cox(inputs, gc, dc) - inputs = (2, 6, [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], - [0, -.5, 0, .5, 0, 1], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) - self.batch_box_cox(inputs, gc, dc) - inputs = (2, 7, [[1, 2, 3, 4, 5, 6, 7], [8, 9, 10, 11, 12, 13, 14]], - [0, -.5, 0, .5, 0, 1, 0], [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]) - self.batch_box_cox(inputs, gc, dc) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_bound_base_away_from_zero(self, gc, dc): - inputs = (2, 3, [[1e-5, 1e-6, 1e-7], [1e-7, -1e-6, 1e-5]], - [0, 0, 0], [0, 0, 1e-6]) - self.batch_box_cox(inputs, gc, dc) - - def batch_box_cox(self, inputs, gc, dc): - N, D, data, lambda1, lambda2 = inputs - - data = np.array(data, dtype=np.float32).reshape(N, D) - lambda1 = np.array(lambda1, dtype=np.float32) - lambda2 = np.array(lambda2, dtype=np.float32) - - # Bound data away from one. See comment in _inputs() above. 
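The reference implementation exercised here is the two-parameter Box-Cox transform: shift the input by lambda2, floor it at a small positive value, then apply y = log(x) when lambda1 is zero and y = (x**lambda1 - 1) / lambda1 otherwise. A vectorized standalone version under the same conventions (the 1e-6 floor matches the reference; function form is illustrative):

import numpy as np

def box_cox(data, lambda1, lambda2):
    # data: (N, D); lambda1, lambda2: (D,). Broadcasts column-wise.
    x = np.maximum(data + lambda2, 1e-6)
    safe_l1 = np.where(lambda1 == 0.0, 1.0, lambda1)   # avoid division by zero
    return np.where(lambda1 == 0.0, np.log(x),
                    (np.power(x, safe_l1) - 1.0) / safe_l1)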
- base = data + lambda2 - data[(base > 1 - TOLERANCE) & (base < 1 + TOLERANCE)] += 2 * TOLERANCE - - def ref(data, lambda1, lambda2): - dim_1 = data.shape[1] - output = np.copy(data) - if data.size <= 0: - return [output] - - for i in range(dim_1): - output[:, i] = data[:, i] + lambda2[i] - output[:, i] = np.maximum(output[:, i], 1e-6) - if lambda1[i] == 0: - output[:, i] = np.log(output[:, i]) - else: - output[:, i] =\ - (np.power(output[:, i], lambda1[i]) - 1) / lambda1[i] - return [output] - - for naive in [False, True]: - op = core.CreateOperator( - 'BatchBoxCox', - ['data', 'lambda1', 'lambda2'], - ['output'], - naive=naive, - # Note examples above with D=5, 6, 7. - # A zero value falls back to the naive implementation. - min_block_size=0 if naive else 6 - ) - self.assertReferenceChecks(gc, op, [data, lambda1, lambda2], ref) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/batch_bucketize_op_test.py b/caffe2/python/operator_test/batch_bucketize_op_test.py deleted file mode 100644 index 82def0572686..000000000000 --- a/caffe2/python/operator_test/batch_bucketize_op_test.py +++ /dev/null @@ -1,91 +0,0 @@ - - - - - -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given -import hypothesis.strategies as st - - -class TestBatchBucketize(serial.SerializedTestCase): - @serial.given(**hu.gcs_cpu_only) - def test_batch_bucketize_example(self, gc, dc): - op = core.CreateOperator('BatchBucketize', - ["FEATURE", "INDICES", "BOUNDARIES", "LENGTHS"], - ["O"]) - float_feature = np.array([[1.42, 2.07, 3.19, 0.55, 4.32], - [4.57, 2.30, 0.84, 4.48, 3.09], - [0.89, 0.26, 2.41, 0.47, 1.05], - [0.03, 2.97, 2.43, 4.36, 3.11], - [2.74, 5.77, 0.90, 2.63, 0.38]], dtype=np.float32) - indices = np.array([0, 1, 4], dtype=np.int32) - lengths = np.array([2, 3, 1], dtype=np.int32) - boundaries = np.array([0.5, 1.0, 1.5, 2.5, 3.5, 2.5], dtype=np.float32) - - def ref(float_feature, indices, boundaries, lengths): - output = np.array([[2, 1, 1], - [2, 1, 1], - [1, 0, 0], - [0, 2, 1], - [2, 3, 0]], dtype=np.int32) - return (output,) - - self.assertReferenceChecks(gc, op, - [float_feature, indices, boundaries, lengths], - ref) - - @given( - x=hu.tensor( - min_dim=2, max_dim=2, dtype=np.float32, - elements=hu.floats(min_value=0, max_value=5), - min_value=5), - seed=st.integers(min_value=2, max_value=1000), - **hu.gcs_cpu_only) - def test_batch_bucketize(self, x, seed, gc, dc): - op = core.CreateOperator('BatchBucketize', - ["FEATURE", "INDICES", "BOUNDARIES", "LENGTHS"], - ['O']) - np.random.seed(seed) - d = x.shape[1] - lens = np.random.randint(low=1, high=3, size=d - 3) - indices = np.random.choice(range(d), d - 3, replace=False) - indices.sort() - boundaries = [] - for i in range(d - 3): - # add [0, 0] as duplicated boundary for duplicated bucketization - if lens[i] > 2: - cur_boundary = np.append( - np.random.randn(lens[i] - 2) * 5, [0, 0]) - else: - cur_boundary = np.random.randn(lens[i]) * 5 - cur_boundary.sort() - boundaries += cur_boundary.tolist() - - lens = np.array(lens, dtype=np.int32) - boundaries = np.array(boundaries, dtype=np.float32) - indices = np.array(indices, dtype=np.int32) - - def ref(x, indices, boundaries, lens): - output_dim = indices.shape[0] - ret = np.zeros((x.shape[0], output_dim)).astype(np.int32) - boundary_offset = 0 - for i, l in enumerate(indices): - temp_bound = 
boundaries[boundary_offset : lens[i] + boundary_offset] - for j in range(x.shape[0]): - for k, bound_val in enumerate(temp_bound): - if k == len(temp_bound) - 1 and x[j, l] > bound_val: - ret[j, i] = k + 1 - elif x[j, l] > bound_val: - continue - else: - ret[j, i] = k - break - boundary_offset += lens[i] - return (ret,) - - self.assertReferenceChecks(gc, op, [x, indices, boundaries, lens], ref) diff --git a/caffe2/python/operator_test/batch_moments_op_test.py b/caffe2/python/operator_test/batch_moments_op_test.py deleted file mode 100644 index 12dd72a4160a..000000000000 --- a/caffe2/python/operator_test/batch_moments_op_test.py +++ /dev/null @@ -1,92 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -class TestBatchMomentsOp(serial.SerializedTestCase): - def batch_moments_nchw_ref(self, X): - dims = X.shape - N = dims[0] - C = dims[1] - X = X.reshape(N, C, -1) - mu = np.mean(X, axis=(0, 2)) - var = np.mean(np.square(X), axis=(0, 2)) - return [mu, var] - - def batch_moments_nhwc_ref(self, X): - dims = X.shape - C = dims[-1] - X = X.reshape(-1, C) - mu = np.mean(X, axis=0) - var = np.mean(np.square(X), axis=0) - return [mu, var] - - @serial.given(N=st.integers(1, 5), C=st.integers(1, 5), - H=st.integers(1, 5), W=st.integers(1, 5), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - def test_batch_moments_2d(self, N, C, H, W, order, gc, dc): - op = core.CreateOperator( - "BatchMoments", - ["X"], - ["mu", "var"], - order=order, - ) - - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) - else: - X = np.random.randn(N, H, W, C).astype(np.float32) - - def ref(X): - if order == "NCHW": - return self.batch_moments_nchw_ref(X) - else: - return self.batch_moments_nhwc_ref(X) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ref, - ) - self.assertDeviceChecks(dc, op, [X], [0, 1]) - self.assertGradientChecks(gc, op, [X], 0, [0, 1]) - - @given(N=st.integers(1, 5), C=st.integers(1, 5), T=st.integers(1, 3), - H=st.integers(1, 3), W=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - @settings(deadline=10000) - def test_batch_moments_3d(self, N, C, T, H, W, order, gc, dc): - op = core.CreateOperator( - "BatchMoments", - ["X"], - ["mu", "var"], - order=order, - ) - - if order == "NCHW": - X = np.random.randn(N, C, T, H, W).astype(np.float32) - else: - X = np.random.randn(N, T, H, W, C).astype(np.float32) - - def ref(X): - if order == "NCHW": - return self.batch_moments_nchw_ref(X) - else: - return self.batch_moments_nhwc_ref(X) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ref, - ) - self.assertDeviceChecks(dc, op, [X], [0, 1]) - self.assertGradientChecks(gc, op, [X], 0, [0, 1]) diff --git a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py b/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py deleted file mode 100644 index 96132e8f5e93..000000000000 --- a/caffe2/python/operator_test/batch_sparse_to_dense_op_test.py +++ /dev/null @@ -1,114 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -class 
TestBatchSparseToDense(serial.SerializedTestCase): - - @given( - batch_size=st.integers(5, 10), - dense_last_dim=st.integers(5, 10), - default_value=st.floats(min_value=2.0, max_value=3.0), - **hu.gcs - ) - @settings(deadline=None) - def test_batch_sparse_to_dense( - self, batch_size, dense_last_dim, default_value, gc, dc - ): - L = np.random.randint(1, dense_last_dim + 1, size=(batch_size)) - num_data = L.sum() - # The following logic ensure that indices in each batch will not be duplicated - I = np.array([]).astype(np.int32) - for l in L: - I_l = np.random.choice(dense_last_dim, l, replace=False) - I = np.concatenate((I, I_l)) - V = np.random.rand(num_data).astype(np.float32) - - op = core.CreateOperator( - 'BatchSparseToDense', - ['L', 'I', 'V'], - ['O'], - dense_last_dim=dense_last_dim, - default_value=default_value, - ) - - S = np.random.rand(batch_size, dense_last_dim).astype(np.float32) - op2 = core.CreateOperator( - 'BatchSparseToDense', - ['L', 'I', 'V', 'S'], - ['O'], - default_value=default_value, - ) - - def batch_sparse_to_dense_ref(L, I, V, S=None): - if S is None: - ret = np.zeros((batch_size, dense_last_dim)) - else: - ret = np.zeros(S.shape) - ret.fill(default_value) - batch = 0 - v_idx = 0 - for length in L: - for _ in range(length): - ret[batch][I[v_idx]] = V[v_idx] - v_idx += 1 - batch += 1 - return [ret] - - self.assertDeviceChecks(dc, op, [L, I, V], [0]) - self.assertReferenceChecks(gc, op, [L, I, V], batch_sparse_to_dense_ref) - self.assertGradientChecks(gc, op, [L, I, V], 2, [0]) - self.assertDeviceChecks(dc, op2, [L, I, V, S], [0]) - self.assertReferenceChecks(gc, op2, [L, I, V, S], batch_sparse_to_dense_ref) - self.assertGradientChecks(gc, op2, [L, I, V, S], 2, [0]) - self.assertDeviceChecks(dc, op, [L.astype(np.int32), I, V], [0]) - self.assertReferenceChecks(gc, op, [L.astype(np.int32), I, V], batch_sparse_to_dense_ref) - self.assertGradientChecks(gc, op, [L.astype(np.int32), I, V], 2, [0]) - - @given( - batch_size=st.integers(5, 10), - dense_last_dim=st.integers(5, 10), - **hu.gcs - ) - @settings(deadline=None) - def test_batch_dense_to_sparse(self, batch_size, dense_last_dim, gc, dc): - L = np.random.randint(1, dense_last_dim + 1, size=(batch_size)) - # The following logic ensure that indices in each batch will not be duplicated - I = np.array([]).astype(np.int32) - for l in L: - I_l = np.random.choice(dense_last_dim, l, replace=False) - I = np.concatenate((I, I_l)) - D = np.random.rand(batch_size, dense_last_dim).astype(np.float32) - - op = core.CreateOperator( - 'BatchDenseToSparse', - ['L', 'I', 'D'], - ['V'], - ) - - def batch_dense_to_sparse_ref(L, I, D): - ret = np.zeros(I.shape) - batch = 0 - i_idx = 0 - for length in L: - for _ in range(length): - ret[i_idx] = D[batch][I[i_idx]] - i_idx += 1 - batch += 1 - return [ret] - print(L, I, D) - - self.assertDeviceChecks(dc, op, [L, I, D], [0]) - self.assertReferenceChecks(gc, op, [L, I, D], batch_dense_to_sparse_ref) - self.assertGradientChecks(gc, op, [L, I, D], 2, [0]) - self.assertDeviceChecks(dc, op, [L.astype(np.int32), I, D], [0]) - self.assertReferenceChecks(gc, op, [L.astype(np.int32), I, D], batch_dense_to_sparse_ref) - self.assertGradientChecks(gc, op, [L.astype(np.int32), I, D], 2, [0]) diff --git a/caffe2/python/operator_test/bbox_transform_test.py b/caffe2/python/operator_test/bbox_transform_test.py deleted file mode 100644 index adcc2f8723d2..000000000000 --- a/caffe2/python/operator_test/bbox_transform_test.py +++ /dev/null @@ -1,358 +0,0 @@ - - - - - -from caffe2.python import core -from 
hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -# Reference implementation from detectron/lib/utils/boxes.py -def bbox_transform(boxes, deltas, weights=(1.0, 1.0, 1.0, 1.0)): - """Forward transform that maps proposal boxes to predicted ground-truth - boxes using bounding-box regression deltas. See bbox_transform_inv for a - description of the weights argument. - """ - if boxes.shape[0] == 0: - return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) - - boxes = boxes.astype(deltas.dtype, copy=False) - - widths = boxes[:, 2] - boxes[:, 0] + 1.0 - heights = boxes[:, 3] - boxes[:, 1] + 1.0 - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - wx, wy, ww, wh = weights - dx = deltas[:, 0::4] / wx - dy = deltas[:, 1::4] / wy - dw = deltas[:, 2::4] / ww - dh = deltas[:, 3::4] / wh - - # Prevent sending too large values into np.exp() - BBOX_XFORM_CLIP = np.log(1000. / 16.) - dw = np.minimum(dw, BBOX_XFORM_CLIP) - dh = np.minimum(dh, BBOX_XFORM_CLIP) - - pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] - pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] - pred_w = np.exp(dw) * widths[:, np.newaxis] - pred_h = np.exp(dh) * heights[:, np.newaxis] - - pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) - # x1 - pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w - # y1 - pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h - # x2 (note: "- 1" is correct; don't be fooled by the asymmetry) - pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 - # y2 (note: "- 1" is correct; don't be fooled by the asymmetry) - pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 - - return pred_boxes - - -# Reference implementation from detectron/lib/utils/boxes.py -def clip_tiled_boxes(boxes, im_shape): - """Clip boxes to image boundaries. im_shape is [height, width] and boxes - has shape (N, 4 * num_tiled_boxes).""" - assert ( - boxes.shape[1] % 4 == 0 - ), "boxes.shape[1] is {:d}, but must be divisible by 4.".format( - boxes.shape[1] - ) - # x1 >= 0 - boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) - # y1 >= 0 - boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) - # x2 < im_shape[1] - boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) - # y2 < im_shape[0] - boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) - return boxes - - -def generate_rois(roi_counts, im_dims): - assert len(roi_counts) == len(im_dims) - all_rois = [] - for i, num_rois in enumerate(roi_counts): - if num_rois == 0: - continue - # [batch_idx, x1, y1, x2, y2] - rois = np.random.uniform(0, im_dims[i], size=(roi_counts[i], 5)).astype( - np.float32 - ) - rois[:, 0] = i # batch_idx - # Swap (x1, x2) if x1 > x2 - rois[:, 1], rois[:, 3] = ( - np.minimum(rois[:, 1], rois[:, 3]), - np.maximum(rois[:, 1], rois[:, 3]), - ) - # Swap (y1, y2) if y1 > y2 - rois[:, 2], rois[:, 4] = ( - np.minimum(rois[:, 2], rois[:, 4]), - np.maximum(rois[:, 2], rois[:, 4]), - ) - all_rois.append(rois) - if len(all_rois) > 0: - return np.vstack(all_rois) - return np.empty((0, 5)).astype(np.float32) - - -def bbox_transform_rotated( - boxes, - deltas, - weights=(1.0, 1.0, 1.0, 1.0), - angle_bound_on=True, - angle_bound_lo=-90, - angle_bound_hi=90, -): - """ - Similar to bbox_transform but for rotated boxes with angle info. 
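bbox_transform above decodes deltas relative to a proposal: dx and dy move the box center in units of its width and height, dw and dh scale the sides through exp() after clipping at log(1000/16) so the exponential cannot blow up, and the +1/-1 terms come from the legacy inclusive-pixel box convention. A one-box worked example with unit weights (numbers chosen for illustration):

import numpy as np

box = np.array([10.0, 10.0, 29.0, 19.0])        # x1, y1, x2, y2 -> w = 20, h = 10
dx, dy, dw, dh = 0.1, -0.2, np.log(2.0), 0.0    # shift the center, double the width
w, h = box[2] - box[0] + 1.0, box[3] - box[1] + 1.0
ctr_x, ctr_y = box[0] + 0.5 * w, box[1] + 0.5 * h          # (20, 15)
pred_w, pred_h = np.exp(dw) * w, np.exp(dh) * h            # (40, 10)
pred_ctr_x, pred_ctr_y = ctr_x + dx * w, ctr_y + dy * h    # (22, 13)
pred = np.array([pred_ctr_x - 0.5 * pred_w, pred_ctr_y - 0.5 * pred_h,
                 pred_ctr_x + 0.5 * pred_w - 1.0, pred_ctr_y + 0.5 * pred_h - 1.0])
# pred == [2., 8., 41., 17.]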
- """ - if boxes.shape[0] == 0: - return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) - - boxes = boxes.astype(deltas.dtype, copy=False) - - ctr_x = boxes[:, 0] - ctr_y = boxes[:, 1] - widths = boxes[:, 2] - heights = boxes[:, 3] - angles = boxes[:, 4] - - wx, wy, ww, wh = weights - dx = deltas[:, 0::5] / wx - dy = deltas[:, 1::5] / wy - dw = deltas[:, 2::5] / ww - dh = deltas[:, 3::5] / wh - da = deltas[:, 4::5] * 180.0 / np.pi - - # Prevent sending too large values into np.exp() - BBOX_XFORM_CLIP = np.log(1000. / 16.) - dw = np.minimum(dw, BBOX_XFORM_CLIP) - dh = np.minimum(dh, BBOX_XFORM_CLIP) - - pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) - pred_boxes[:, 0::5] = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] - pred_boxes[:, 1::5] = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] - pred_boxes[:, 2::5] = np.exp(dw) * widths[:, np.newaxis] - pred_boxes[:, 3::5] = np.exp(dh) * heights[:, np.newaxis] - - pred_angle = da + angles[:, np.newaxis] - if angle_bound_on: - period = angle_bound_hi - angle_bound_lo - assert period % 180 == 0 - pred_angle[np.where(pred_angle < angle_bound_lo)] += period - pred_angle[np.where(pred_angle > angle_bound_hi)] -= period - pred_boxes[:, 4::5] = pred_angle - - return pred_boxes - - -def clip_tiled_boxes_rotated(boxes, im_shape, angle_thresh=1.0): - """ - Similar to clip_tiled_boxes but for rotated boxes with angle info. - Only clips almost horizontal boxes within angle_thresh. The rest are - left unchanged. - """ - assert ( - boxes.shape[1] % 5 == 0 - ), "boxes.shape[1] is {:d}, but must be divisible by 5.".format( - boxes.shape[1] - ) - - (H, W) = im_shape[:2] - - # Filter boxes that are almost upright within angle_thresh tolerance - idx = np.where(np.abs(boxes[:, 4::5]) <= angle_thresh) - idx5 = idx[1] * 5 - # convert to (x1, y1, x2, y2) - x1 = boxes[idx[0], idx5] - (boxes[idx[0], idx5 + 2] - 1) / 2.0 - y1 = boxes[idx[0], idx5 + 1] - (boxes[idx[0], idx5 + 3] - 1) / 2.0 - x2 = boxes[idx[0], idx5] + (boxes[idx[0], idx5 + 2] - 1) / 2.0 - y2 = boxes[idx[0], idx5 + 1] + (boxes[idx[0], idx5 + 3] - 1) / 2.0 - # clip - x1 = np.maximum(np.minimum(x1, W - 1), 0) - y1 = np.maximum(np.minimum(y1, H - 1), 0) - x2 = np.maximum(np.minimum(x2, W - 1), 0) - y2 = np.maximum(np.minimum(y2, H - 1), 0) - # convert back to (xc, yc, w, h) - boxes[idx[0], idx5] = (x1 + x2) / 2.0 - boxes[idx[0], idx5 + 1] = (y1 + y2) / 2.0 - boxes[idx[0], idx5 + 2] = x2 - x1 + 1 - boxes[idx[0], idx5 + 3] = y2 - y1 + 1 - - return boxes - - -def generate_rois_rotated(roi_counts, im_dims): - rois = generate_rois(roi_counts, im_dims) - # [batch_id, ctr_x, ctr_y, w, h, angle] - rotated_rois = np.empty((rois.shape[0], 6)).astype(np.float32) - rotated_rois[:, 0] = rois[:, 0] # batch_id - rotated_rois[:, 1] = (rois[:, 1] + rois[:, 3]) / 2. # ctr_x = (x1 + x2) / 2 - rotated_rois[:, 2] = (rois[:, 2] + rois[:, 4]) / 2. 
# ctr_y = (y1 + y2) / 2 - rotated_rois[:, 3] = rois[:, 3] - rois[:, 1] + 1.0 # w = x2 - x1 + 1 - rotated_rois[:, 4] = rois[:, 4] - rois[:, 2] + 1.0 # h = y2 - y1 + 1 - rotated_rois[:, 5] = np.random.uniform(-90.0, 90.0) # angle in degrees - return rotated_rois - - -class TestBBoxTransformOp(serial.SerializedTestCase): - @given( - num_rois=st.integers(1, 10), - num_classes=st.integers(1, 10), - im_dim=st.integers(100, 600), - skip_batch_id=st.booleans(), - rotated=st.booleans(), - angle_bound_on=st.booleans(), - clip_angle_thresh=st.sampled_from([-1.0, 1.0]), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_bbox_transform( - self, - num_rois, - num_classes, - im_dim, - skip_batch_id, - rotated, - angle_bound_on, - clip_angle_thresh, - gc, - dc, - ): - """ - Test with all rois belonging to a single image per run. - """ - rois = ( - generate_rois_rotated([num_rois], [im_dim]) - if rotated - else generate_rois([num_rois], [im_dim]) - ) - box_dim = 5 if rotated else 4 - if skip_batch_id: - rois = rois[:, 1:] - deltas = np.random.randn(num_rois, box_dim * num_classes).astype(np.float32) - im_info = np.array([im_dim, im_dim, 1.0]).astype(np.float32).reshape(1, 3) - - def bbox_transform_ref(rois, deltas, im_info): - boxes = rois if rois.shape[1] == box_dim else rois[:, 1:] - im_shape = im_info[0, 0:2] - if rotated: - box_out = bbox_transform_rotated( - boxes, deltas, angle_bound_on=angle_bound_on - ) - box_out = clip_tiled_boxes_rotated( - box_out, im_shape, angle_thresh=clip_angle_thresh - ) - else: - box_out = bbox_transform(boxes, deltas) - box_out = clip_tiled_boxes(box_out, im_shape) - return [box_out] - - op = core.CreateOperator( - "BBoxTransform", - ["rois", "deltas", "im_info"], - ["box_out"], - apply_scale=False, - correct_transform_coords=True, - rotated=rotated, - angle_bound_on=angle_bound_on, - clip_angle_thresh=clip_angle_thresh, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[rois, deltas, im_info], - reference=bbox_transform_ref, - ) - - @given( - roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10), - num_classes=st.integers(1, 10), - rotated=st.booleans(), - angle_bound_on=st.booleans(), - clip_angle_thresh=st.sampled_from([-1.0, 1.0]), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_bbox_transform_batch( - self, - roi_counts, - num_classes, - rotated, - angle_bound_on, - clip_angle_thresh, - gc, - dc, - ): - """ - Test with rois for multiple images in a batch - """ - batch_size = len(roi_counts) - total_rois = sum(roi_counts) - im_dims = np.random.randint(100, 600, batch_size) - rois = ( - generate_rois_rotated(roi_counts, im_dims) - if rotated - else generate_rois(roi_counts, im_dims) - ) - box_dim = 5 if rotated else 4 - deltas = np.random.randn(total_rois, box_dim * num_classes).astype(np.float32) - im_info = np.zeros((batch_size, 3)).astype(np.float32) - im_info[:, 0] = im_dims - im_info[:, 1] = im_dims - im_info[:, 2] = 1.0 - - def bbox_transform_ref(rois, deltas, im_info): - box_out = [] - offset = 0 - for i, num_rois in enumerate(roi_counts): - if num_rois == 0: - continue - cur_boxes = rois[offset : offset + num_rois, 1:] - cur_deltas = deltas[offset : offset + num_rois] - im_shape = im_info[i, 0:2] - if rotated: - cur_box_out = bbox_transform_rotated( - cur_boxes, cur_deltas, angle_bound_on=angle_bound_on - ) - cur_box_out = clip_tiled_boxes_rotated( - cur_box_out, im_shape, angle_thresh=clip_angle_thresh - ) - else: - cur_box_out = bbox_transform(cur_boxes, cur_deltas) - cur_box_out = 
clip_tiled_boxes(cur_box_out, im_shape) - box_out.append(cur_box_out) - offset += num_rois - - if len(box_out) > 0: - box_out = np.vstack(box_out) - else: - box_out = np.empty(deltas.shape).astype(np.float32) - return [box_out, roi_counts] - - op = core.CreateOperator( - "BBoxTransform", - ["rois", "deltas", "im_info"], - ["box_out", "roi_batch_splits"], - apply_scale=False, - correct_transform_coords=True, - rotated=rotated, - angle_bound_on=angle_bound_on, - clip_angle_thresh=clip_angle_thresh, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[rois, deltas, im_info], - reference=bbox_transform_ref, - ) diff --git a/caffe2/python/operator_test/bisect_percentile_op_test.py b/caffe2/python/operator_test/bisect_percentile_op_test.py deleted file mode 100644 index 2d22064d5712..000000000000 --- a/caffe2/python/operator_test/bisect_percentile_op_test.py +++ /dev/null @@ -1,179 +0,0 @@ -from typing import List - -import hypothesis.strategies as st - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu - -import bisect -import numpy as np - - -class TestBisectPercentileOp(hu.HypothesisTestCase): - def compare_reference( - self, - raw_data, - pct_raw_data, - pct_mapping, - pct_upper, - pct_lower, - lengths, - ): - def bisect_percentile_op_ref( - raw_data, - pct_raw_data, - pct_mapping, - pct_lower, - pct_upper, - lengths - ): - results = np.zeros_like(raw_data) - indices = [0] - for j in range(len(lengths)): - indices.append(indices[j] + lengths[j]) - for i in range(len(raw_data)): - for j in range(len(raw_data[0])): - start = indices[j] - end = indices[j + 1] - val = raw_data[i][j] - pct_raw_data_i = pct_raw_data[start:end] - pct_lower_i = pct_lower[start:end] - pct_upper_i = pct_upper[start:end] - pct_mapping_i = pct_mapping[start:end] - - # Corner cases - if val < pct_raw_data_i[0]: - results[i][j] = 0 - continue - if val > pct_raw_data_i[-1]: - results[i][j] = 1. 
- continue - - # interpolation - k = bisect.bisect_left(pct_raw_data_i, val) - if pct_raw_data_i[k] == val: - results[i][j] = pct_mapping_i[k] - else: - k = k - 1 - slope = ((pct_lower_i[k + 1] - pct_upper_i[k]) - / (pct_raw_data_i[k + 1] - pct_raw_data_i[k])) - results[i][j] = pct_upper_i[k] + \ - slope * (val - pct_raw_data_i[k]) - - return results - - workspace.ResetWorkspace() - workspace.FeedBlob("raw_data", raw_data) - - op = core.CreateOperator( - "BisectPercentile", - ["raw_data"], - ["pct_output"], - percentile_raw=pct_raw_data, - percentile_mapping=pct_mapping, - percentile_lower=pct_lower, - percentile_upper=pct_upper, - lengths=lengths - ) - workspace.RunOperatorOnce(op) - - expected_output = bisect_percentile_op_ref( - raw_data, - pct_raw_data, - pct_mapping, - pct_lower, - pct_upper, - lengths - ) - output = workspace.blobs['pct_output'] - np.testing.assert_array_almost_equal(output, expected_output) - - def test_bisect_percentil_op_simple(self): - raw_data = np.array([ - [1, 1], - [2, 2], - [3, 3], - [3, 1], - [9, 10], - [1.5, 5], - [1.32, 2.4], - [2.9, 5.7], - [-1, -1], - [3, 7] - ], dtype=np.float32) - pct_raw_data = np.array([1, 2, 3, 2, 7], dtype=np.float32) - pct_lower = np.array([0.1, 0.2, 0.9, 0.1, 0.5], dtype=np.float32) - pct_upper = np.array([0.1, 0.8, 1.0, 0.4, 1.0], dtype=np.float32) - pct_mapping = np.array([0.1, 0.5, 0.95, 0.25, 0.75], dtype=np.float32) - lengths = np.array([3, 2], dtype=np.int32) - self.compare_reference( - raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) - - @given( - N=st.integers(min_value=20, max_value=100), - lengths_in=st.lists( - elements=st.integers(min_value=2, max_value=10), - min_size=2, - max_size=5, - ), - max_value=st.integers(min_value=100, max_value=1000), - discrete=st.booleans(), - p=st.floats(min_value=0, max_value=0.9), - **hu.gcs_cpu_only - ) - def test_bisect_percentil_op_large( - self, N: int, lengths_in: List[int], max_value: int, discrete: bool, p: float, gc, dc - ): - lengths = np.array(lengths_in, dtype=np.int32) - D = len(lengths) - - if discrete: - raw_data = np.random.randint(0, max_value, size=(N, D)) - else: - raw_data = np.random.randn(N, D) - - # To generate valid pct_lower and pct_upper - pct_lower = [] - pct_upper = [] - pct_raw_data = [] - for i in range(D): - pct_lower_val = 0. - pct_upper_val = 0. - pct_lower_cur = [] - pct_upper_cur = [] - # There is no duplicated values in pct_raw_data - if discrete: - pct_raw_data_cur = np.random.choice( - np.arange(max_value), size=lengths[i], replace=False) - else: - pct_raw_data_cur = np.random.randn(lengths[i]) - while len(set(pct_raw_data_cur)) < lengths[i]: - pct_raw_data_cur = np.random.randn(lengths[i]) - pct_raw_data_cur = np.sort(pct_raw_data_cur) - for _ in range(lengths[i]): - pct_lower_val = pct_upper_val + 0.01 - pct_lower_cur.append(pct_lower_val) - pct_upper_val = pct_lower_val + \ - 0.01 * np.random.randint(1, 20) * (np.random.uniform() < p) - pct_upper_cur.append(pct_upper_val) - # normalization - pct_lower_cur = np.array(pct_lower_cur, np.float32) / pct_upper_val - pct_upper_cur = np.array(pct_upper_cur, np.float32) / pct_upper_val - pct_lower.extend(pct_lower_cur) - pct_upper.extend(pct_upper_cur) - pct_raw_data.extend(pct_raw_data_cur) - - pct_lower = np.array(pct_lower, dtype=np.float32) - pct_upper = np.array(pct_upper, dtype=np.float32) - pct_mapping = (pct_lower + pct_upper) / 2. 
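# A self-contained sketch of the interpolation rule the reference above
# implements: exact hits map through pct_mapping, everything else is linearly
# interpolated between upper[k] and lower[k + 1]. Values mirror the first
# feature of test_bisect_percentil_op_simple; the function name is illustrative.
import bisect
import numpy as np

def percentile_of(val, raw, mapping, lower, upper):
    # raw must be sorted and duplicate-free, as the generation above guarantees.
    if val < raw[0]:
        return 0.0
    if val > raw[-1]:
        return 1.0
    k = bisect.bisect_left(raw, val)
    if raw[k] == val:
        return float(mapping[k])
    k -= 1
    slope = (lower[k + 1] - upper[k]) / (raw[k + 1] - raw[k])
    return float(upper[k] + slope * (val - raw[k]))

raw = np.array([1.0, 2.0, 3.0])
lower = np.array([0.1, 0.2, 0.9])
upper = np.array([0.1, 0.8, 1.0])
mapping = (lower + upper) / 2.0          # midpoints: [0.1, 0.5, 0.95]
assert percentile_of(2.0, raw, mapping, lower, upper) == 0.5    # exact hit
assert percentile_of(0.5, raw, mapping, lower, upper) == 0.0    # below range
assert abs(percentile_of(1.5, raw, mapping, lower, upper) - 0.15) < 1e-6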
- raw_data = np.array(raw_data, dtype=np.float32) - pct_raw_data = np.array(pct_raw_data, dtype=np.float32) - - self.compare_reference( - raw_data, pct_raw_data, pct_mapping, pct_lower, pct_upper, lengths) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/blobs_queue_db_test.py b/caffe2/python/operator_test/blobs_queue_db_test.py deleted file mode 100644 index 88197d16d70b..000000000000 --- a/caffe2/python/operator_test/blobs_queue_db_test.py +++ /dev/null @@ -1,93 +0,0 @@ - - - - - -import numpy as np - -import caffe2.proto.caffe2_pb2 as caffe2_pb2 -from caffe2.python import core, workspace, timeout_guard, test_util - - -class BlobsQueueDBTest(test_util.TestCase): - def test_create_blobs_queue_db_string(self): - def add_blobs(queue, num_samples): - blob = core.BlobReference("blob") - status = core.BlobReference("blob_status") - for i in range(num_samples): - self._add_blob_to_queue( - queue, self._create_test_tensor_protos(i), blob, status - ) - self._test_create_blobs_queue_db(add_blobs) - - def test_create_blobs_queue_db_tensor(self): - def add_blobs(queue, num_samples): - blob = core.BlobReference("blob") - status = core.BlobReference("blob_status") - for i in range(num_samples): - data = self._create_test_tensor_protos(i) - data = np.array([data], dtype=str) - self._add_blob_to_queue( - queue, data, blob, status - ) - self._test_create_blobs_queue_db(add_blobs) - - def _test_create_blobs_queue_db(self, add_blobs_fun): - num_samples = 10000 - batch_size = 10 - init_net = core.Net('init_net') - net = core.Net('test_create_blobs_queue_db') - queue = init_net.CreateBlobsQueue([], 'queue', capacity=num_samples) - reader = init_net.CreateBlobsQueueDB( - [queue], - 'blobs_queue_db_reader', - value_blob_index=0, - timeout_secs=0.1, - ) - workspace.RunNetOnce(init_net) - - add_blobs_fun(queue, num_samples) - - net.TensorProtosDBInput( - [reader], ['image', 'label'], batch_size=batch_size) - workspace.CreateNet(net) - - close_net = core.Net('close_net') - close_net.CloseBlobsQueue([queue], []) - - for i in range(int(num_samples / batch_size)): - print("Running net, iteration {}".format(i)) - with timeout_guard.CompleteInTimeOrDie(2.0): - workspace.RunNet(net) - - images = workspace.FetchBlob('image') - labels = workspace.FetchBlob('label') - self.assertEqual(batch_size, len(images)) - self.assertEqual(batch_size, len(labels)) - for idx, item in enumerate(images): - self.assertEqual( - "foo{}".format(i * batch_size + idx).encode('utf-8'), item - ) - for item in labels: - self.assertEqual(1, item) - workspace.RunNetOnce(close_net) - - def _add_blob_to_queue(self, queue, data, blob, status): - workspace.FeedBlob(blob, data) - op = core.CreateOperator( - "SafeEnqueueBlobs", - [queue, blob], - [blob, status], - ) - workspace.RunOperatorOnce(op) - - def _create_test_tensor_protos(self, idx): - item = caffe2_pb2.TensorProtos() - data = item.protos.add() - data.data_type = core.DataType.STRING - data.string_data.append("foo{}".format(idx).encode('utf-8')) - label = item.protos.add() - label.data_type = core.DataType.INT32 - label.int32_data.append(1) - - return item.SerializeToString() diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py deleted file mode 100644 index 0ccdbd928512..000000000000 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ /dev/null @@ -1,404 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import 
caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import assume, given, settings -import hypothesis.strategies as st -import numpy as np - - -class TestBooleanMaskOp(serial.SerializedTestCase): - @given(x=hu.tensor1d(min_len=1, - max_len=100, - elements=hu.floats(min_value=0.5, max_value=1.0)), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_boolean_mask_gradient(self, x, gc, dc): - op = core.CreateOperator("BooleanMask", - ["data", "mask"], - "masked_data") - mask = np.random.choice(a=[True, False], size=x.shape[0]) - expected_gradient = np.copy(mask).astype(int) - self.assertDeviceChecks(dc, op, [x, mask], [0]) - self.assertGradientChecks(gc, op, [x, mask], 0, [0]) - - - @given(x=hu.tensor1d(min_len=1, - max_len=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) - @settings(deadline=10000) - def test_boolean_mask(self, x, gc, dc): - op = core.CreateOperator("BooleanMask", - ["data", "mask"], - "masked_data") - mask = np.random.choice(a=[True, False], size=x.shape[0]) - - def ref(x, mask): - return (x[mask],) - self.assertReferenceChecks(gc, op, [x, mask], ref) - self.assertDeviceChecks(dc, op, [x, mask], [0]) - - @given(x=hu.tensor1d(min_len=1, - max_len=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - **hu.gcs) - def test_boolean_mask_indices(self, x, gc, dc): - op = core.CreateOperator("BooleanMask", - ["data", "mask"], - ["masked_data", "masked_indices"]) - mask = np.random.choice(a=[True, False], size=x.shape[0]) - - def ref(x, mask): - return (x[mask], np.where(mask)[0]) - - self.assertReferenceChecks(gc, op, [x, mask], ref) - self.assertDeviceChecks(dc, op, [x, mask], [0]) - - @staticmethod - def _dtype_conversion(x, dtype, gc, dc): - """SequenceMask only supports fp16 with CUDA/ROCm.""" - if dtype == np.float16: - assume(core.IsGPUDeviceType(gc.device_type)) - dc = [d for d in dc if core.IsGPUDeviceType(d.device_type)] - x = x.astype(dtype) - return x, dc - - @given(x=hu.tensor(min_dim=2, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - def test_sequence_mask_with_lengths(self, x, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - op = core.CreateOperator("SequenceMask", - ["data", "lengths"], - ["masked_data"], - mode="sequence", - axis=len(x.shape) - 1, - fill_val=fill_val) - elem_dim = x.shape[-1] - leading_dim = 1 - for dim in x.shape[:-1]: - leading_dim *= dim - lengths = np.random.randint(0, elem_dim, [leading_dim])\ - .astype(np.int32) - - def ref(x, lengths): - ref = np.reshape(x, [leading_dim, elem_dim]) - for i in range(leading_dim): - for j in range(elem_dim): - if j >= lengths[i]: - ref[i, j] = fill_val - return [ref.reshape(x.shape)] - - self.assertReferenceChecks(gc, op, [x, lengths], ref) - self.assertDeviceChecks(dc, op, [x, lengths], [0]) - - @given(x=hu.tensor(min_dim=2, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_sequence_mask_with_window(self, x, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - radius = 2 - op = core.CreateOperator("SequenceMask", - ["data", "centers"], - ["masked_data"], - mode="window", - radius=radius, - axis=len(x.shape) - 1, - 
fill_val=fill_val) - elem_dim = x.shape[-1] - leading_dim = 1 - for dim in x.shape[:-1]: - leading_dim *= dim - centers = np.random.randint(0, elem_dim, [leading_dim])\ - .astype(np.int32) - - def ref(x, centers): - ref = np.reshape(x, [leading_dim, elem_dim]) - for i in range(leading_dim): - for j in range(elem_dim): - if j > centers[i] + radius or j < centers[i] - radius: - ref[i, j] = fill_val - return [ref.reshape(x.shape)] - - self.assertReferenceChecks(gc, op, [x, centers], ref) - self.assertDeviceChecks(dc, op, [x, centers], [0]) - - # Gradient check with np.float16 is found to be flakey, disable for now - # with high threshold (to repro, set threshold to 0.4). - threshold = 1.0 if dtype == np.float16 else 0.005 - self.assertGradientChecks(gc, op, [x, centers], 0, [0], - threshold=threshold) - - @given(x=hu.tensor(min_dim=2, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - mode=st.sampled_from(['upper', 'lower', 'upperdiag', 'lowerdiag']), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_sequence_mask_triangle(self, x, mode, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - op = core.CreateOperator("SequenceMask", - ["data"], - ["masked_data"], - mode=mode, - axis=len(x.shape) - 1, - fill_val=fill_val) - elem_dim = x.shape[-1] - leading_dim = 1 - for dim in x.shape[:-1]: - leading_dim *= dim - - if mode == 'upper': - def compare(i, j): - return j > i - elif mode == 'lower': - def compare(i, j): - return j < i - elif mode == 'upperdiag': - def compare(i, j): - return j >= i - elif mode == 'lowerdiag': - def compare(i, j): - return j <= i - - def ref(x): - ref = np.reshape(x, [leading_dim, elem_dim]) - for i in range(leading_dim): - for j in range(elem_dim): - if compare(i, j): - ref[i, j] = fill_val - return [ref.reshape(x.shape)] - - self.assertReferenceChecks(gc, op, [x], ref) - self.assertDeviceChecks(dc, op, [x], [0]) - - # Gradient check with np.float16 is found to be flakey, disable for now - # with high threshold (to repro, set threshold to 0.4). - threshold = 1.0 if dtype == np.float16 else 0.005 - stepsize = 0.1 if dtype == np.float16 else 0.05 - self.assertGradientChecks(gc, op, [x], 0, [0], - threshold=threshold, stepsize=stepsize) - - @given(x=hu.tensor(min_dim=2, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_sequence_mask_batching_lengths(self, x, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - # choose _different_ batch and axis dimensions, w/ axis != 0. 
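# The comment above describes a rejection loop: reshuffle dimension indices
# until batch < axis and axis != 0. An equivalent direct draw, as an
# illustrative sketch only (pick two distinct dims and order them):
import numpy as np

def pick_batch_and_axis(ndim, rng=np.random):
    # Requires ndim >= 2; result is uniform over {(b, a): 0 <= b < a < ndim},
    # which forces axis >= 1 automatically.
    batch, axis = sorted(rng.choice(np.arange(ndim), size=2, replace=False))
    return int(batch), int(axis)
# (the while-loop below realizes the same pick by rejection)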
- axis = 0 - batch = 0 - while axis == 0 or axis < batch: - inds = np.arange(len(x.shape)) - np.random.shuffle(inds) - batch = inds[0] - axis = inds[1] - op = core.CreateOperator("SequenceMask", - ["data", "lengths"], - ["masked_data"], - mode='sequence', - axis=axis, - fill_val=fill_val, - batch=batch) - - before = int(np.prod(x.shape[:batch + 1])) - between = int(np.prod(x.shape[batch + 1:axis])) - after = int(np.prod(x.shape[axis:])) - - lengths = np.random.randint(0, after, [between])\ - .astype(np.int32) - - def ref(z, l): - w = np.reshape(z, [before, between, after]) - - for b in range(before): - r = w[b, :, :] - for i in range(between): - for j in range(after): - if j >= l[i]: - r[i, j] = fill_val - return [w.reshape(z.shape)] - - self.assertReferenceChecks(gc, op, [x, lengths], ref) - self.assertDeviceChecks(dc, op, [x, lengths], [0]) - - # Gradient check with np.float16 is found to be flakey, disable for now - # with high threshold (to repro, set threshold to 0.4). - threshold = 1.0 if dtype == np.float16 else 0.005 - self.assertGradientChecks(gc, op, [x, lengths], 0, [0], - threshold=threshold) - - @given(x=hu.tensor(min_dim=4, - max_dim=4, - elements=hu.floats(min_value=0.5, max_value=1.0)), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_sequence_mask_batching_window(self, x, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - radius = 1 - # choose _different_ batch and axis dimensions, w/ axis != 0. - axis = 0 - batch = 0 - while axis == 0 or axis < batch: - inds = np.arange(len(x.shape)) - np.random.shuffle(inds) - batch = inds[0] - axis = inds[1] - op = core.CreateOperator("SequenceMask", - ["data", "centers"], - ["masked_data"], - mode='window', - radius=radius, - axis=axis, - fill_val=fill_val, - batch=batch) - - before = int(np.prod(x.shape[:batch + 1])) - between = int(np.prod(x.shape[batch + 1:axis])) - after = int(np.prod(x.shape[axis:])) - - centers = np.random.randint(0, after, [between])\ - .astype(np.int32) - - def ref(z, c): - w = np.reshape(z, [before, between, after]) - - for b in range(before): - r = w[b, :, :] - for i in range(between): - for j in range(after): - if j > c[i] + radius or j < c[i] - radius: - r[i, j] = fill_val - return [w.reshape(z.shape)] - - self.assertReferenceChecks(gc, op, [x, centers], ref) - self.assertDeviceChecks(dc, op, [x, centers], [0]) - - # Gradient check with np.float16 is found to be flakey, disable for now - # with high threshold (to repro, set threshold to 0.4). - threshold = 1.0 if dtype == np.float16 else 0.005 - self.assertGradientChecks(gc, op, [x, centers], 0, [0], - threshold=threshold) - - @given(x=hu.tensor(min_dim=3, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - mode=st.sampled_from(['upper', 'lower', 'upperdiag', 'lowerdiag']), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_sequence_mask_batching_triangle(self, x, mode, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - # choose _different_ batch and axis dimensions, w/ axis != 0. 
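# The four triangle modes tested in this file dispatch on an if/elif chain;
# the same mapping can be written as a predicate table. An equivalent sketch,
# not the operator's implementation:
TRIANGLE_PREDICATES = {
    'upper':     lambda i, j: j > i,      # strictly above the diagonal
    'lower':     lambda i, j: j < i,      # strictly below the diagonal
    'upperdiag': lambda i, j: j >= i,     # diagonal included
    'lowerdiag': lambda i, j: j <= i,     # diagonal included
}
# positions (i, j) where TRIANGLE_PREDICATES[mode](i, j) holds receive fill_val;
# the batch/axis rejection pick for this test follows below.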
- axis = 0 - batch = 0 - while axis == 0 or axis < batch: - inds = np.arange(len(x.shape)) - np.random.shuffle(inds) - batch = inds[0] - axis = inds[1] - op = core.CreateOperator("SequenceMask", - ["data"], - ["masked_data"], - mode=mode, - axis=axis, - fill_val=fill_val, - batch=batch) - - if mode == 'upper': - def compare(i, j): - return j > i - elif mode == 'lower': - def compare(i, j): - return j < i - elif mode == 'upperdiag': - def compare(i, j): - return j >= i - elif mode == 'lowerdiag': - def compare(i, j): - return j <= i - - def ref(z): - before = int(np.prod(z.shape[:batch + 1])) - between = int(np.prod(z.shape[batch + 1:axis])) - after = int(np.prod(z.shape[axis:])) - - w = np.reshape(z, [before, between, after]) - - for b in range(before): - r = w[b, :, :] - for i in range(between): - for j in range(after): - if compare(i, j): - r[i, j] = fill_val - return [w.reshape(z.shape)] - - self.assertReferenceChecks(gc, op, [x], ref) - self.assertDeviceChecks(dc, op, [x], [0]) - - # Gradient check with np.float16 is found to be flakey, disable for now - # with high threshold (to repro, set threshold to 0.4). - threshold = 1.0 if dtype == np.float16 else 0.005 - stepsize = 0.1 if dtype == np.float16 else 0.05 - self.assertGradientChecks(gc, op, [x], 0, [0], - threshold=threshold, stepsize=stepsize) - - @given(x=hu.tensor(min_dim=3, - max_dim=5, - elements=hu.floats(min_value=0.5, max_value=1.0)), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - def test_sequence_mask_repeated(self, x, dtype, gc, dc): - x, dc = self._dtype_conversion(x, dtype, gc, dc) - # finite fill value needed for gradient check - fill_val = 1e-3 if dtype == np.float16 else 1e-9 - op = core.CreateOperator("SequenceMask", - ["data", "lengths"], - ["masked_data"], - mode="sequence", - axis=len(x.shape) - 2, - repeat_from_axis=-1, - fill_val=fill_val) - - elem_dim = x.shape[-2] - leading_dim = 1 - for dim in x.shape[:-2]: - leading_dim *= dim - lengths = np.random.randint(0, elem_dim, [leading_dim])\ - .astype(np.int32) - - def ref(x, lengths): - ref = np.reshape(x, [leading_dim, elem_dim, -1]) - for i in range(leading_dim): - for j in range(elem_dim): - if j >= lengths[i]: - ref[i, j, :] = fill_val - return [ref.reshape(x.shape)] - - self.assertReferenceChecks(gc, op, [x, lengths], ref) - self.assertDeviceChecks(dc, op, [x, lengths], [0]) diff --git a/caffe2/python/operator_test/boolean_unmask_test.py b/caffe2/python/operator_test/boolean_unmask_test.py deleted file mode 100644 index 8cba2aecf1a4..000000000000 --- a/caffe2/python/operator_test/boolean_unmask_test.py +++ /dev/null @@ -1,62 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestUnmaskOp(serial.SerializedTestCase): - @serial.given(N=st.integers(min_value=2, max_value=20), - dtype=st.sampled_from([ - np.bool_, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.float16, - np.float32, - np.float64]), - **hu.gcs) - def test(self, N, dtype, gc, dc): - if dtype is np.bool_: - all_value = np.random.choice(a=[True, False], size=N) - else: - all_value = (np.random.rand(N) * N).astype(dtype) - - M = np.random.randint(1, N) - split = sorted(np.random.randint(1, N, size=M)) - indices = np.random.permutation(N) - pieces = np.split(indices, split) - - def ref(*args, **kwargs): - return (all_value,) - - inputs = [] - inputs_names = [] - for i, 
piece in enumerate(pieces): - piece.sort() - mask = np.zeros(N, dtype=np.bool_) - mask[piece] = True - values = all_value[piece] - inputs.extend([mask, values]) - inputs_names.extend(["mask%d" % i, "value%d" % i]) - - op = core.CreateOperator( - 'BooleanUnmask', - inputs_names, - 'output') - - self.assertReferenceChecks(gc, op, inputs, ref) - self.assertDeviceChecks(dc, op, inputs, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/box_with_nms_limit_op_test.py b/caffe2/python/operator_test/box_with_nms_limit_op_test.py deleted file mode 100644 index e459edb57de3..000000000000 --- a/caffe2/python/operator_test/box_with_nms_limit_op_test.py +++ /dev/null @@ -1,253 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import unittest -import numpy as np - - -def get_op(input_len, output_len, args): - input_names = ['in_scores', 'in_boxes', 'in_batch_splits'] - assert input_len <= len(input_names) - input_names = input_names[:input_len] - - out_names = ['scores', 'boxes', 'classes', 'batch_splits', 'keeps', 'keeps_size'] - assert output_len <= len(out_names) - out_names = out_names[:output_len] - - op = core.CreateOperator( - 'BoxWithNMSLimit', - input_names, - out_names, - **args) - - return op - - -HU_CONFIG = { - 'gc': hu.gcs_cpu_only['gc'], -} - - -def gen_boxes(count, center): - len = 10 - len_half = len / 2.0 - ret = np.tile( - np.array( - [center[0] - len_half, center[1] - len_half, - center[0] + len_half, center[1] + len_half] - ).astype(np.float32), - (count, 1) - ) - return ret - - -def gen_multiple_boxes(centers, scores, count, num_classes): - ret_box = None - ret_scores = None - for cc, ss in zip(centers, scores): - box = gen_boxes(count, cc) - ret_box = np.vstack((ret_box, box)) if ret_box is not None else box - cur_sc = np.ones((count, 1), dtype=np.float32) * ss - ret_scores = np.vstack((ret_scores, cur_sc)) \ - if ret_scores is not None else cur_sc - ret_box = np.tile(ret_box, (1, num_classes)) - ret_scores = np.tile(ret_scores, (1, num_classes)) - assert ret_box.shape == (len(centers) * count, 4 * num_classes) - assert ret_scores.shape == (len(centers) * count, num_classes) - return ret_box, ret_scores - - -class TestBoxWithNMSLimitOp(serial.SerializedTestCase): - @given(**HU_CONFIG) - @settings(deadline=10000) - def test_simple(self, gc): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.9, 0.8, 0.6] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, 2) - - gt_boxes, gt_scores = gen_multiple_boxes(in_centers, in_scores, 1, 1) - gt_classes = np.ones(gt_boxes.shape[0], dtype=np.float32) - - op = get_op(2, 3, {"score_thresh": 0.5, "nms": 0.9}) - - def ref(*args, **kwargs): - return (gt_scores.flatten(), gt_boxes, gt_classes) - - self.assertReferenceChecks(gc, op, [scores, boxes], ref) - - @given(**HU_CONFIG) - @settings(deadline=10000) - def test_score_thresh(self, gc): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.7, 0.85, 0.6] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, 2) - - gt_centers = [(20, 20)] - gt_scores = [0.85] - gt_boxes, gt_scores = gen_multiple_boxes(gt_centers, gt_scores, 1, 1) - gt_classes = np.ones(gt_boxes.shape[0], dtype=np.float32) - - op = get_op(2, 3, {"score_thresh": 0.8, "nms": 0.9}) - - def ref(*args, **kwargs): - return (gt_scores.flatten(), 
gt_boxes, gt_classes) - - self.assertReferenceChecks(gc, op, [scores, boxes], ref) - - @given(det_per_im=st.integers(1, 3), **HU_CONFIG) - @settings(deadline=10000) - def test_detections_per_im(self, det_per_im, gc): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.7, 0.85, 0.6] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, 2) - - gt_centers = [(20, 20), (0, 0), (50, 50)][:det_per_im] - gt_scores = [0.85, 0.7, 0.6][:det_per_im] - gt_boxes, gt_scores = gen_multiple_boxes(gt_centers, gt_scores, 1, 1) - gt_classes = np.ones(gt_boxes.shape[0], dtype=np.float32) - - op = get_op( - 2, 3, - {"score_thresh": 0.5, "nms": 0.9, "detections_per_im": det_per_im} - ) - - def ref(*args, **kwargs): - return (gt_scores.flatten(), gt_boxes, gt_classes) - - self.assertReferenceChecks(gc, op, [scores, boxes], ref) - - @given( - num_classes=st.integers(2, 10), - det_per_im=st.integers(1, 4), - cls_agnostic_bbox_reg=st.booleans(), - input_boxes_include_bg_cls=st.booleans(), - output_classes_include_bg_cls=st.booleans(), - **HU_CONFIG - ) - @settings(deadline=10000) - def test_multiclass( - self, - num_classes, - det_per_im, - cls_agnostic_bbox_reg, - input_boxes_include_bg_cls, - output_classes_include_bg_cls, - gc - ): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.7, 0.85, 0.6] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, num_classes) - - if not input_boxes_include_bg_cls: - # remove background class - boxes = boxes[:, 4:] - if cls_agnostic_bbox_reg: - # only leave one class - boxes = boxes[:, :4] - # randomize un-used scores for background class - scores_bg_class_id = 0 if input_boxes_include_bg_cls else -1 - scores[:, scores_bg_class_id] = np.random.rand(scores.shape[0]).astype(np.float32) - - gt_centers = [(20, 20), (0, 0), (50, 50)][:det_per_im] - gt_scores = [0.85, 0.7, 0.6][:det_per_im] - gt_boxes, gt_scores = gen_multiple_boxes(gt_centers, gt_scores, 1, 1) - # [1, 1, 1, 2, 2, 2, 3, 3, 3, ...] 
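# The np.tile(...).T.flatten() expression below realizes the pattern in the
# comment above; a quick standalone check (sizes illustrative):
import numpy as np
num_fg_classes, n_boxes = 3, 3
labels = np.tile(np.arange(1, num_fg_classes + 1, dtype=np.float32),
                 (n_boxes, 1)).T.flatten()
# each foreground class id is repeated once per ground-truth box, in class order
assert labels.tolist() == [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0]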
- gt_classes = np.tile( - np.array(range(1, num_classes), dtype=np.float32), - (gt_boxes.shape[0], 1)).T.flatten() - if not output_classes_include_bg_cls: - # remove background class - gt_classes -= 1 - gt_boxes = np.tile(gt_boxes, (num_classes - 1, 1)) - gt_scores = np.tile(gt_scores, (num_classes - 1, 1)).flatten() - - op = get_op( - 2, 3, - { - "score_thresh": 0.5, - "nms": 0.9, - "detections_per_im": (num_classes - 1) * det_per_im, - "cls_agnostic_bbox_reg": cls_agnostic_bbox_reg, - "input_boxes_include_bg_cls": input_boxes_include_bg_cls, - "output_classes_include_bg_cls": output_classes_include_bg_cls - } - ) - - def ref(*args, **kwargs): - return (gt_scores, gt_boxes, gt_classes) - - self.assertReferenceChecks(gc, op, [scores, boxes], ref) - - @given(det_per_im=st.integers(1, 3), **HU_CONFIG) - def test_detections_per_im_same_thresh(self, det_per_im, gc): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.7, 0.7, 0.7] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, 2) - - gt_centers = [(20, 20), (0, 0), (50, 50)][:det_per_im] - gt_scores = [0.7, 0.7, 0.7][:det_per_im] - gt_boxes, gt_scores = gen_multiple_boxes(gt_centers, gt_scores, 1, 1) - gt_classes = np.ones(gt_boxes.shape[0], dtype=np.float32) - - op = get_op( - 2, 3, - {"score_thresh": 0.5, "nms": 0.9, "detections_per_im": det_per_im} - ) - - # boxes output could be in any order - def verify(inputs, outputs): - # check scores - np.testing.assert_allclose( - outputs[0], gt_scores.flatten(), atol=1e-4, rtol=1e-4, - ) - # check classes - np.testing.assert_allclose( - outputs[2], gt_classes, atol=1e-4, rtol=1e-4, - ) - self.assertEqual(outputs[1].shape, gt_boxes.shape) - - self.assertValidationChecks(gc, op, [scores, boxes], verify, as_kwargs=False) - - @given(num_classes=st.integers(2, 10), **HU_CONFIG) - def test_detections_per_im_same_thresh_multiclass(self, num_classes, gc): - in_centers = [(0, 0), (20, 20), (50, 50)] - in_scores = [0.6, 0.7, 0.7] - boxes, scores = gen_multiple_boxes(in_centers, in_scores, 10, num_classes) - - det_per_im = 1 - gt_centers = [(20, 20), (50, 50)] - gt_scores = [0.7, 0.7] - gt_boxes, gt_scores = gen_multiple_boxes(gt_centers, gt_scores, 1, 1) - - op = get_op( - 2, 3, - {"score_thresh": 0.5, "nms": 0.9, "detections_per_im": det_per_im} - ) - - # boxes output could be in any order - def verify(inputs, outputs): - # check scores - self.assertEqual(outputs[0].shape, (1,)) - self.assertEqual(outputs[0][0], gt_scores[0]) - - # check boxes - self.assertTrue( - np.allclose(outputs[1], gt_boxes[0, :], atol=1e-4, rtol=1e-4) or - np.allclose(outputs[1], gt_boxes[1, :], atol=1e-4, rtol=1e-4) - ) - - # check class - self.assertNotEqual(outputs[2][0], 0) - - self.assertValidationChecks(gc, op, [scores, boxes], verify, as_kwargs=False) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/bucketize_op_test.py b/caffe2/python/operator_test/bucketize_op_test.py deleted file mode 100644 index 2eb2acf87902..000000000000 --- a/caffe2/python/operator_test/bucketize_op_test.py +++ /dev/null @@ -1,34 +0,0 @@ - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import numpy as np - - -class TestBucketizeOp(hu.HypothesisTestCase): - @given( - x=hu.tensor( - min_dim=1, max_dim=2, dtype=np.float32, - elements=hu.floats(min_value=-5, max_value=5)), - **hu.gcs) - def test_bucketize_op(self, x, gc, dc): - length = np.random.randint(low=1, high=5) - boundaries = np.random.randn(length) * 5 - 
boundaries.sort() - - def ref(x, boundaries): - bucket_idx = np.digitize(x, boundaries, right=True) - return [bucket_idx] - - op = core.CreateOperator('Bucketize', - ["X"], ["INDICES"], - boundaries=boundaries) - self.assertReferenceChecks(gc, op, [x, boundaries], ref) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/cast_op_test.py b/caffe2/python/operator_test/cast_op_test.py deleted file mode 100644 index 95540a6121bc..000000000000 --- a/caffe2/python/operator_test/cast_op_test.py +++ /dev/null @@ -1,48 +0,0 @@ - - - - - - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - -from hypothesis import given -import numpy as np - - -class TestCastOp(hu.HypothesisTestCase): - - @given(**hu.gcs) - def test_cast_int_float(self, gc, dc): - data = np.random.rand(5, 5).astype(np.int32) - # from int to float - op = core.CreateOperator('Cast', 'data', 'data_cast', to=1, from_type=2) - self.assertDeviceChecks(dc, op, [data], [0]) - # This is actually 0 - self.assertGradientChecks(gc, op, [data], 0, [0]) - - @given(**hu.gcs) - def test_cast_int_float_empty(self, gc, dc): - data = np.random.rand(0).astype(np.int32) - # from int to float - op = core.CreateOperator('Cast', 'data', 'data_cast', to=1, from_type=2) - self.assertDeviceChecks(dc, op, [data], [0]) - # This is actually 0 - self.assertGradientChecks(gc, op, [data], 0, [0]) - - @given(data=hu.tensor(dtype=np.int32), **hu.gcs_cpu_only) - def test_cast_int_to_string(self, data, gc, dc): - op = core.CreateOperator( - 'Cast', 'data', 'data_cast', to=core.DataType.STRING) - - def ref(data): - ret = data.astype(dtype=str) - # the string blob will be fetched as object, we feed and re-fetch - # to mimic this. - with hu.temp_workspace('tmp_ref_int_to_string'): - workspace.FeedBlob('tmp_blob', ret) - fetched_ret = workspace.FetchBlob('tmp_blob') - return (fetched_ret, ) - - self.assertReferenceChecks(gc, op, inputs=[data], reference=ref) diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py deleted file mode 100644 index e8ee47702445..000000000000 --- a/caffe2/python/operator_test/ceil_op_test.py +++ /dev/null @@ -1,39 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from hypothesis import given, settings -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestCeil(serial.SerializedTestCase): - - @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_ceil(self, X, gc, dc, engine): - op = core.CreateOperator("Ceil", ["X"], ["Y"], engine=engine) - - def ceil_ref(X): - return (np.ceil(X),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ceil_ref) - - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/channel_backprop_stats_op_test.py b/caffe2/python/operator_test/channel_backprop_stats_op_test.py deleted file mode 100644 index 7adc5ce24fb7..000000000000 --- a/caffe2/python/operator_test/channel_backprop_stats_op_test.py +++ /dev/null @@ -1,62 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -from hypothesis import given, settings -import caffe2.python.serialized_test.serialized_test_util as serial 
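# Back-reference to the Bucketize test above: its NumPy reference is a single
# np.digitize call with right-closed buckets. A minimal sketch (values are
# illustrative, not from the test):
import numpy as np
boundaries = np.array([-1.0, 0.5, 2.0])
x = np.array([[-3.0, 0.5], [1.0, 4.0]], dtype=np.float32)
idx = np.digitize(x, boundaries, right=True)
assert idx.tolist() == [[0, 1], [2, 3]]   # 0: x <= -1.0 ... 3: x > 2.0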
-import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestChannelBackpropStats(serial.SerializedTestCase): - @given( - size=st.integers(7, 10), - inputChannels=st.integers(1, 10), - batchSize=st.integers(1, 3), - **hu.gcs - ) - @settings(deadline=10000) - def testChannelBackpropStats(self, size, inputChannels, batchSize, gc, dc): - - op = core.CreateOperator( - "ChannelBackpropStats", - ["X", "mean", "invStdDev", "outputGrad"], - ["scaleGrad", "biasGrad"], - ) - - def referenceChannelBackpropStatsTest(X, mean, invStdDev, outputGrad): - scaleGrad = np.zeros(inputChannels) - biasGrad = np.zeros(inputChannels) - for n in range(batchSize): - for c in range(inputChannels): - for h in range(size): - for w in range(size): - biasGrad[c] += outputGrad[n, c, h, w] - scaleGrad[c] += ( - X[n, c, h, w] - mean[c] - ) * invStdDev[c] * outputGrad[n, c, h, w] - return scaleGrad, biasGrad - - X = np.random.rand(batchSize, inputChannels, size, size)\ - .astype(np.float32) - 0.5 - sums = np.sum(X, axis=(0, 2, 3), keepdims=False) - numPixels = size * size * batchSize - mean = sums / numPixels - sumsq = np.sum(X**2, axis=(0, 2, 3), keepdims=False) - var = ((sumsq - - (sums * sums) / numPixels) / numPixels).astype(np.float32) - invStdDev = 1 / np.sqrt(var) - outputGrad = np.random.rand(batchSize, inputChannels, size, size)\ - .astype(np.float32) - 0.5 - self.assertReferenceChecks( - gc, op, [X, mean, invStdDev, outputGrad], - referenceChannelBackpropStatsTest - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/channel_shuffle_test.py b/caffe2/python/operator_test/channel_shuffle_test.py deleted file mode 100644 index b821e7b6a43c..000000000000 --- a/caffe2/python/operator_test/channel_shuffle_test.py +++ /dev/null @@ -1,57 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core - - -class ChannelShuffleOpsTest(serial.SerializedTestCase): - def _channel_shuffle_nchw_ref(self, X, group): - dims = X.shape - N = dims[0] - C = dims[1] - G = group - K = int(C / G) - X = X.reshape(N, G, K, np.prod(dims[2:])) - Y = np.transpose(X, axes=(0, 2, 1, 3)) - return [Y.reshape(dims)] - - def _channel_shuffle_nhwc_ref(self, X, group): - dims = X.shape - N = dims[0] - C = dims[-1] - G = group - K = int(C / G) - X = X.reshape(N, np.prod(dims[1:-1]), G, K) - Y = np.transpose(X, axes=(0, 1, 3, 2)) - return [Y.reshape(dims)] - - @serial.given( - N=st.integers(0, 5), - G=st.integers(1, 5), - K=st.integers(1, 5), - H=st.integers(1, 5), - W=st.integers(1, 5), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs - ) - def test_channel_shuffle(self, N, G, K, H, W, order, gc, dc): - C = G * K - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) - else: - X = np.random.randn(N, H, W, C).astype(np.float32) - - op = core.CreateOperator("ChannelShuffle", ["X"], ["Y"], group=G, order=order) - - def channel_shuffle_ref(X): - if order == "NCHW": - return self._channel_shuffle_nchw_ref(X, G) - else: - return self._channel_shuffle_nhwc_ref(X, G) - - self.assertReferenceChecks(gc, op, [X], channel_shuffle_ref) - self.assertGradientChecks(gc, op, [X], 0, [0]) - self.assertDeviceChecks(dc, op, [X], [0]) diff --git a/caffe2/python/operator_test/channel_stats_op_test.py b/caffe2/python/operator_test/channel_stats_op_test.py deleted file mode 100644 index 72eedc479dd6..000000000000 --- 
a/caffe2/python/operator_test/channel_stats_op_test.py +++ /dev/null @@ -1,87 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestChannelStatsOp(serial.SerializedTestCase): - def channel_stats_nchw_ref(self, X): - dims = X.shape - N = dims[0] - C = dims[1] - X = X.reshape(N, C, -1) - sum1 = np.sum(X, axis=(0, 2), keepdims=False) - sum2 = np.sum(X**2, axis=(0, 2), keepdims=False) - return (sum1, sum2) - - def channel_stats_nhwc_ref(self, X): - dims = X.shape - N = dims[0] - C = dims[-1] - X = X.reshape(N, -1, C) - sum1 = np.sum(X, axis=(0, 1), keepdims=False) - sum2 = np.sum(X**2, axis=(0, 1), keepdims=False) - return (sum1, sum2) - - @given( - N=st.integers(1, 5), C=st.integers(1, 10), H=st.integers(1, 12), - W=st.integers(1, 12), order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - @settings(deadline=10000) - def test_channel_stats_2d(self, N, C, H, W, order, gc, dc): - op = core.CreateOperator( - "ChannelStats", - ["X"], - ["sum", "sumsq"], - order=order, - ) - - def ref_op(X): - if order == "NCHW": - return self.channel_stats_nchw_ref(X) - else: - return self.channel_stats_nhwc_ref(X) - - X = np.random.randn(N, C, H, W).astype(np.float32) - if order == "NHWC": - X = np.transpose(X, [0, 2, 3, 1]) - - self.assertReferenceChecks(gc, op, [X], reference=ref_op) - self.assertDeviceChecks(dc, op, [X], [0, 1]) - - @given( - N=st.integers(1, 5), C=st.integers(1, 10), D=st.integers(1, 6), - H=st.integers(1, 6), W=st.integers(1, 6), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - @settings(deadline=10000) - def test_channel_stats_3d(self, N, C, D, H, W, order, gc, dc): - op = core.CreateOperator( - "ChannelStats", - ["X"], - ["sum", "sumsq"], - order=order, - ) - - def ref_op(X): - if order == "NCHW": - return self.channel_stats_nchw_ref(X) - else: - return self.channel_stats_nhwc_ref(X) - - X = np.random.randn(N, C, D, H, W).astype(np.float32) - if order == "NHWC": - X = np.transpose(X, [0, 2, 3, 4, 1]) - - self.assertReferenceChecks(gc, op, [X], reference=ref_op) - self.assertDeviceChecks(dc, op, [X], [0, 1]) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/checkpoint_test.py b/caffe2/python/operator_test/checkpoint_test.py deleted file mode 100644 index 3042e5989764..000000000000 --- a/caffe2/python/operator_test/checkpoint_test.py +++ /dev/null @@ -1,44 +0,0 @@ - - - - - -from caffe2.python import core, workspace, test_util -import os -import shutil -import tempfile -import unittest - - -class CheckpointTest(test_util.TestCase): - """A simple test case to make sure that the checkpoint behavior is correct. - """ - - @unittest.skipIf("LevelDB" not in core.C.registered_dbs(), "Need LevelDB") - def testCheckpoint(self): - temp_root = tempfile.mkdtemp() - net = core.Net("test_checkpoint") - # Note(jiayq): I am being a bit lazy here and am using the old iter - # convention that does not have an input. Optionally change it to the - # new style if needed. 
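# The ChannelStats references above reduce over every non-channel dimension;
# the ChannelBackpropStats test earlier turns the same two sums into mean and
# (biased) variance. A compact NCHW sketch, names illustrative:
import numpy as np
X = np.random.randn(2, 3, 4, 4).astype(np.float32)      # N, C, H, W
sum1 = X.reshape(2, 3, -1).sum(axis=(0, 2))             # per-channel sum
sum2 = (X ** 2).reshape(2, 3, -1).sum(axis=(0, 2))      # per-channel sum of squares
n = X.shape[0] * X.shape[2] * X.shape[3]                 # pixels per channel
mean = sum1 / n
var = sum2 / n - mean ** 2                                # E[x^2] - E[x]^2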
- net.Iter([], "iter") - net.ConstantFill([], "value", shape=[1, 2, 3]) - net.Checkpoint(["iter", "value"], [], - db=os.path.join(temp_root, "test_checkpoint_at_%05d"), - db_type="leveldb", every=10, absolute_path=True) - self.assertTrue(workspace.CreateNet(net)) - for i in range(100): - self.assertTrue(workspace.RunNet("test_checkpoint")) - for i in range(1, 10): - # Print statements are only for debugging purposes. - # print("Asserting %d" % i) - # print(os.path.join(temp_root, "test_checkpoint_at_%05d" % (i * 10))) - self.assertTrue(os.path.exists( - os.path.join(temp_root, "test_checkpoint_at_%05d" % (i * 10)))) - - # Finally, clean up. - shutil.rmtree(temp_root) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/clip_op_test.py b/caffe2/python/operator_test/clip_op_test.py deleted file mode 100644 index 0e800dafe01a..000000000000 --- a/caffe2/python/operator_test/clip_op_test.py +++ /dev/null @@ -1,68 +0,0 @@ - - - - - -import numpy as np - -from hypothesis import given, settings -import hypothesis.strategies as st - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -class TestClip(serial.SerializedTestCase): - @given(X=hu.tensor(min_dim=0), - min_=st.floats(min_value=-2, max_value=0), - max_=st.floats(min_value=0, max_value=2), - inplace=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_clip(self, X, min_, max_, inplace, gc, dc): - # go away from the origin point to avoid kink problems - if np.isscalar(X): - X = np.array([], dtype=np.float32) - else: - X[np.abs(X - min_) < 0.05] += 0.1 - X[np.abs(X - max_) < 0.05] += 0.1 - - def clip_ref(X): - X = X.clip(min_, max_) - return (X,) - - op = core.CreateOperator( - "Clip", - ["X"], ["Y" if not inplace else "X"], - min=min_, - max=max_) - self.assertReferenceChecks(gc, op, [X], clip_ref) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - # Gradient check wrt X - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(X=hu.tensor(min_dim=0), - inplace=st.booleans(), - **hu.gcs) - def test_clip_default(self, X, inplace, gc, dc): - # go away from the origin point to avoid kink problems - if np.isscalar(X): - X = np.array([], dtype=np.float32) - else: - X += 0.04 * np.sign(X) - def clip_ref(X): - return (X,) - - op = core.CreateOperator( - "Clip", - ["X"], ["Y" if not inplace else "X"]) - self.assertReferenceChecks(gc, op, [X], clip_ref) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/clip_tensor_op_test.py b/caffe2/python/operator_test/clip_tensor_op_test.py deleted file mode 100644 index c90c38234c8e..000000000000 --- a/caffe2/python/operator_test/clip_tensor_op_test.py +++ /dev/null @@ -1,63 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np - - -class TestClipTensorByScalingOp(serial.SerializedTestCase): - - @given(n=st.integers(5, 8), d=st.integers(2, 4), - threshold=st.floats(0.1, 10), - additional_threshold=st.floats(0.1, 10), - use_additional_threshold=st.booleans(), - inplace=st.booleans(), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_clip_tensor_by_scaling(self, n, d, threshold, 
additional_threshold, - use_additional_threshold, inplace, gc, dc): - - tensor = np.random.rand(n, d).astype(np.float32) - val = np.array(np.linalg.norm(tensor)) - additional_threshold = np.array([additional_threshold]).astype(np.float32) - - def clip_tensor_by_scaling_ref(tensor_data, val_data, - additional_threshold=None): - - if additional_threshold is not None: - final_threshold = threshold * additional_threshold - else: - final_threshold = threshold - - if val_data > final_threshold: - ratio = final_threshold / float(val_data) - tensor_data = tensor_data * ratio - - return [tensor_data] - - op = core.CreateOperator( - "ClipTensorByScaling", - ["tensor", "val"] if not use_additional_threshold else ( - ["tensor", "val", "additional_threshold"]), - ['Y'] if not inplace else ["tensor"], - threshold=threshold, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[tensor, val] if not use_additional_threshold else ( - [tensor, val, additional_threshold]), - reference=clip_tensor_by_scaling_ref, - ) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py b/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py deleted file mode 100644 index 28e6cd3b3df6..000000000000 --- a/caffe2/python/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.py +++ /dev/null @@ -1,318 +0,0 @@ - - - - - -import numpy as np -import unittest - -from hypothesis import given, settings -import hypothesis.strategies as st - -from caffe2.python import core, utils -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -# -# Should match original Detectron code at -# https://github.com/facebookresearch/Detectron/blob/master/lib/ops/collect_and_distribute_fpn_rpn_proposals.py -# - -def boxes_area(boxes): - """Compute the area of an array of boxes.""" - w = (boxes[:, 2] - boxes[:, 0] + 1) - h = (boxes[:, 3] - boxes[:, 1] + 1) - areas = w * h - assert np.all(areas >= 0), 'Negative areas founds' - return areas - - -def map_rois_to_fpn_levels( - rois, - k_min, k_max, - roi_canonical_scale, roi_canonical_level -): - """Determine which FPN level each RoI in a set of RoIs should map to based - on the heuristic in the FPN paper. - """ - # Compute level ids - s = np.sqrt(boxes_area(rois)) - - # Eqn.(1) in FPN paper - target_lvls = np.floor( - roi_canonical_level + - np.log2(s / roi_canonical_scale + 1e-6)) - target_lvls = np.clip(target_lvls, k_min, k_max) - return target_lvls - - -def collect(inputs, **args): - post_nms_topN = args['rpn_post_nms_topN'] - num_lvls = args['rpn_num_levels'] - roi_inputs = inputs[:num_lvls] - score_inputs = inputs[num_lvls:] - - # rois are in [[batch_idx, x0, y0, x1, y2], ...] 
format - # Combine predictions across all levels and retain the top scoring - # - # equivalent to Detectron code - # rois = np.concatenate([blob.data for blob in roi_inputs]) - # scores = np.concatenate([blob.data for blob in score_inputs]).squeeze() - rois = np.concatenate(roi_inputs) - scores = np.concatenate(score_inputs).squeeze() - assert rois.shape[0] == scores.shape[0] - inds = np.argsort(-scores, kind='mergesort')[:post_nms_topN] - rois = rois[inds, :] - return rois - - -def distribute(rois, _, outputs, **args): - """To understand the output blob order see return value of - roi_data.fast_rcnn.get_fast_rcnn_blob_names(is_training=False) - """ - # equivalent to Detectron code - # lvl_min = cfg.FPN.ROI_MIN_LEVEL - # lvl_max = cfg.FPN.ROI_MAX_LEVEL - lvl_min = args['roi_min_level'] - lvl_max = lvl_min + args['roi_num_levels'] - 1 - lvls = map_rois_to_fpn_levels( - rois[:, 1:5], - lvl_min, lvl_max, - args['roi_canonical_scale'], - args['roi_canonical_level']) - - # equivalent to Detectron code - # outputs[0].reshape(rois.shape) - # outputs[0].data[...] = rois - outputs[0] = rois - - # Create new roi blobs for each FPN level - # (See: modeling.FPN.add_multilevel_roi_blobs which is similar but annoying - # to generalize to support this particular case.) - rois_idx_order = np.empty((0, )) - for output_idx, lvl in enumerate(range(lvl_min, lvl_max + 1)): - idx_lvl = np.where(lvls == lvl)[0] - blob_roi_level = rois[idx_lvl, :] - # equivalent to Detectron code - # outputs[output_idx + 1].reshape(blob_roi_level.shape) - # outputs[output_idx + 1].data[...] = blob_roi_level - outputs[output_idx + 1] = blob_roi_level - rois_idx_order = np.concatenate((rois_idx_order, idx_lvl)) - rois_idx_restore = np.argsort(rois_idx_order, kind='mergesort') - # equivalent to Detectron code - # py_op_copy_blob( - # rois_idx_restore.astype(np.int32), outputs[-1]) - outputs[-1] = rois_idx_restore.astype(np.int32) - - -def collect_and_distribute_fpn_rpn_ref(*inputs): - assert inputs - args = inputs[-1] - inputs = inputs[:-1] - - num_rpn_lvls = args['rpn_num_levels'] - assert len(inputs) == 2 * num_rpn_lvls - N = inputs[0].shape[0] - for i in range(num_rpn_lvls): - assert len(inputs[i].shape) == 2 - assert inputs[i].shape[0] == N - assert inputs[i].shape[1] == 5 - for i in range(num_rpn_lvls, 2 * num_rpn_lvls): - assert len(inputs[i].shape) == 1 - assert inputs[i].shape[0] == N - - num_roi_lvls = args['roi_num_levels'] - outputs = (num_roi_lvls + 2) * [None] - rois = collect(inputs, **args) - distribute(rois, None, outputs, **args) - - return outputs - - -def collect_rpn_ref(*inputs): - args = inputs[-1] - inputs = inputs[:-1] - rois = collect(inputs, **args) - return [rois] - - -def distribute_fpn_ref(*inputs): - args = inputs[-1] - inputs = inputs[:-1] - rois = inputs[0] - num_roi_lvls = args['roi_num_levels'] - outputs = (num_roi_lvls + 2) * [None] - distribute(rois, None, outputs, **args) - # remove the first rois from output of distribute - outputs.pop(0) - return outputs - - -class TestCollectAndDistributeFpnRpnProposals(serial.SerializedTestCase): - @staticmethod - def _create_input(proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale): - np.random.seed(0) - - input_names = [] - inputs = [] - - for lvl in range(rpn_num_levels): - rpn_roi = ( - roi_canonical_scale * - np.random.rand(proposal_count, 5).astype(np.float32) - ) - for i in range(proposal_count): - # Make RoIs have positive area, since they - # are in the format [[batch_idx, x0, y0, x1, y2], ...] 
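# Eqn.(1) from the FPN paper, as coded in map_rois_to_fpn_levels above: a RoI
# whose side is s lands on floor(k0 + log2(s / s0 + eps)), clipped to
# [k_min, k_max]. Quick check with the paper's defaults (s0=224, k0=4), which
# are assumptions here, not values taken from this test:
import numpy as np
s = np.sqrt(np.array([112.0 ** 2, 224.0 ** 2, 448.0 ** 2]))   # box side lengths
lvls = np.clip(np.floor(4 + np.log2(s / 224.0 + 1e-6)), 2, 5)
assert lvls.tolist() == [3.0, 4.0, 5.0]   # halving/doubling the size moves one level
# (the two in-place updates below keep each sampled RoI's width and height positive)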
- rpn_roi[i][3] += rpn_roi[i][1] - rpn_roi[i][4] += rpn_roi[i][2] - input_names.append('rpn_rois_fpn{}'.format(lvl + rpn_min_level)) - inputs.append(rpn_roi) - for lvl in range(rpn_num_levels): - rpn_roi_score = np.random.rand(proposal_count).astype(np.float32) - input_names.append('rpn_roi_probs_fpn{}'.format(lvl + rpn_min_level)) - inputs.append(rpn_roi_score) - - return input_names, inputs - - @given(proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_collect_and_dist( - self, - proposal_count, - rpn_min_level, rpn_num_levels, - roi_min_level, roi_num_levels, - rpn_post_nms_topN, - roi_canonical_scale, roi_canonical_level, - gc, dc - ): - input_names, inputs = self._create_input( - proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale - ) - - output_names = [ - 'rois', - ] - for lvl in range(roi_num_levels): - output_names.append('rois_fpn{}'.format(lvl + roi_min_level)) - output_names.append('rois_idx_restore') - - op = core.CreateOperator( - 'CollectAndDistributeFpnRpnProposals', - input_names, - output_names, - arg=[ - utils.MakeArgument("roi_canonical_scale", roi_canonical_scale), - utils.MakeArgument("roi_canonical_level", roi_canonical_level), - utils.MakeArgument("roi_max_level", roi_min_level + roi_num_levels - 1), - utils.MakeArgument("roi_min_level", roi_min_level), - utils.MakeArgument("rpn_max_level", rpn_min_level + rpn_num_levels - 1), - utils.MakeArgument("rpn_min_level", rpn_min_level), - utils.MakeArgument("rpn_post_nms_topN", rpn_post_nms_topN), - ], - device_option=gc) - args = { - 'rpn_min_level' : rpn_min_level, - 'rpn_num_levels' : rpn_num_levels, - 'roi_min_level' : roi_min_level, - 'roi_num_levels' : roi_num_levels, - 'rpn_post_nms_topN' : rpn_post_nms_topN, - 'roi_canonical_scale' : roi_canonical_scale, - 'roi_canonical_level' : roi_canonical_level} - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs + [args], - reference=collect_and_distribute_fpn_rpn_ref, - ) - - @given( - proposal_count=st.integers(min_value=1000, max_value=8000), - rpn_min_level=st.integers(min_value=1, max_value=4), - rpn_num_levels=st.integers(min_value=1, max_value=6), - roi_min_level=st.integers(min_value=1, max_value=4), - roi_num_levels=st.integers(min_value=1, max_value=6), - rpn_post_nms_topN=st.integers(min_value=1000, max_value=4000), - roi_canonical_scale=st.integers(min_value=100, max_value=300), - roi_canonical_level=st.integers(min_value=1, max_value=8), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_collect_and_dist_separately( - self, - proposal_count, - rpn_min_level, rpn_num_levels, - roi_min_level, roi_num_levels, - rpn_post_nms_topN, - roi_canonical_scale, roi_canonical_level, - gc, dc - ): - input_names, inputs = self._create_input( - proposal_count, rpn_min_level, rpn_num_levels, roi_canonical_scale - ) - - collect_op = core.CreateOperator( - 'CollectRpnProposals', - input_names, - ['rois'], - arg=[ - utils.MakeArgument("rpn_max_level", rpn_min_level + rpn_num_levels - 1), - utils.MakeArgument("rpn_min_level", rpn_min_level), - 
utils.MakeArgument("rpn_post_nms_topN", rpn_post_nms_topN), - ], - device_option=gc) - collect_args = { - 'rpn_min_level' : rpn_min_level, - 'rpn_num_levels' : rpn_num_levels, - 'rpn_post_nms_topN' : rpn_post_nms_topN, - } - - self.assertReferenceChecks( - device_option=gc, - op=collect_op, - inputs=inputs + [collect_args], - reference=collect_rpn_ref, - ) - - rois = collect(inputs, **collect_args) - - output_names = [] - for lvl in range(roi_num_levels): - output_names.append('rois_fpn{}'.format(lvl + roi_min_level)) - output_names.append('rois_idx_restore') - - distribute_op = core.CreateOperator( - 'DistributeFpnProposals', - ['rois'], - output_names, - arg=[ - utils.MakeArgument("roi_canonical_scale", roi_canonical_scale), - utils.MakeArgument("roi_canonical_level", roi_canonical_level), - utils.MakeArgument("roi_max_level", roi_min_level + roi_num_levels - 1), - utils.MakeArgument("roi_min_level", roi_min_level), - ], - device_option=gc) - distribute_args = { - 'roi_min_level' : roi_min_level, - 'roi_num_levels' : roi_num_levels, - 'roi_canonical_scale' : roi_canonical_scale, - 'roi_canonical_level' : roi_canonical_level} - - self.assertReferenceChecks( - device_option=gc, - op=distribute_op, - inputs=[rois, distribute_args], - reference=distribute_fpn_ref, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/concat_op_cost_test.py b/caffe2/python/operator_test/concat_op_cost_test.py deleted file mode 100644 index 065a22569992..000000000000 --- a/caffe2/python/operator_test/concat_op_cost_test.py +++ /dev/null @@ -1,84 +0,0 @@ -from collections import namedtuple - -import numpy as np -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - - -class TestConcatOpCost(TestCase): - def test_columnwise_concat(self): - def _test_columnwise_concat_for_type(dtype): - workspace.ResetWorkspace() - workspace.FeedBlob("input_1", np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)) - workspace.FeedBlob("input_2", np.array([[7], [8]], dtype=dtype)) - concat_op = core.CreateOperator( - "Concat", - ["input_1", "input_2"], - ["output", "split_info"], - ) - workspace.RunOperatorOnce(concat_op) - - output = workspace.FetchBlob("output") - self.assertTupleEqual(output.shape, (2, 4)) - np.testing.assert_array_equal(output, [[1, 2, 3, 7], [4, 5, 6, 8]]) - - flops, bytes_written, bytes_read = workspace.GetOperatorCost( - concat_op, concat_op.input - ) - - self.assertEqual(flops, 0) - self.assertEqual( - bytes_read, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.input), - ) - self.assertEqual( - bytes_written, - sum(workspace.FetchBlob(b).nbytes for b in concat_op.output), - ) - - [ - _test_columnwise_concat_for_type(t) - for t in [np.int64, np.float64, np.half, np.int8] - ] - - def test_split_then_concat(self): - workspace.ResetWorkspace() - workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) - workspace.FeedBlob("split", np.array([1, 1, 1], dtype=np.int32)) - split_op = core.CreateOperator( - "Split", - ["input", "split"], - ["output_1", "output_2", "output_3"], - axis=1, - add_axis=1, - ) - workspace.RunOperatorOnce(split_op) - - concat_op = core.CreateOperator( - "Concat", - ["output_1", "output_2", "output_3"], - ["output", "split_info"], - axis=1, - add_axis=1, - ) - workspace.RunOperatorOnce(concat_op) - - np.testing.assert_array_equal( - workspace.FetchBlob("input"), workspace.FetchBlob("output") - ) - - split_cost = workspace.GetOperatorCost(split_op, split_op.input) - self.assertTupleEqual( - 
split_cost, - namedtuple("expected_cost", ["flops", "bytes_written", "bytes_read"])( - 0, 24, 36 - ), - ) - - concat_cost = workspace.GetOperatorCost(concat_op, concat_op.input) - self.assertTupleEqual( - concat_cost, - namedtuple("expected_cost", ["flops", "bytes_written", "bytes_read"])( - 0, 36, 24 - ), - ) diff --git a/caffe2/python/operator_test/concat_split_op_test.py b/caffe2/python/operator_test/concat_split_op_test.py deleted file mode 100644 index ac83681f08bf..000000000000 --- a/caffe2/python/operator_test/concat_split_op_test.py +++ /dev/null @@ -1,211 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -@st.composite -def _tensor_splits(draw, add_axis=False): - """Generates (axis, split_info, tensor_splits) tuples.""" - tensor = draw(hu.tensor(min_value=4)) # Each dim has at least 4 elements. - axis = draw(st.integers(-len(tensor.shape), len(tensor.shape) - 1)) - if add_axis: - # Simple case: get individual slices along one axis, where each of them - # is (N-1)-dimensional. The axis will be added back upon concatenation. - return ( - axis, - np.ones(tensor.shape[axis], dtype=np.int32), - [ - np.array(tensor.take(i, axis=axis)) - for i in range(tensor.shape[axis]) - ] - ) - else: - # General case: pick some (possibly consecutive, even non-unique) - # indices at which we will split the tensor, along the given axis. - splits = sorted(draw( - st.lists(elements=st.integers(0, tensor.shape[axis]), max_size=4) - ) + [0, tensor.shape[axis]]) - return ( - axis, - np.array(np.diff(splits), dtype=np.int32), - [ - tensor.take(range(splits[i], splits[i + 1]), axis=axis) - for i in range(len(splits) - 1) - ], - ) - - -class TestConcatSplitOps(serial.SerializedTestCase): - @serial.given(tensor_splits=_tensor_splits(), - **hu.gcs) - def test_concat(self, tensor_splits, gc, dc): - axis, _, splits = tensor_splits - - op = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result', 'split_info'], - axis=axis - ) - - self.assertReferenceChecks( - gc, op, splits, lambda *splits: ( - np.concatenate(splits, axis=axis), - np.array([a.shape[axis] for a in splits]) - ), - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, splits, [0, 1]) - self.assertGradientChecks( - gc, op, splits, 0, [0], - ensure_outputs_are_inferred=True, - ) - - @given(tensor_splits=_tensor_splits(add_axis=True), - **hu.gcs) - @settings(deadline=10000) - def test_concat_add_axis(self, tensor_splits, gc, dc): - axis, _, splits = tensor_splits - - op = core.CreateOperator( - "Concat", - ['X_{}'.format(i) for i in range(len(splits))], - ['concat_result', 'split_info'], - axis=axis, - add_axis=1 - ) - - self.assertReferenceChecks( - gc, op, splits, lambda *splits: ( - np.concatenate( - [np.expand_dims(a, axis) for a in splits], - axis=axis - ), - np.array([1] * len(splits)) - ), - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, splits, [0, 1]) - for i in range(len(splits)): - self.assertGradientChecks( - gc, op, splits, i, [0], - ensure_outputs_are_inferred=True, - ) - - @serial.given(tensor_splits=_tensor_splits(), - split_as_arg=st.booleans(), - **hu.gcs) - def test_split(self, tensor_splits, split_as_arg, gc, dc): - axis, split_info, splits = tensor_splits - - split_as_arg = True - - if split_as_arg: - input_names = 
['input'] - input_tensors = [np.concatenate(splits, axis=axis)] - kwargs = dict(axis=axis, split=split_info) - else: - input_names = ['input', 'split'] - input_tensors = [np.concatenate(splits, axis=axis), split_info] - kwargs = dict(axis=axis) - - op = core.CreateOperator( - "Split", - input_names, - ['X_{}'.format(i) for i in range(len(split_info))], - **kwargs - ) - - def split_ref(input, split=split_info): - s = np.cumsum([0] + list(split)) - return [ - np.array(input.take(np.arange(s[i], s[i + 1]), axis=axis)) - for i in range(len(split)) - ] - outputs_with_grad = range(len(split_info)) - self.assertReferenceChecks( - gc, op, input_tensors, split_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) - self.assertGradientChecks( - gc, op, input_tensors, 0, outputs_with_grad, - ensure_outputs_are_inferred=True, - ) - - @given( - inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=11, - allow_empty=True, - ), - split_by_scaling_lengths=st.booleans(), - **hu.gcs - ) - @settings(deadline=10000) - def test_split_by_lengths(self, inputs, split_by_scaling_lengths, gc, dc): - data, lengths = inputs - len_len = len(lengths) - - def _find_factor_simple(x): - for i in [2, 3, 5, 7, 9, 11]: - if x % i == 0: - return i - return x - - num_output = _find_factor_simple(len_len) - scaling_factor = 1 - - if split_by_scaling_lengths: - sum_len = sum(lengths) - sum_scaling_lengths = _find_factor_simple(sum_len) - if sum_scaling_lengths != sum_len and sum_scaling_lengths >= num_output: - scaling_lengths = [1] * (num_output - 1) + [sum_scaling_lengths - num_output + 1] - len_len = len(scaling_lengths) - lengths = np.array(scaling_lengths, dtype=np.int32) - scaling_factor = (sum_len // sum_scaling_lengths) if sum_scaling_lengths else 1 - - axis = 0 - op = core.CreateOperator( - "SplitByLengths", - ["data", "lengths"], - ['X_{}'.format(i) for i in range(num_output)], - axis=axis, - use_scaling_lengths=split_by_scaling_lengths, - ) - - def split_by_lengths_ref(data, lengths, num_output=num_output, axis=0): - idxs = np.cumsum([0] + list(lengths)).astype(np.int32) - return [ - np.array( - data.take( - np.arange( - scaling_factor * idxs[i * len_len // num_output], - scaling_factor * idxs[(i + 1) * len_len // num_output] - ), - axis=axis - ) - ) for i in range(num_output) - ] - outputs_with_grad = range(num_output) - input_tensors = [data, lengths] - self.assertReferenceChecks( - hu.cpu_do, op, input_tensors, split_by_lengths_ref) - self.assertDeviceChecks(dc, op, input_tensors, outputs_with_grad) - self.assertGradientChecks( - hu.cpu_do, op, input_tensors, 0, outputs_with_grad, - input_device_options={"lengths": hu.cpu_do}) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/conditional_test.py b/caffe2/python/operator_test/conditional_test.py deleted file mode 100644 index 2e214f089a45..000000000000 --- a/caffe2/python/operator_test/conditional_test.py +++ /dev/null @@ -1,29 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestConditionalOp(serial.SerializedTestCase): - @serial.given(rows_num=st.integers(1, 10000), **hu.gcs_cpu_only) - def test_conditional(self, rows_num, gc, dc): - op = core.CreateOperator( - "Conditional", ["condition", "data_t", "data_f"], "output" - ) - data_t = np.random.random((rows_num, 10, 
20)).astype(np.float32) - data_f = np.random.random((rows_num, 10, 20)).astype(np.float32) - condition = np.random.choice(a=[True, False], size=rows_num) - - def ref(condition, data_t, data_f): - output = [ - data_t[i] if condition[i] else data_f[i] - for i in range(rows_num) - ] - return (output,) - - self.assertReferenceChecks(gc, op, [condition, data_t, data_f], ref) diff --git a/caffe2/python/operator_test/conftest.py b/caffe2/python/operator_test/conftest.py deleted file mode 100644 index a240e98fc51e..000000000000 --- a/caffe2/python/operator_test/conftest.py +++ /dev/null @@ -1,48 +0,0 @@ - - - - - -import caffe2.python.serialized_test.serialized_test_util as serial - - -def pytest_addoption(parser): - parser.addoption( - '-G', - '--generate-serialized', - action='store_true', - dest='generate', - help='generate output files (default=false, compares to current files)', - ) - parser.addoption( - '-O', - '--output', - default=serial.DATA_DIR, - dest='output', - help='output directory (default: %(default)s)' - ) - parser.addoption( - '-D', - '--disable-serialized-check', - action='store_true', - dest='disable', - help='disable checking serialized tests' - ) - parser.addoption( - '-C', - '--disable-gen-coverage', - action='store_true', - dest='disable_coverage', - help='disable generating coverage markdown file' - ) - - -def pytest_configure(config): - generate = config.getoption('generate', default=False) - output = config.getoption('output', default=serial.DATA_DIR) - disable = config.getoption('disable', default=False) - disable_coverage = config.getoption('disable_coverage', default=False) - serial._output_context.__setattr__('should_generate_output', generate) - serial._output_context.__setattr__('output_dir', output) - serial._output_context.__setattr__('disable_serialized_check', disable) - serial._output_context.__setattr__('disable_gen_coverage', disable_coverage) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py deleted file mode 100644 index 23217b15b82d..000000000000 --- a/caffe2/python/operator_test/conv_test.py +++ /dev/null @@ -1,1009 +0,0 @@ - - -import collections -import functools -import unittest - -import caffe2.python._import_c_extension as C -import caffe2.python.hip_test_util as hiputl -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import brew, core, utils, workspace -from caffe2.python.model_helper import ModelHelper -from hypothesis import assume, given, settings - - -def _cudnn_supports(dilation=False, nhwc=False, backward=False): - """Return True if cuDNN supports this configuration.""" - v = workspace.GetCuDNNVersion() - if backward: - if nhwc: - # nhwc isn't supported in backward ops. - return False - else: - # Forward mode. 
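# In forward mode only two restrictions are checked below: dilation
# requires cuDNN >= 6 (GetCuDNNVersion() encodes 6.0 as 6000), and
# dilation cannot be combined with NHWC. Callers gate the CUDNN engine
# on this helper; test_convolution_layout further down, for instance,
# only appends "CUDNN" to its engine list when
# _cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC")) holds.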
- if dilation and v < 6000: - # Dilation not supported until v6 - return False - if dilation and nhwc: - # Dilation and NHWC not supported together - return False - return True - - -def _cudnn_convolution_algo_count(direction): - try: - if direction == "fwd": - return st.integers(0, C.cudnn_convolution_fwd_algo_count - 1) - elif direction == "dgrad": - return st.integers(0, C.cudnn_convolution_bwd_data_algo_count - 1) - elif direction == "wgrad": - return st.integers(0, C.cudnn_convolution_bwd_filter_algo_count - 1) - else: - assert False - except Exception: - return st.sampled_from([-1]) - - -class TestConvolution(serial.SerializedTestCase): - # CUDNN does NOT support different padding values and we skip it - @given( - op_type=st.sampled_from(["Conv", "Conv2D"]), - stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(1, 8), - input_channels=st.integers(1, 3), - output_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - group=st.integers(1, 2), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "EIGEN"]), - shared_buffer=st.booleans(), - use_bias=st.booleans(), - **hu.gcs - ) - @settings(deadline=None, max_examples=50) - def test_convolution_separate_stride_pad_gradients( - self, - op_type, - stride_h, - stride_w, - pad_t, - pad_l, - pad_b, - pad_r, - kernel, - size, - input_channels, - output_channels, - batch_size, - group, - order, - engine, - shared_buffer, - use_bias, - gc, - dc, - ): - # TODO: Group conv in NHWC not implemented for GPU yet. - assume(group == 1 or order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - if group != 1 and order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - # Group conv not implemented with EIGEN engine. - assume(group == 1 or engine != "EIGEN") - - input_channels *= group - output_channels *= group - - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - kernel=kernel, - group=group, - order=order, - engine=engine, - shared_buffer=int(shared_buffer), - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - w = ( - np.random.rand( - output_channels, kernel, kernel, int(input_channels / group) - ).astype(np.float32) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - - # Error handling path. 
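# A convolution has no valid output position when the padded extent is
# smaller than the kernel (e.g. size=3, pad_l=pad_r=0, kernel=5 leaves a
# padded width of 3 < 5), so the op is expected to raise rather than
# produce a Y blob, and the gradient checks are skipped.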
- if size + pad_r + pad_l < kernel or size + pad_t + pad_b < kernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - # CUDNN does NOT support different padding values and we skip it - @given( - op_type=st.sampled_from(["Conv", "Conv2D"]), - stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - engine=st.sampled_from(["", "EIGEN"]), - use_bias=st.booleans(), - **hu.gcs - ) - @settings(deadline=None) - def test_convolution_separate_stride_pad_layout( - self, - op_type, - stride_h, - stride_w, - pad_t, - pad_l, - pad_b, - pad_r, - kernel, - size, - input_channels, - output_channels, - batch_size, - engine, - use_bias, - gc, - dc, - ): - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - w = ( - np.random.rand(output_channels, kernel, kernel, input_channels).astype( - np.float32 - ) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - outputs = {} - for order in ["NCHW", "NHWC"]: - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - kernel=kernel, - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - order=order, - engine=engine, - device_option=gc, - ) - if order == "NCHW": - X_f = utils.NHWC2NCHW(X) - w_f = utils.NHWC2NCHW(w) - else: - X_f = X - w_f = w - self.ws.create_blob("X").feed(X_f, device_option=gc) - self.ws.create_blob("w").feed(w_f, device_option=gc) - self.ws.create_blob("b").feed(b, device_option=gc) - self.ws.run(op) - outputs[order] = self.ws.blobs["Y"].fetch() - np.testing.assert_allclose( - outputs["NCHW"], utils.NHWC2NCHW(outputs["NHWC"]), atol=1e-4, rtol=1e-4 - ) - - @given( - op_type=st.sampled_from(["Conv", "Conv2D"]), - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - group=st.integers(1, 2), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), - use_bias=st.booleans(), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs - ) - @settings(max_examples=20, deadline=None) - def test_convolution_gradients( - self, - op_type, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - group, - order, - engine, - use_bias, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - # TODO: Group conv in NHWC not implemented for GPU yet. 
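# hypothesis.assume() silently discards the drawn example instead of
# failing it, so unsupported configurations are skipped, not reported.
# Mind the precedence in the condition below: `a or b and c` parses as
# `a or (b and c)`, i.e. grouped conv is only kept on NCHW or CPU paths
# and never together with the MKLDNN engine.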
- assume( - group == 1 - or (order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - and engine != "MKLDNN" - ) - if group != 1 and order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - input_channels *= group - output_channels *= group - dkernel = dilation * (kernel - 1) + 1 - - if engine == "CUDNN": - if hiputl.run_in_hip(gc, dc): - assume((order == "NCHW") and not (dilation > 1 and group > 1)) - else: - assume( - _cudnn_supports( - dilation=(dilation > 1), nhwc=(order == "NHWC"), backward=True - ) - ) - - assume(engine != "MKLDNN" or use_bias is True) - - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - group=group, - order=order, - engine=engine, - force_algo_fwd=force_algo_fwd, - force_algo_dgrad=force_algo_dgrad, - force_algo_wgrad=force_algo_wgrad, - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - w = ( - np.random.rand( - output_channels, kernel, kernel, int(input_channels / group) - ).astype(np.float32) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - # Error handling path. - if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - try: - self.assertDeviceChecks(dc, op, inputs, [0]) - except RuntimeError as e: - es = str(e) - # CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM should always have - # implementation - if ( - "status == CUDNN_STATUS_SUCCESS" not in es - or "CUDNN_STATUS_NOT_SUPPORTED" not in es - or force_algo_fwd == 0 - ): - raise e - - for i in range(len(inputs)): - try: - self.assertGradientChecks(gc, op, inputs, i, [0]) - except RuntimeError as e: - es = str(e) - if ( - "status == CUDNN_STATUS_SUCCESS" not in es - or "CUDNN_STATUS_NOT_SUPPORTED" not in es - ): - raise e - - def _nd_convolution( - self, - n, - input_channels_per_group, - output_channels_per_group, - batch_size, - stride, - size, - kernel, - dilation, - pad, - group, - order, - use_bias, - engine, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - # TODO: Group conv in NHWC not implemented for GPU yet. - # TODO: Group 1D conv in NCHW not implemented for GPU yet. 
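# The assume() below keeps a grouped example only where grouping is
# implemented: either n != 1 with NCHW layout, or a CPU device; all
# other grouped combinations are discarded.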
- assume( - group == 1 - or (n != 1 and order == "NCHW") - or gc.device_type == caffe2_pb2.CPU - ) - if group != 1 and (n == 1 or order == "NHWC"): - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - input_channels = group * input_channels_per_group - output_channels = group * output_channels_per_group - - dkernel = dilation * (kernel - 1) + 1 - for op_type in ["Conv", "Conv" + str(n) + "D"]: - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - strides=[stride] * n, - kernels=[kernel] * n, - dilations=[dilation] * n, - pads=[pad] * n * 2, - group=group, - order=order, - engine=engine, - force_algo_fwd=force_algo_fwd, - force_algo_dgrad=force_algo_dgrad, - force_algo_wgrad=force_algo_wgrad, - ) - - input_dims = [batch_size, input_channels] - input_dims.extend([size] * n) - filter_dims = [output_channels, input_channels // group] - filter_dims.extend([kernel] * n) - - X = np.random.rand(*input_dims).astype(np.float32) - 0.5 - w = np.random.rand(*filter_dims).astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NHWC": - X = utils.NCHW2NHWC(X) - w = utils.NCHW2NHWC(w) - - inputs = [X, w, b] if use_bias else [X, w] - - if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - @given( - input_channels=st.integers(1, 3), - output_channels=st.integers(1, 2), - batch_size=st.integers(0, 3), - stride=st.integers(1, 3), - size=st.integers(7, 10), - kernel=st.integers(1, 2), - dilation=st.integers(1, 3), - pad=st.integers(0, 3), - group=st.integers(1, 2), - order=st.sampled_from(["NCHW", "NHWC"]), - use_bias=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs - ) - @settings(deadline=10000) - def test_1d_convolution( - self, - input_channels, - output_channels, - batch_size, - stride, - size, - kernel, - dilation, - pad, - group, - order, - use_bias, - engine, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - if hiputl.run_in_hip(gc, dc): - # currently miopen only supports 2d conv - assume(engine != "CUDNN") # CUDNN is aliased to MIOPEN for HIP - # TODO: 1D conv in NHWC not implemented for GPU yet. 
- assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - self._nd_convolution( - 1, - input_channels, - output_channels, - batch_size, - stride, - size, - kernel, - dilation, - pad, - group, - order, - use_bias, - engine, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ) - - @given( - input_channels=st.integers(1, 2), - output_channels=st.integers(1, 2), - batch_size=st.integers(0, 2), - stride=st.integers(1, 2), - size=st.integers(4, 5), - kernel=st.integers(1, 2), - dilation=st.integers(1, 2), - pad=st.integers(0, 2), - group=st.integers(1, 2), - order=st.sampled_from(["NCHW", "NHWC"]), - use_bias=st.booleans(), - engine=st.sampled_from(["", "MIOPEN"]), # TODO: add "CUDNN" - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs - ) - @settings(max_examples=20, deadline=None) - def test_3d_convolution( - self, - input_channels, - output_channels, - batch_size, - stride, - size, - kernel, - dilation, - pad, - group, - order, - use_bias, - engine, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - # TODO: 3D conv in NHWC not implemented for GPU yet. - assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - self._nd_convolution( - 3, - input_channels, - output_channels, - batch_size, - stride, - size, - kernel, - dilation, - pad, - group, - order, - use_bias, - engine, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ) - - @given( - op_type=st.sampled_from(["Conv", "Conv3D"]), - batch_size=st.integers(0, 2), - stride=st.integers(1, 2), - size=st.integers(3, 5), - kernel=st.integers(1, 2), - dilation=st.integers(1, 2), - pad=st.integers(0, 2), - use_bias=st.booleans(), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs_no_hip - ) # MIOPEN doesn't support 3D conv yet - @settings(deadline=10000) - def test_3d_convolution_cudnn_nchw( - self, - op_type, - batch_size, - stride, - size, - kernel, - dilation, - pad, - use_bias, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - input_channels = 1 - output_channels = 1 - n = 3 - dkernel = dilation * (kernel - 1) + 1 - order = "NCHW" - - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - strides=[stride] * n, - kernels=[kernel] * n, - dilations=[dilation] * n, - pads=[pad] * n * 2, - order=order, - engine="CUDNN", - force_algo_fwd=force_algo_fwd, - force_algo_dgrad=force_algo_dgrad, - force_algo_wgrad=force_algo_wgrad, - ) - - input_dims = [batch_size, input_channels] - input_dims.extend([size] * n) - filter_dims = [output_channels, input_channels] - filter_dims.extend([kernel] * n) - X = np.random.rand(*input_dims).astype(np.float32) - 0.5 - w = np.random.rand(*filter_dims).astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - - inputs = [X, w, b] if use_bias else [X, w] - - if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - try: - self.assertDeviceChecks(dc, op, inputs, [0]) - except RuntimeError as e: - es = str(e) - # 
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM should always have - # implementation - if ( - "status == CUDNN_STATUS_SUCCESS" not in es - or "CUDNN_STATUS_NOT_SUPPORTED" not in es - or force_algo_fwd == 0 - ): - raise e - - for i in range(len(inputs)): - try: - self.assertGradientChecks(gc, op, inputs, i, [0]) - except RuntimeError as e: - es = str(e) - if ( - "status == CUDNN_STATUS_SUCCESS" not in es - or "CUDNN_STATUS_NOT_SUPPORTED" not in es - ): - raise e - - @given( - op_type=st.sampled_from(["Conv", "Conv2D"]), - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - use_bias=st.booleans(), - **hu.gcs - ) - @settings(deadline=None, max_examples=50) - def test_convolution_layout( - self, - op_type, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - use_bias, - gc, - dc, - ): - assume(size >= dilation * (kernel - 1) + 1) - - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - w = ( - np.random.rand(output_channels, kernel, kernel, input_channels).astype( - np.float32 - ) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - Output = collections.namedtuple("Output", ["Y", "engine", "order"]) - outputs = [] - - for order in ["NCHW", "NHWC"]: - engine_list = [""] - if hiputl.run_in_hip(gc, dc): - if order == "NCHW": - engine_list.append("MIOPEN") - else: - if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC")): - engine_list.append("CUDNN") - - for engine in engine_list: - op = core.CreateOperator( - op_type, - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - device_option=gc, - exhaustive_search=True, - ) - if order == "NCHW": - X_f = utils.NHWC2NCHW(X) - w_f = utils.NHWC2NCHW(w) - else: - X_f = X - w_f = w - self.assertDeviceChecks( - dc, op, [X_f, w_f, b] if use_bias else [X_f, w_f], [0] - ) - self.ws.create_blob("X").feed(X_f, device_option=gc) - self.ws.create_blob("w").feed(w_f, device_option=gc) - self.ws.create_blob("b").feed(b, device_option=gc) - self.ws.run(op) - outputs.append( - Output(Y=self.ws.blobs["Y"].fetch(), engine=engine, order=order) - ) - - def canonical(o): - if o.order == "NHWC": - return utils.NHWC2NCHW(o.Y) - else: - return o.Y - - for o in outputs: - np.testing.assert_allclose( - canonical(outputs[0]), canonical(o), atol=1e-4, rtol=1e-4 - ) - - @given( - num_workers=st.integers(1, 4), - net_type=st.sampled_from( - ["simple", "dag"] - + ( - ["async_dag"] - if workspace.has_gpu_support - else [] - ) - ), - engine=st.sampled_from(["CUDNN", ""]), - **hu.gcs_no_hip - ) - @settings(deadline=None) - def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): - m = ModelHelper(name="test_model") - n = 1 - d = 2 - depth = 3 - iters = 5 - h = 5 - w = 5 - workspace.ResetWorkspace() - - use_cudnn = engine == "CUDNN" - - np.random.seed(1701) - # Build a binary tree of conv layers, summing at each node. 
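# Nodes are named "{level}_{index}" with 2**i nodes at level i. For
# depth=3 the leaves 3_0 .. 3_7 are the external inputs fed in run(),
# every internal node i_j sums the convolutions of its two children
# (i+1)_(2j) and (i+1)_(2j+1), and the root 0_0 feeds the
# flatten/distance/loss head built right after the loop.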
- for i in reversed(range(depth)): - for j in range(2 ** i): - bottom_1 = "{}_{}".format(i + 1, 2 * j) - bottom_2 = "{}_{}".format(i + 1, 2 * j + 1) - mid_1 = "{}_{}_m".format(i + 1, 2 * j) - mid_2 = "{}_{}_m".format(i + 1, 2 * j + 1) - top = "{}_{}".format(i, j) - w1, b1, w2, b2 = np.random.randn(4).tolist() - brew.conv( - m, - bottom_1, - mid_1, - dim_in=d, - dim_out=d, - kernel=3, - weight_init=("ConstantFill", {"value": w1}), - bias_init=("ConstantFill", {"value": b1}), - cudnn_state=np.random.randint(0, 3), - stride=1, - pad=1, - deterministic=1, - use_cudnn=use_cudnn, - engine=engine, - ) - brew.conv( - m, - bottom_2, - mid_2, - dim_in=d, - dim_out=d, - kernel=3, - stride=1, - pad=1, - weight_init=("ConstantFill", {"value": w2}), - bias_init=("ConstantFill", {"value": b2}), - deterministic=1, - cudnn_state=np.random.randint(0, 3), - use_cudnn=use_cudnn, - engine=engine, - ) - m.net.Sum([mid_1, mid_2], top) - - m.net.Flatten(["0_0"], ["0_0_flat"]) - m.net.SquaredL2Distance(["0_0_flat", "label"], "xent") - m.net.AveragedLoss("xent", "loss") - input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(gc) - m.param_init_net.Proto().device_option.CopyFrom(gc) - m.Proto().type = net_type - m.Proto().num_workers = num_workers - self.ws.run(m.param_init_net) - - def run(): - import numpy as np - - np.random.seed(1701) - input_blobs = ["{}_{}".format(depth, j) for j in range(2 ** depth)] - for input_blob in input_blobs: - self.ws.create_blob(input_blob).feed( - np.random.randn(n, d, h, w).astype(np.float32), device_option=gc - ) - self.ws.create_blob("label").feed( - np.random.randn(n, d * h * w).astype(np.float32), device_option=gc - ) - self.ws.run(m.net) - gradients = [ - self.ws.blobs[str(input_to_grad[input_blob])].fetch() - for input_blob in input_blobs - ] - return gradients - - outputs = [run() for _ in range(iters)] - for output in outputs[1:]: - np.testing.assert_array_equal(outputs[0], output) - np.testing.assert_allclose( - np.sum(np.square(output)), 1763719461732352.0, rtol=1e-5 - ) - - def test_use_cudnn_engine_interactions(self): - """Make sure the use_cudnn and engine kwargs work as expected.""" - for model_default in [None, True, False]: - arg_scope = {} - if model_default is not None: - arg_scope["use_cudnn"] = model_default - else: - model_default = True # the default - - model = ModelHelper(arg_scope=arg_scope) - self.assertEqual(model.arg_scope["use_cudnn"], model_default) - f = functools.partial(brew.conv, model, "conv_in", "conv_out", 10, 10, 5) - - for op_cudnn in [None, True, False]: - for op_engine in [None, "", "CUDNN"]: - kwargs = {} - if op_cudnn is not None: - kwargs["use_cudnn"] = op_cudnn - else: - op_cudnn = False # the default - if op_engine is not None: - kwargs["engine"] = op_engine - - calculated_cudnn = kwargs.get("use_cudnn", model_default) - expected_engine = kwargs.get( - "engine", "CUDNN" if calculated_cudnn else "" - ) - - if (calculated_cudnn is False and op_engine == "CUDNN") or ( - calculated_cudnn is True and op_engine == "" - ): - with self.assertRaises(ValueError): - f(**kwargs) - else: - f(**kwargs) - self.assertEqual(model.Proto().op[-1].engine, expected_engine) - - @given( - op_type=st.sampled_from(["Conv", "Conv2D"]), - N=st.integers(0, 3), - G=st.integers(1, 3), - DX=st.integers(1, 3), - DY=st.integers(1, 3), - H=st.integers(1, 3), - W=st.integers(1, 3), - use_bias=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - force_algo_fwd=_cudnn_convolution_algo_count("fwd"), - 
force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), - force_algo_wgrad=_cudnn_convolution_algo_count("wgrad"), - **hu.gcs - ) - @settings(deadline=10000) - def test_1x1_conv( - self, - op_type, - N, - G, - DX, - DY, - H, - W, - use_bias, - order, - force_algo_fwd, - force_algo_dgrad, - force_algo_wgrad, - gc, - dc, - ): - if hiputl.run_in_hip(gc, dc): - assume(order == "NCHW") - if order == "NHWC": - G = 1 - - C = G * DX - M = G * DY - - op = core.CreateOperator( - op_type, - ["X", "filter", "bias"] if use_bias else ["X", "filter"], - ["Y"], - stride_h=1, - stride_w=1, - pad_t=0, - pad_l=0, - pad_b=0, - pad_r=0, - kernel=1, - order=order, - group=G, - force_algo_fwd=force_algo_fwd, - force_algo_dgrad=force_algo_dgrad, - force_algo_wgrad=force_algo_wgrad, - ) - - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) - filter = np.random.randn(M, DX, 1, 1).astype(np.float32) - else: - X = np.random.randn(N, H, W, C).astype(np.float32) - filter = np.random.randn(M, 1, 1, DX).astype(np.float32) - bias = np.random.randn(M).astype(np.float32) - inputs = [X, filter, bias] if use_bias else [X, filter] - - def conv_1x1_nchw_ref(X, filter, bias=None): - if N == 0: - Y = np.zeros(shape=(N, M, H, W), dtype=np.float32) - return [Y] - - X = X.reshape(N, G, DX, -1) - filter = filter.reshape(G, DY, DX) - Y = np.zeros(shape=(N, G, DY, H * W), dtype=np.float32) - for i in range(N): - for j in range(G): - Y[i, j, :, :] = np.dot(filter[j, :, :], X[i, j, :, :]) - Y = Y.reshape(N, M, H, W) - if bias is not None: - bias = bias.reshape(1, M, 1, 1) - Y = np.add(Y, bias) - return [Y] - - def conv_1x1_nhwc_ref(X, filter, bias=None): - if N == 0: - Y = np.zeros(shape=(N, H, W, M), dtype=np.float32) - return [Y] - - X = X.reshape(N, -1, G, DX) - filter = filter.reshape(G, DY, DX) - Y = np.zeros(shape=(N, H * W, G, DY), dtype=np.float32) - for i in range(N): - for j in range(G): - Y[i, :, j, :] = np.dot(X[i, :, j, :], filter[j, :, :].transpose()) - Y = Y.reshape(N, H, W, M) - if bias is not None: - bias = bias.reshape(1, 1, 1, M) - Y = np.add(Y, bias) - return [Y] - - if order == "NCHW": - conv_1x1_ref = conv_1x1_nchw_ref - else: - conv_1x1_ref = conv_1x1_nhwc_ref - self.assertReferenceChecks( - device_option=gc, op=op, inputs=inputs, reference=conv_1x1_ref - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/conv_transpose_test.py b/caffe2/python/operator_test/conv_transpose_test.py deleted file mode 100644 index 4fcb6361d0a6..000000000000 --- a/caffe2/python/operator_test/conv_transpose_test.py +++ /dev/null @@ -1,429 +0,0 @@ - - - - -import numpy as np -from hypothesis import assume, given, settings -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, utils -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.hip_test_util as hiputl - - -class TestConvolutionTranspose(hu.HypothesisTestCase): - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - adj=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - engine=st.sampled_from(["", "CUDNN", "BLOCK"]), - shared_buffer=st.booleans(), - use_bias=st.booleans(), - **hu.gcs) - def test_convolution_transpose_layout_legacy_args( - self, stride, pad, kernel, adj, - size, 
input_channels, - output_channels, batch_size, - engine, shared_buffer, use_bias, gc, dc): - assume(adj < stride) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, output_channels)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - outputs = {} - for order in ["NCHW", "NHWC"]: - # MIOPEN doesn't work with NHWC, fallback to use normal hip - if hiputl.run_in_hip(gc, dc) and order == "NHWC": - tmp_engine = "" - else: - tmp_engine = engine - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - adj=adj, - order=order, - engine=tmp_engine, - shared_buffer=int(shared_buffer), - device_option=gc, - ) - if order == "NCHW": - X_f = utils.NHWC2NCHW(X) - w_f = utils.NHWC2NCHW(w) - else: - X_f = X - w_f = w - self.assertDeviceChecks( - dc, - op, - [X_f, w_f, b] if use_bias else [X_f, w_f], - [0]) - self.ws.create_blob("X").feed(X_f, device_option=gc) - self.ws.create_blob("w").feed(w_f, device_option=gc) - self.ws.create_blob("b").feed(b, device_option=gc) - self.ws.run(op) - outputs[order] = self.ws.blobs["Y"].fetch() - output_size = (size - 1) * stride + kernel + adj - 2 * pad - self.assertEqual( - outputs["NCHW"].shape, - (batch_size, output_channels, output_size, output_size)) - np.testing.assert_allclose( - outputs["NCHW"], - utils.NHWC2NCHW(outputs["NHWC"]), - atol=1e-4, - rtol=1e-4) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - adj=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - engine=st.sampled_from(["", "CUDNN", "BLOCK"]), - shared_buffer=st.booleans(), - use_bias=st.booleans(), - **hu.gcs) - def test_convolution_transpose_layout( - self, stride, pad, kernel, adj, - size, input_channels, - output_channels, batch_size, - engine, shared_buffer, use_bias, gc, dc): - assume(adj < stride) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, output_channels)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - outputs = {} - for order in ["NCHW", "NHWC"]: - if hiputl.run_in_hip(gc, dc) and order == "NHWC": - # MIOPEN doesn't work with NHWC, fallback to use normal hip - tmp_engine = "" - else: - tmp_engine = engine - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - strides=[stride] * 2, - kernels=[kernel] * 2, - pads=[pad] * 4, - adjs=[adj] * 2, - order=order, - engine=tmp_engine, - shared_buffer=int(shared_buffer), - device_option=gc, - ) - if order == "NCHW": - X_f = utils.NHWC2NCHW(X) - w_f = utils.NHWC2NCHW(w) - else: - X_f = X - w_f = w - self.assertDeviceChecks( - dc, - op, - [X_f, w_f, b] if use_bias else [X_f, w_f], - [0]) - self.ws.create_blob("X").feed(X_f, device_option=gc) - self.ws.create_blob("w").feed(w_f, device_option=gc) - self.ws.create_blob("b").feed(b, device_option=gc) - self.ws.run(op) - outputs[order] = self.ws.blobs["Y"].fetch() - output_size = (size - 1) * stride + kernel + adj - 2 * pad - self.assertEqual( - outputs["NCHW"].shape, - (batch_size, output_channels, output_size, output_size)) - np.testing.assert_allclose( - outputs["NCHW"], - utils.NHWC2NCHW(outputs["NHWC"]), - atol=1e-4, - rtol=1e-4) - - # CUDNN 
does not support separate stride and pad so we skip it. - @given(stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(1, 5), - adj_h=st.integers(0, 2), - adj_w=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - engine=st.sampled_from(["", "BLOCK"]), - use_bias=st.booleans(), - **hu.gcs) - def test_convolution_transpose_separate_stride_pad_adj_layout( - self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel, - adj_h, adj_w, size, input_channels, output_channels, batch_size, - engine, use_bias, gc, dc): - assume(adj_h < stride_h) - assume(adj_w < stride_w) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, output_channels)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - outputs = {} - for order in ["NCHW", "NHWC"]: - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - kernel=kernel, - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - adj_h=adj_h, - adj_w=adj_w, - order=order, - engine=engine, - device_option=gc, - ) - if order == "NCHW": - X_f = utils.NHWC2NCHW(X) - w_f = utils.NHWC2NCHW(w) - else: - X_f = X - w_f = w - self.assertDeviceChecks( - dc, - op, - [X_f, w_f, b] if use_bias else [X_f, w_f], - [0]) - self.ws.create_blob("X").feed(X_f, device_option=gc) - self.ws.create_blob("w").feed(w_f, device_option=gc) - self.ws.create_blob("b").feed(b, device_option=gc) - self.ws.run(op) - outputs[order] = self.ws.blobs["Y"].fetch() - output_h = (size - 1) * stride_h + kernel + adj_h - pad_t - pad_b - output_w = (size - 1) * stride_w + kernel + adj_w - pad_l - pad_r - self.assertEqual( - outputs["NCHW"].shape, - (batch_size, output_channels, output_h, output_w)) - np.testing.assert_allclose( - outputs["NCHW"], - utils.NHWC2NCHW(outputs["NHWC"]), - atol=1e-4, - rtol=1e-4) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - adj=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN", "BLOCK"]), - use_bias=st.booleans(), - compute_dX=st.booleans(), - **hu.gcs) - @settings(max_examples=2, deadline=None) - def test_convolution_transpose_gradients(self, stride, pad, kernel, adj, - size, input_channels, - output_channels, batch_size, - order, engine, use_bias, - compute_dX, gc, dc): - assume(adj < stride) - if hiputl.run_in_hip(gc, dc) and engine == "CUDNN": - assume(order == "NCHW") - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, output_channels)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - adj=adj, - order=order, - engine=engine, - no_gradient_to_input=not compute_dX, - ) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, 
inputs, [0]) - - if use_bias and compute_dX: - # w, b, X - outputs_to_check = [1, 2, 0] - elif use_bias: - # w, b - outputs_to_check = [1, 2] - elif compute_dX: - # w, X - outputs_to_check = [1, 0] - else: - # w - outputs_to_check = [1] - for i in outputs_to_check: - self.assertGradientChecks(gc, op, inputs, i, [0]) - - # CUDNN does not support separate stride and pad so we skip it. - @given(stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(1, 5), - adj_h=st.integers(0, 2), - adj_w=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "BLOCK"]), - use_bias=st.booleans(), - compute_dX=st.booleans(), - **hu.gcs) - @settings(max_examples=2, deadline=None) - def test_convolution_transpose_separate_stride_pad_adj_gradient( - self, stride_h, stride_w, pad_t, pad_l, pad_b, pad_r, kernel, - adj_h, adj_w, size, input_channels, output_channels, batch_size, - order, engine, use_bias, compute_dX, gc, dc): - assume(adj_h < stride_h) - assume(adj_w < stride_w) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, output_channels)\ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - kernel=kernel, - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - adj_h=adj_h, - adj_w=adj_w, - order=order, - engine=engine, - no_gradient_to_input=not compute_dX, - ) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0]) - - if use_bias and compute_dX: - # w, b, X - outputs_to_check = [1, 2, 0] - elif use_bias: - # w, b - outputs_to_check = [1, 2] - elif compute_dX: - # w, X - outputs_to_check = [1, 0] - else: - # w - outputs_to_check = [1] - for i in outputs_to_check: - self.assertGradientChecks(gc, op, inputs, i, [0]) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 3), - adj=st.integers(0, 2), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(0, 4), - group=st.integers(1, 4), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN", "BLOCK"]), - shared_buffer=st.booleans(), - use_bias=st.booleans(), - **hu.gcs) - @settings(max_examples=2, deadline=None) - def test_convolution_transpose_with_group( - self, stride, pad, kernel, adj, size, input_channels, - output_channels, batch_size, group, order, engine, shared_buffer, - use_bias, gc, dc): - assume(adj < stride) - # TODO: Group conv_transpose in NHWC not implemented for GPU yet. 
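# Channel bookkeeping for the grouped test below: the drawn channel
# counts are per group, so with input_channels=3, output_channels=4 and
# group=2 the test feeds X with 6 channels, builds w with shape
# (6, kernel, kernel, 4) -- each group emits 8 / 2 = 4 output channels --
# and Y ends up with 8 channels.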
- assume(group == 1 or order == "NCHW" or - gc.device_type == caffe2_pb2.CPU) - if group != 1 and order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - if hiputl.run_in_hip(gc, dc) and order == "NHWC": - engine = "" - - op = core.CreateOperator( - "ConvTranspose", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - adj=adj, - group=group, - order=order, - engine=engine, - shared_buffer=int(shared_buffer), - device_option=gc, - ) - - input_channels *= group - output_channels *= group - - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - input_channels, kernel, kernel, int(output_channels / group)) \ - .astype(np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py deleted file mode 100644 index 2b8b756cdf61..000000000000 --- a/caffe2/python/operator_test/copy_ops_test.py +++ /dev/null @@ -1,188 +0,0 @@ - - - - - -import numpy as np - -import unittest -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace, core, model_helper, brew, test_util - - -class CopyOpsTest(test_util.TestCase): - - def tearDown(self): - # Reset workspace after each test - # Otherwise, the multi-GPU test will use previously created tensors, - # which may have been placed on the wrong device - workspace.ResetWorkspace() - - def run_test_copy_gradient(self, device_opt): - model = model_helper.ModelHelper(name="copy_test") - with core.DeviceScope(device_opt): - x = model.net.AddExternalInputs("x") - y = model.Copy(x, "y") - loss = model.AveragedLoss(y, "loss") - gradient_map = model.AddGradientOperators([loss]) - workspace.FeedBlob(x, np.random.rand(32).astype(np.float32)) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - self.assertTrue(np.array_equal( - workspace.FetchBlob(x), - workspace.FetchBlob(y), - )) - self.assertTrue(np.array_equal( - workspace.FetchBlob(gradient_map[x]), - workspace.FetchBlob(gradient_map[y]), - )) - - def test_copy_gradient_cpu(self): - self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0)) - - @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.") - def test_copy_gradient_gpu(self): - self.run_test_copy_gradient(core.DeviceOption(workspace.GpuDeviceType, 0)) - - @unittest.skipIf(workspace.NumGpuDevices() < 2, "Need at least 2 GPU.") - def test_copy_gradient_multiple_gpus(self): - model = model_helper.ModelHelper(name="copy_test") - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - x_cpu = model.net.AddExternalInputs("x_cpu") - - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1") - - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 1)): - x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2") - loss = model.AveragedLoss(x_gpu_2, "loss") - gradient_map = model.AddGradientOperators([loss]) - - workspace.FeedBlob("x_cpu", np.random.rand(32).astype(np.float32)) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - 
self.assertTrue(np.array_equal( - workspace.FetchBlob("x_gpu_1"), - workspace.FetchBlob("x_gpu_2"), - )) - self.assertTrue(np.array_equal( - workspace.FetchBlob(gradient_map["x_gpu_1"]), - workspace.FetchBlob(gradient_map["x_gpu_2"]), - )) - - def get_op_with_output(model, output_blob_name): - for op in model.net.Proto().op: - if len(op.output) == 1 and op.output[0] == output_blob_name: - return op - return None - - self.assertEqual( - get_op_with_output(model, "x_gpu_2_grad").device_option, - core.DeviceOption(workspace.GpuDeviceType, 1), - ) - self.assertEqual( - get_op_with_output(model, "x_cpu_grad").device_option, - core.DeviceOption(workspace.GpuDeviceType, 0), - ) - - @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.") - def test_cpu2gpu_gpu2cpu_sparse_gradients(self): - model = model_helper.ModelHelper(name="copy_test") - v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4]) - indices = model.param_init_net.UniformFill([], ["indices"], shape=[16, 4]) - cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0) - gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0) - - with core.DeviceScope(gpu_opt): - vcpu = model.CopyGPUToCPU(v, "vcpu") - - with core.DeviceScope(cpu_opt): - g = model.Gather([vcpu, indices], "g") - - with core.DeviceScope(gpu_opt): - ggpu = model.CopyCPUToGPU(g, "ggpu") - f = brew.fc(model, ggpu, "out", dim_in=4, dim_out=6) - (softmax, loss) = model.SoftmaxWithLoss( - [f, "label"], - ["softmax", "loss"], - ) - gradient_map = model.AddGradientOperators([loss]) - self.assertTrue("v" in gradient_map) - self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice)) - - @unittest.skipIf(workspace.NumGpuDevices() < 1, "Need at least 1 GPU.") - def test_cpu2gpu_gpu2cpu_gradients(self): - model = model_helper.ModelHelper(name="copy_test") - - batch = 32 - cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0) - gpu_opt = core.DeviceOption(workspace.GpuDeviceType, 0) - - with core.NameScope("cpu"): - with core.DeviceScope(cpu_opt): - x_cpu = brew.fc(model, 'data', 'x_cpu', 16, 8) - - with core.NameScope("gpu_0"): - with core.DeviceScope(gpu_opt): - x_gpu = model.CopyCPUToGPU(x_cpu, "x_gpu") - pred_gpu = brew.fc(model, x_gpu, "pred_gpu", 8, 4) - pred_cpu = model.CopyGPUToCPU(pred_gpu, "pred_cpu") - - with core.DeviceScope(cpu_opt): - with core.NameScope("cpu"): - (softmax, loss) = model.SoftmaxWithLoss( - [pred_cpu, "label"], - ["softmax", "loss"], - ) - - gradient_map = model.AddGradientOperators([loss]) - - # Add param updates (for cpu and gpu) - init_net = model.param_init_net - with core.DeviceScope(cpu_opt): - with core.NameScope("cpu"): - ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.) - LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0) - for param in model.GetParams(): - model.WeightedSum( - [param, ONE, gradient_map[param], LR], - param, - ) - - with core.NameScope("gpu_0"): - with core.DeviceScope(gpu_opt): - ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.)
- LR = init_net.ConstantFill([], "LR", shape=[1], value=-2.0) - for param in model.GetParams(): - model.WeightedSum( - [param, ONE, gradient_map[param], LR], - param, - ) - - with core.DeviceScope(cpu_opt): - workspace.FeedBlob( - 'cpu/data', - np.random.rand(batch, 16).astype(np.float32), - ) - workspace.FeedBlob( - 'cpu/label', - np.random.randint(4, size=batch).astype(np.int32), - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - initial_params = {p: workspace.FetchBlob(p) for p in model.GetParams()} - workspace.RunNet(model.net.Proto().name) - updated_params = {p: workspace.FetchBlob(p) for p in model.GetParams()} - - for p in model.GetParams(): - g = gradient_map[p] - expected = initial_params[p] - 2.0 * workspace.FetchBlob(g) - actual = updated_params[p] - self.assertTrue( - np.array_equal(expected, updated_params[p]), - "Mismatch: {}: {}, {}".format(p, expected, actual), - ) diff --git a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py b/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py deleted file mode 100644 index 8e914259bb78..000000000000 --- a/caffe2/python/operator_test/copy_rows_to_tensor_op_test.py +++ /dev/null @@ -1,74 +0,0 @@ - - -import logging - -import caffe2.python.hypothesis_test_util as hu -import numpy as np -from caffe2.python import core -from hypothesis import given, settings, strategies as st - - -logger = logging.getLogger(__name__) - - -def get_input_tensors(): - height = np.random.randint(1, 10) - width = np.random.randint(1, 10) - dtype = np.float32 - input_tensor = hu.arrays( - dims=[height, width], - dtype=dtype, - elements=st.integers(min_value=0, max_value=100), - ) - - return input_tensor - - -class TestCopyRowsToTensor(hu.HypothesisTestCase): - @given(input_tensor=get_input_tensors(), **hu.gcs_cpu_only) - def test_copy_rows_to_tensor(self, input_tensor, gc, dc): - dtype = np.random.choice([np.float16, np.float32, np.int32, np.int64], 1)[0] - input_tensor = np.array(input_tensor).astype(dtype) - height = np.shape(input_tensor)[0] - width = np.shape(input_tensor)[1] - row = np.random.rand(width).astype(dtype) - indices_lengths = np.random.randint(height) - all_indices = np.arange(height) - np.random.shuffle(all_indices) - indices = all_indices[:indices_lengths] - - def ref(input_tensor, indices, row): - for idx in indices: - input_tensor[idx] = row - return [input_tensor] - op = core.CreateOperator( - "CopyRowsToTensor", ["input_tensor", "indices", "row"], ["input_tensor"] - ) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_tensor, indices, row], - reference=ref, - ) - - @given(input_tensor=get_input_tensors(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_copy_rows_to_tensor_invalid_input(self, input_tensor, gc, dc): - input_tensor = np.array(input_tensor).astype(np.float32) - height = np.shape(input_tensor)[0] - width = np.shape(input_tensor)[1] - row = np.random.rand(width + 1).astype(np.float32) - indices_lengths = np.random.randint(height) - all_indices = np.arange(height) - np.random.shuffle(all_indices) - indices = all_indices[:indices_lengths] - - self.assertRunOpRaises( - device_option=gc, - op=core.CreateOperator( - "CopyRowsToTensor", ["input_tensor", "indices", "row"], ["input_tensor"] - ), - inputs=[input_tensor, indices, row], - exception=RuntimeError, - regexp="width of input tensor should match lengths of row", - ) diff --git a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py 
b/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py deleted file mode 100644 index d979407321a4..000000000000 --- a/caffe2/python/operator_test/cosine_embedding_criterion_op_test.py +++ /dev/null @@ -1,50 +0,0 @@ - - - - - -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -class TestCosineEmbeddingCriterion(serial.SerializedTestCase): - @serial.given(N=st.integers(min_value=10, max_value=20), - seed=st.integers(min_value=0, max_value=65535), - margin=st.floats(min_value=-0.5, max_value=0.5), - **hu.gcs) - def test_cosine_embedding_criterion(self, N, seed, margin, gc, dc): - np.random.seed(seed) - S = np.random.randn(N).astype(np.float32) - Y = np.random.choice([-1, 1], size=N).astype(np.int32) - op = core.CreateOperator( - "CosineEmbeddingCriterion", ["S", "Y"], ["output"], - margin=margin) - - def ref_cec(S, Y): - result = (1 - S) * (Y == 1) + np.maximum(S - margin, 0) * (Y == -1) - return (result, ) - - # This checks the op implementation against a reference function in - # python. - self.assertReferenceChecks(gc, op, [S, Y], ref_cec) - # This checks the op implementation over multiple device options (e.g. - # CPU and CUDA). [0] means that the 0-th output is checked. - self.assertDeviceChecks(dc, op, [S, Y], [0]) - - # Now, since this operator's output has a "kink" around the margin - # value, we move the S vector away from the margin a little bit. This - # is a standard trick to avoid gradient check to fail on subgradient - # points. - S[np.abs(S - margin) < 0.1] += 0.2 - # This checks the operator's gradient. the first 0 means that we are - # checking the gradient of the first input (S), and the second [0] means - # that the gradient check should initiate from the 0-th output. 
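# Worked example with margin=0.1: a similar pair (Y=1) with S=0.8 costs
# 1 - 0.8 = 0.2, while a dissimilar pair (Y=-1) with the same S costs
# max(0.8 - 0.1, 0) = 0.7. The max() is exactly the subgradient kink
# that the S shift above steps away from.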
- self.assertGradientChecks(gc, op, [S, Y], 0, [0]) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/counter_ops_test.py b/caffe2/python/operator_test/counter_ops_test.py deleted file mode 100644 index d57ff31508c6..000000000000 --- a/caffe2/python/operator_test/counter_ops_test.py +++ /dev/null @@ -1,84 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import tempfile - - -class TestCounterOps(TestCase): - - def test_counter_ops(self): - workspace.RunOperatorOnce(core.CreateOperator( - 'CreateCounter', [], ['c'], init_count=1)) - - workspace.RunOperatorOnce(core.CreateOperator( - 'CountDown', ['c'], ['t1'])) # 1 -> 0 - assert not workspace.FetchBlob('t1') - - workspace.RunOperatorOnce(core.CreateOperator( - 'CountDown', ['c'], ['t2'])) # 0 -> -1 - assert workspace.FetchBlob('t2') - - workspace.RunOperatorOnce(core.CreateOperator( - 'CountUp', ['c'], ['t21'])) # -1 -> 0 - assert workspace.FetchBlob('t21') == -1 - workspace.RunOperatorOnce(core.CreateOperator( - 'RetrieveCount', ['c'], ['t22'])) - assert workspace.FetchBlob('t22') == 0 - - workspace.RunOperatorOnce(core.CreateOperator( - 'ResetCounter', ['c'], [], init_count=1)) # -> 1 - workspace.RunOperatorOnce(core.CreateOperator( - 'CountDown', ['c'], ['t3'])) # 1 -> 0 - assert not workspace.FetchBlob('t3') - - workspace.RunOperatorOnce(core.CreateOperator( - 'ResetCounter', ['c'], ['t31'], init_count=5)) # 0 -> 5 - assert workspace.FetchBlob('t31') == 0 - workspace.RunOperatorOnce(core.CreateOperator( - 'ResetCounter', ['c'], ['t32'])) # 5 -> 0 - assert workspace.FetchBlob('t32') == 5 - - workspace.RunOperatorOnce(core.CreateOperator( - 'ConstantFill', [], ['t4'], value=False, shape=[], - dtype=core.DataType.BOOL)) - assert workspace.FetchBlob('t4') == workspace.FetchBlob('t1') - - workspace.RunOperatorOnce(core.CreateOperator( - 'ConstantFill', [], ['t5'], value=True, shape=[], - dtype=core.DataType.BOOL)) - assert workspace.FetchBlob('t5') == workspace.FetchBlob('t2') - - assert workspace.RunOperatorOnce(core.CreateOperator( - 'And', ['t1', 't2'], ['t6'])) - assert not workspace.FetchBlob('t6') # True && False - - assert workspace.RunOperatorOnce(core.CreateOperator( - 'And', ['t2', 't5'], ['t7'])) - assert workspace.FetchBlob('t7') # True && True - - workspace.RunOperatorOnce(core.CreateOperator( - 'CreateCounter', [], ['serialized_c'], init_count=22)) - with tempfile.NamedTemporaryFile() as tmp: - workspace.RunOperatorOnce(core.CreateOperator( - 'Save', ['serialized_c'], [], absolute_path=1, - db_type='minidb', db=tmp.name)) - for i in range(10): - workspace.RunOperatorOnce(core.CreateOperator( - 'CountDown', ['serialized_c'], ['t8'])) - workspace.RunOperatorOnce(core.CreateOperator( - 'RetrieveCount', ['serialized_c'], ['t8'])) - assert workspace.FetchBlob('t8') == 12 - workspace.RunOperatorOnce(core.CreateOperator( - 'Load', [], ['serialized_c'], absolute_path=1, - db_type='minidb', db=tmp.name)) - workspace.RunOperatorOnce(core.CreateOperator( - 'RetrieveCount', ['serialized_c'], ['t8'])) - assert workspace.FetchBlob('t8') == 22 - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/crf_test.py b/caffe2/python/operator_test/crf_test.py deleted file mode 100644 index a4447fa3f364..000000000000 --- a/caffe2/python/operator_test/crf_test.py +++ /dev/null @@ -1,140 +0,0 @@ - - - - -from caffe2.python import workspace, crf, brew -from caffe2.python.model_helper import 
ModelHelper -import numpy as np -from scipy.special import logsumexp -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -from hypothesis import given, settings - - -class TestCRFOp(hu.HypothesisTestCase): - - @given(num_tags=st.integers(2, 4), - num_words=st.integers(2, 15)) - @settings(deadline=10000) - def test_crf_with_loss_op(self, num_tags, num_words): - model = ModelHelper(name='external') - embeddings_dim = 200 - embeddings = np.random.randn(num_words, embeddings_dim).astype(np.float32) - transitions = np.random.uniform( - low=-1, high=1, size=(num_tags + 2, num_tags + 2) - ).astype(np.float32) - labels = np.random.randint(num_tags, size=(num_words)).astype(np.int64) - embeddings_blob, labels_blob, transitions_blob = ( - model.net.AddExternalInputs( - 'embeddings_blob', - 'labels_blob', - 'crf_transitions') - ) - workspace.FeedBlob(str(embeddings_blob), embeddings) - workspace.FeedBlob(str(labels_blob), labels) - workspace.FeedBlob(str(transitions_blob), transitions) - predictions_blob = brew.fc( - model, - embeddings_blob, "fc_0", - embeddings_dim, num_tags, - ('UniformFill', {'min': -1.0, 'max': 1.0}), - ('UniformFill', {'min': -1.0, 'max': 1.0}) - ) - crf_layer = crf.CRFWithLoss(model, num_tags, transitions_blob) - crf_loss = crf_layer.crf_loss(predictions_blob, labels_blob) - model.net.AddGradientOperators([crf_loss]) - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - loss = workspace.FetchBlob(str(crf_loss)) - predictions = workspace.FetchBlob(str(predictions_blob)) - np.testing.assert_allclose( - loss, - self._compute_loss_manual( - predictions, num_tags, labels, transitions - ), - atol=0.001, - rtol=0.001, - err_msg='CRF loss does not match the reference' - ) - - @given(num_tags=st.integers(1, 4), - num_words=st.integers(2, 4)) - @settings(deadline=10000) - def test_crf_gradient(self, num_tags, num_words): - base_model = ModelHelper(name='base_model') - transitions = np.random.randn( - num_tags + 2, num_tags + 2 - ).astype(np.float32) - predictions = np.random.randn(num_words, 1, num_tags + 2).astype(np.float32) - initial = np.random.randn(1, num_tags + 2).astype(np.float32) - predictions_blob, transitions_blob, initial_blob = ( - base_model.net.AddExternalInputs( - 'predictions_blob', 'crf_transitions', 'initial_blob' - ) - ) - - workspace.FeedBlob(str(predictions_blob), predictions) - workspace.FeedBlob(str(transitions_blob), transitions) - workspace.FeedBlob(str(initial_blob), initial) - - crf_layer = crf.CRFWithLoss(base_model, num_tags, transitions_blob) - crf_layer.build_crf_net( - predictions_blob, initial_blob, transitions_blob - ) - op = base_model.net._net.op[-1] - workspace.RunNetOnce(base_model.param_init_net) - gradients_to_check = ( - index for (index, input_name) in enumerate(op.input) - if input_name != "crf_net/zero_segment_id" - ) - - inputs = [workspace.FetchBlob(name) for name in op.input] - for param in gradients_to_check: - self.assertGradientChecks( - device_option=hu.cpu_do, - op=op, - inputs=inputs, - outputs_to_check=param, - outputs_with_grads=[1], - threshold=0.05, - stepsize=0.001, - ) - - def _compute_loss_manual(self, predictions, num_tags, labels, transitions): - low_score = -1000 - b_s = np.array( - [[low_score] * num_tags + [0, low_score]] - ).astype(np.float32) - e_s = np.array( - [[low_score] * num_tags + [low_score, 0]] - ).astype(np.float32) - predictions = np.concatenate( - [predictions, low_score * np.ones((predictions.shape[0], 2))], - axis=1 - ) - predictions =
np.concatenate( - [b_s, predictions, e_s], - axis=0 - ) - b_id = np.array([num_tags], dtype=np.int32) - e_id = np.array([num_tags + 1], dtype=np.int32) - labels = np.concatenate( - [b_id, labels, e_id], - axis=0 - ) - curr_state = predictions[0] - input_states = predictions[1:] - - for input_state in input_states: - prev = np.expand_dims(curr_state, axis=1) - curr_input = np.expand_dims(input_state, axis=0) - curr_state = logsumexp(prev + curr_input + transitions, axis=0) - - total_score = logsumexp(curr_state, axis=0) - # Compute best path score - unary_scores = sum(w[labels[i]] for i, w in enumerate(predictions)) - binary_scores = sum( - transitions[a][b] for a, b in zip(labels[:-1], labels[1:]) - ) - loss = total_score - (binary_scores + unary_scores) - return loss diff --git a/caffe2/python/operator_test/cross_entropy_ops_test.py b/caffe2/python/operator_test/cross_entropy_ops_test.py deleted file mode 100644 index c88f93503a15..000000000000 --- a/caffe2/python/operator_test/cross_entropy_ops_test.py +++ /dev/null @@ -1,287 +0,0 @@ - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - -import unittest - -def sigmoid(x): - return 1.0 / (1.0 + np.exp(-x)) - - -def sigmoid_cross_entropy_with_logits(x, z): - return np.maximum(x, 0) - x * z + np.log(1 + np.exp(-np.abs(x))) - - -def sigmoid_cross_entropy_with_logits_grad(x, z): - return z - sigmoid(x) - - -def sigmoid_cross_entropy_with_logits_with_log_D_trick(x, z): - return -(2 * z - 1.) * np.log(sigmoid(x)) - - -def sigmoid_cross_entropy_with_logits_with_log_D_trick_grad(x, z): - return (2 * z - 1.) * (1 - sigmoid(x)) - - -def unjoined_sigmoid_cross_entropy(x, z): - return -z * x + (1. - z) * np.maximum(x, 0) \ - + (1. - z) * np.log(1 + np.exp(-np.abs(x))) - - -def unjoined_sigmoid_cross_entropy_grad(x, z): - return z - (1. - z) / (1. 
+ np.exp(-x)) - - -class TestCrossEntropyOps(hu.HypothesisTestCase): - @given( - inputs=st.lists( - elements=st.integers(min_value=1, max_value=5), - min_size=1, - max_size=2, - ).flatmap( - lambda shape: st.tuples( - hu.arrays( - dims=shape, - elements=st.one_of( - hu.floats(min_value=-1.0, max_value=-0.1), - hu.floats(min_value=0.1, max_value=1.0), - )), - hu.arrays( - dims=shape, - elements=st.sampled_from([0.0, 1.0]), - ), - ) - ), - options=st.one_of( - st.tuples(st.just(True), st.just(False)), - st.tuples(st.just(False), st.just(True)), - st.tuples(st.just(False), st.just(False)) - ), - **hu.gcs - ) - def test_sigmoid_cross_entropy_with_logits( - self, inputs, options, gc, dc - ): - logits, targets = inputs - log_D_trick, unjoined_lr_loss = options - - def sigmoid_xentr_logit_ref(logits, targets): - if unjoined_lr_loss: - s = unjoined_sigmoid_cross_entropy(logits, targets) - else: - s = ( - sigmoid_cross_entropy_with_logits(logits, targets) - if not log_D_trick else - sigmoid_cross_entropy_with_logits_with_log_D_trick( - logits, targets - ) - ) - m = np.mean(s, axis=len(logits.shape) - 1) - return (m, ) - - def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): - fwd_logits, fwd_targets = fwd_inputs - inner_size = fwd_logits.shape[-1] - if unjoined_lr_loss: - m = unjoined_sigmoid_cross_entropy_grad(logits, targets) - else: - m = ( - sigmoid_cross_entropy_with_logits_grad(fwd_logits, fwd_targets) - if not log_D_trick else - sigmoid_cross_entropy_with_logits_with_log_D_trick_grad( - fwd_logits, fwd_targets - ) - ) - # m = fwd_targets - sigmoid(fwd_logits) - g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size - return (g_in, None) - - op = core.CreateOperator( - 'SigmoidCrossEntropyWithLogits', ['logits', 'targets'], - ['xentropy'], - log_D_trick=log_D_trick, - unjoined_lr_loss=unjoined_lr_loss - ) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[logits, targets], - reference=sigmoid_xentr_logit_ref, - output_to_grad='xentropy', - grad_reference=sigmoid_xentr_logit_grad_ref) - - @given( - log_D_trick=st.just(False), - **hu.gcs_cpu_only - ) - def test_cross_entropy_and_unjoied_cross_entropy_relation( - self, log_D_trick, gc, dc - ): - logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357, - -1.0774, -0.3395, -0.2469, 0.6708, -1.8332], dtype='f') - targets = np.array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0.], dtype='f') - lr_size = targets.size - unjoined_lr_loss = False - - def sigmoid_xentr_logit_ref(logits, targets): - if unjoined_lr_loss: - s = unjoined_sigmoid_cross_entropy(logits, targets) - else: - s = sigmoid_cross_entropy_with_logits(logits, targets) - m = np.mean(s, axis=len(logits.shape) - 1) - return (m, ) - - def sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): - fwd_logits, fwd_targets = fwd_inputs - inner_size = fwd_logits.shape[-1] - if unjoined_lr_loss: - m = unjoined_sigmoid_cross_entropy_grad(logits, targets) - else: - m = sigmoid_cross_entropy_with_logits_grad( - fwd_logits, fwd_targets) - - # m = fwd_targets - sigmoid(fwd_logits) - g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size - return (g_in, None) - - op = core.CreateOperator( - 'SigmoidCrossEntropyWithLogits', ['logits', 'targets'], - ['xentropy'], - log_D_trick=log_D_trick, - unjoined_lr_loss=unjoined_lr_loss - ) - output_lr = self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[logits, targets], - reference=sigmoid_xentr_logit_ref, - output_to_grad='xentropy', - grad_reference=sigmoid_xentr_logit_grad_ref) - - # Unjoined dataset where labels 
change later - logits = np.array([1.4720, 0.3500, -0.6529, -1.1908, 0.8357, - -1.0774, -0.3395, -0.2469, 0.6708, -1.8332, 1.4720, 0.3500, - -0.6529, -1.1908, 0.8357, -1.0774], dtype='f') - targets = np.array([0., 0., 0., 0., 0., 0., 0., 0., 0., - 0., 1., 1., 1., 1., 1., 1.], dtype='f') - unjoined_lr_loss = True - unjoined_lr_size = targets.size - - op = core.CreateOperator( - 'SigmoidCrossEntropyWithLogits', ['logits', 'targets'], - ['xentropy'], - log_D_trick=log_D_trick, - unjoined_lr_loss=unjoined_lr_loss - ) - outputs_unjoined_lr = self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[logits, targets], - reference=sigmoid_xentr_logit_ref, - output_to_grad='xentropy', - grad_reference=sigmoid_xentr_logit_grad_ref) - - self.assertAlmostEqual( - output_lr[0].item(0) * lr_size / unjoined_lr_size, - outputs_unjoined_lr[0].item(0), - delta=0.0001) - - @given( - inputs=st.lists( - elements=st.integers(min_value=1, max_value=5), - min_size=1, - max_size=2, - ).flatmap( - lambda shape: st.tuples( - hu.arrays( - dims=shape, - elements=st.one_of( - hu.floats(min_value=-1.0, max_value=-0.1), - hu.floats(min_value=0.1, max_value=1.0), - )), - hu.arrays( - dims=shape, - elements=st.sampled_from([0.0, 1.0]), - ), - hu.arrays( - dims=shape, - elements=hu.floats(min_value=0.1, max_value=1.0), - ), - ) - ), - **hu.gcs - ) - def test_weighted_sigmoid_cross_entropy_with_logits(self, inputs, gc, dc): - logits, targets, weights = inputs - - def weighted_sigmoid_xentr_logit_ref(logits, targets, weights): - s = sigmoid_cross_entropy_with_logits(logits, targets) - s = np.multiply(s, weights) - m = np.mean(s, axis=len(logits.shape) - 1) - return (m, ) - - def weighted_sigmoid_xentr_logit_grad_ref(g_out, outputs, fwd_inputs): - fwd_logits, fwd_targets, fwd_weights = fwd_inputs - inner_size = fwd_logits.shape[-1] - m = fwd_targets - sigmoid(fwd_logits) - m = np.multiply(m, weights) - g_in = -np.expand_dims(g_out, axis=-1) * m / inner_size - return (g_in, None, None) - - op = core.CreateOperator( - 'WeightedSigmoidCrossEntropyWithLogits', - ['logits', 'targets', 'weights'], - ['xentropy']) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[logits, targets, weights], - reference=weighted_sigmoid_xentr_logit_ref, - output_to_grad='xentropy', - grad_reference=weighted_sigmoid_xentr_logit_grad_ref) - - @given(n=st.integers(2, 10), - b=st.integers(1, 5), - **hu.gcs_cpu_only) - def test_soft_label_cross_entropy(self, n, b, gc, dc): - # Initialize X and add 1e-2 for numerical stability - X = np.random.rand(b, n).astype(np.float32) - X = X + 1e-2 - for i in range(b): - X[i] = X[i] / np.sum(X[i]) - - # Initialize label - label = np.random.rand(b, n).astype(np.float32) - for i in range(b): - label[i] = label[i] / np.sum(label[i]) - - # Reference implementation of cross entropy with soft labels - def soft_label_xentr_ref(X, label): - xent = [np.sum((-label[j][i] * np.log(max(X[j][i], 1e-20)) - for i in range(len(X[0])))) for j in range(b)] - return (xent,) - - op = core.CreateOperator("CrossEntropy", ["X", "label"], ["Y"]) - - # TODO(surya) Once CrossEntropyOp is ported to GPU, add the respective - # tests to this unit test. 
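# A minimal NumPy sketch of the soft-label cross entropy formula that
# soft_label_xentr_ref computes above, xent_j = -sum_i label[j][i] *
# log(max(X[j][i], 1e-20)); the X/label values here are illustrative only.
import numpy as np

X = np.array([[0.7, 0.3]], dtype=np.float32)      # rows sum to 1
label = np.array([[0.6, 0.4]], dtype=np.float32)  # soft labels, rows sum to 1
xent_vec = -(label * np.log(np.maximum(X, 1e-20))).sum(axis=1)
xent_loop = [sum(-label[j][i] * np.log(max(X[j][i], 1e-20))
                 for i in range(X.shape[1])) for j in range(X.shape[0])]
assert np.allclose(xent_vec, xent_loop)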
- self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label], - reference=soft_label_xentr_ref, - ) - - self.assertGradientChecks( - gc, op, [X, label], 0, [0], stepsize=1e-4, threshold=1e-2) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py b/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py deleted file mode 100644 index 29440c00a4b3..000000000000 --- a/caffe2/python/operator_test/ctc_beam_search_decoder_op_test.py +++ /dev/null @@ -1,137 +0,0 @@ - - - - - -from caffe2.python import core -from collections import defaultdict, Counter -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - -import unittest - -DEFAULT_BEAM_WIDTH = 10 -DEFAULT_PRUNE_THRESHOLD = 0.001 - - -class TestCTCBeamSearchDecoderOp(serial.SerializedTestCase): - @given( - batch=st.sampled_from([1, 2, 4]), - max_time=st.sampled_from([1, 8, 64]), - alphabet_size=st.sampled_from([1, 2, 32, 128, 512]), - beam_width=st.sampled_from([1, 2, 16, None]), - num_candidates=st.sampled_from([1, 2]), - **hu.gcs_cpu_only - ) - @settings(deadline=None, max_examples=30) - def test_ctc_beam_search_decoder( - self, batch, max_time, alphabet_size, beam_width, num_candidates, gc, dc - ): - if not beam_width: - beam_width = DEFAULT_BEAM_WIDTH - op_seq_len = core.CreateOperator('CTCBeamSearchDecoder', - ['INPUTS', 'SEQ_LEN'], - ['OUTPUT_LEN', 'VALUES', 'OUTPUT_PROB'], - num_candidates=num_candidates) - - op_no_seq_len = core.CreateOperator('CTCBeamSearchDecoder', - ['INPUTS'], - ['OUTPUT_LEN', 'VALUES', 'OUTPUT_PROB'], - num_candidates=num_candidates) - else: - num_candidates = min(num_candidates, beam_width) - op_seq_len = core.CreateOperator('CTCBeamSearchDecoder', - ['INPUTS', 'SEQ_LEN'], - ['OUTPUT_LEN', 'VALUES', 'OUTPUT_PROB'], - beam_width=beam_width, - num_candidates=num_candidates) - - op_no_seq_len = core.CreateOperator('CTCBeamSearchDecoder', - ['INPUTS'], - ['OUTPUT_LEN', 'VALUES', 'OUTPUT_PROB'], - beam_width=beam_width, - num_candidates=num_candidates) - - def input_generater(): - inputs = np.random.rand(max_time, batch, alphabet_size)\ - .astype(np.float32) - seq_len = np.random.randint(1, max_time + 1, size=batch)\ - .astype(np.int32) - return inputs, seq_len - - def ref_ctc_decoder(inputs, seq_len): - output_len = np.zeros(batch * num_candidates, dtype=np.int32) - output_prob = np.zeros(batch * num_candidates, dtype=np.float32) - val = np.array([]).astype(np.int32) - - for i in range(batch): - Pb, Pnb = defaultdict(Counter), defaultdict(Counter) - Pb[0][()] = 1 - Pnb[0][()] = 0 - A_prev = [()] - ctc = inputs[:, i, :] - ctc = np.vstack((np.zeros(alphabet_size), ctc)) - len_i = seq_len[i] if seq_len is not None else max_time - - for t in range(1, len_i + 1): - pruned_alphabet = np.where(ctc[t] > DEFAULT_PRUNE_THRESHOLD)[0] - for l in A_prev: - for c in pruned_alphabet: - if c == 0: - Pb[t][l] += ctc[t][c] * (Pb[t - 1][l] + Pnb[t - 1][l]) - else: - l_plus = l + (c,) - if len(l) > 0 and c == l[-1]: - Pnb[t][l_plus] += ctc[t][c] * Pb[t - 1][l] - Pnb[t][l] += ctc[t][c] * Pnb[t - 1][l] - else: - Pnb[t][l_plus] += \ - ctc[t][c] * (Pb[t - 1][l] + Pnb[t - 1][l]) - - if l_plus not in A_prev: - Pb[t][l_plus] += \ - ctc[t][0] * \ - (Pb[t - 1][l_plus] + Pnb[t - 1][l_plus]) - Pnb[t][l_plus] += ctc[t][c] * Pnb[t - 1][l_plus] - - A_next = Pb[t] + Pnb[t] - A_prev = 
sorted(A_next, key=A_next.get, reverse=True) - A_prev = A_prev[:beam_width] - - candidates = A_prev[:num_candidates] - index = 0 - for candidate in candidates: - val = np.hstack((val, candidate)) - output_len[i * num_candidates + index] = len(candidate) - output_prob[i * num_candidates + index] = Pb[t][candidate] + Pnb[t][candidate] - index += 1 - - return [output_len, val, output_prob] - - def ref_ctc_decoder_max_time(inputs): - return ref_ctc_decoder(inputs, None) - - inputs, seq_len = input_generater() - - self.assertReferenceChecks( - device_option=gc, - op=op_seq_len, - inputs=[inputs, seq_len], - reference=ref_ctc_decoder, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_no_seq_len, - inputs=[inputs], - reference=ref_ctc_decoder_max_time, - ) - - -if __name__ == "__main__": - import random - random.seed(2603) - unittest.main() diff --git a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py b/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py deleted file mode 100644 index 8bc7eb47d488..000000000000 --- a/caffe2/python/operator_test/ctc_greedy_decoder_op_test.py +++ /dev/null @@ -1,151 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestCTCGreedyDecoderOp(serial.SerializedTestCase): - - @given( - batch=st.sampled_from([2, 4, 128, 256]), - max_time=st.sampled_from([2, 10, 30, 50]), - num_classes=st.sampled_from([2, 10, 26, 40]), - merge_repeated=st.sampled_from([True, False]), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_ctc_greedy_decoder( - self, batch, max_time, - num_classes, merge_repeated, gc, dc - ): - - def input_generater(): - inputs = np.random.rand(max_time, batch, num_classes)\ - .astype(np.float32) - seq_len = np.random.randint(1, max_time + 1, size=batch)\ - .astype(np.int32) - return inputs, seq_len - - def ref_ctc_decoder(inputs, seq_len): - merge = merge_repeated - output_len = np.array([]).astype(np.int32) - val = np.array([]).astype(np.int32) - for i in range(batch): - prev_id = 0 - t_dec = 0 - len_i = seq_len[i] if seq_len is not None else max_time - for t in range(len_i): - max_id = np.argmax(inputs[t, i, :]) - if max_id == 0: - prev_id = max_id - continue - if max_id == prev_id and merge: - prev_id = max_id - continue - t_dec += 1 - val = np.append(val, max_id) - prev_id = max_id - output_len = np.append(output_len, t_dec) - - return [output_len, val] - - def ref_ctc_decoder_max_time(inputs): - return ref_ctc_decoder(inputs, None) - - inputs, seq_len = input_generater() - op = core.CreateOperator('CTCGreedyDecoder', - ['INPUTS', 'SEQ_LEN'], - ['OUTPUT_LEN', 'VALUES'], - merge_repeated=merge_repeated) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[inputs, seq_len], - reference=ref_ctc_decoder, - ) - - op_1 = core.CreateOperator('CTCGreedyDecoder', - ['INPUTS'], - ['OUTPUT_LEN', 'VALUES'], - merge_repeated=merge_repeated) - - self.assertReferenceChecks( - device_option=gc, - op=op_1, - inputs=[inputs], - reference=ref_ctc_decoder_max_time, - ) - - @given( - batch=st.sampled_from([2, 4, 128, 256]), - max_time=st.sampled_from([2, 10, 30, 50]), - num_classes=st.sampled_from([2, 10, 26, 40]), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_ctc_greedy_decoder_no_merge_arg( - self, batch, max_time, - num_classes, gc, dc - ): - - def 
input_generater(): - inputs = np.random.rand(max_time, batch, num_classes)\ - .astype(np.float32) - seq_len = np.random.randint(1, max_time + 1, size=batch)\ - .astype(np.int32) - return inputs, seq_len - - def ref_ctc_decoder_no_merge_arg(inputs, seq_len): - merge = True - - output_len = np.array([]).astype(np.int32) - val = np.array([]).astype(np.int32) - for i in range(batch): - prev_id = 0 - t_dec = 0 - len_i = seq_len[i] if seq_len is not None else max_time - for t in range(len_i): - max_id = np.argmax(inputs[t, i, :]) - if max_id == 0: - prev_id = max_id - continue - if max_id == prev_id and merge: - prev_id = max_id - continue - t_dec += 1 - val = np.append(val, max_id) - prev_id = max_id - output_len = np.append(output_len, t_dec) - - return [output_len, val] - - def ref_ctc_decoder_max_time(inputs): - return ref_ctc_decoder_no_merge_arg(inputs, None) - - inputs, seq_len = input_generater() - - op = core.CreateOperator('CTCGreedyDecoder', - ['INPUTS'], - ['OUTPUT_LEN', 'VALUES']) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[inputs], - reference=ref_ctc_decoder_max_time, - ) - - -if __name__ == "__main__": - import random - random.seed(2603) - unittest.main() diff --git a/caffe2/python/operator_test/cudnn_recurrent_test.py b/caffe2/python/operator_test/cudnn_recurrent_test.py deleted file mode 100644 index 491593e8423e..000000000000 --- a/caffe2/python/operator_test/cudnn_recurrent_test.py +++ /dev/null @@ -1,151 +0,0 @@ - - - - - -from caffe2.python import model_helper, workspace, core, rnn_cell -import numpy as np - -import unittest - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -class TestLSTMs(unittest.TestCase): - - def testEqualToCudnn(self): - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType)): - T = 8 - batch_size = 4 - input_dim = 8 - hidden_dim = 31 - - workspace.FeedBlob( - "seq_lengths", - np.array([T] * batch_size, dtype=np.int32) - ) - workspace.FeedBlob("target", np.zeros( - [T, batch_size, hidden_dim], dtype=np.float32 - )) - workspace.FeedBlob("hidden_init", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - workspace.FeedBlob("cell_init", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - - own_model = model_helper.ModelHelper(name="own_lstm") - - input_shape = [T, batch_size, input_dim] - cudnn_model = model_helper.ModelHelper(name="cudnn_lstm") - input_blob = cudnn_model.param_init_net.UniformFill( - [], "input", shape=input_shape) - workspace.FeedBlob("CUDNN/hidden_init_cudnn", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - workspace.FeedBlob("CUDNN/cell_init_cudnn", np.zeros( - [1, batch_size, hidden_dim], dtype=np.float32 - )) - - cudnn_output, cudnn_last_hidden, cudnn_last_state, param_extract = rnn_cell.cudnn_LSTM( - model=cudnn_model, - input_blob=input_blob, - initial_states=("hidden_init_cudnn", "cell_init_cudnn"), - dim_in=input_dim, - dim_out=hidden_dim, - scope="CUDNN", - return_params=True, - ) - cudnn_loss = cudnn_model.AveragedLoss( - cudnn_model.SquaredL2Distance( - [cudnn_output, "target"], "CUDNN/dist" - ), "CUDNN/loss" - ) - - own_output, own_last_hidden, _, own_last_state, own_params = rnn_cell.LSTM( - model=own_model, - input_blob=input_blob, - seq_lengths="seq_lengths", - initial_states=("hidden_init", "cell_init"), - dim_in=input_dim, - dim_out=hidden_dim, - scope="OWN", - return_params=True, - ) - own_loss = own_model.AveragedLoss( - own_model.SquaredL2Distance([own_output, "target"], "OWN/dist"), - "OWN/loss" - ) - - # Add 
gradients - cudnn_model.AddGradientOperators([cudnn_loss]) - own_model.AddGradientOperators([own_loss]) - - # Add parameter updates - LR = cudnn_model.param_init_net.ConstantFill( - [], shape=[1], value=0.01 - ) - ONE = cudnn_model.param_init_net.ConstantFill( - [], shape=[1], value=1.0 - ) - for param in cudnn_model.GetParams(): - cudnn_model.WeightedSum( - [param, ONE, cudnn_model.param_to_grad[param], LR], param - ) - for param in own_model.GetParams(): - own_model.WeightedSum( - [param, ONE, own_model.param_to_grad[param], LR], param - ) - - # Copy states over - own_model.net.Copy(own_last_hidden, "hidden_init") - own_model.net.Copy(own_last_state, "cell_init") - cudnn_model.net.Copy(cudnn_last_hidden, "CUDNN/hidden_init_cudnn") - cudnn_model.net.Copy(cudnn_last_state, "CUDNN/cell_init_cudnn") - - workspace.RunNetOnce(cudnn_model.param_init_net) - workspace.CreateNet(cudnn_model.net) - - ## - ## CUDNN LSTM MODEL EXECUTION - ## - # Get initial values from CuDNN LSTM so we can feed them - # to our own. - (param_extract_net, param_extract_mapping) = param_extract - workspace.RunNetOnce(param_extract_net) - cudnn_lstm_params = { - input_type: { - k: workspace.FetchBlob(v[0]) - for k, v in pars.items() - } - for input_type, pars in param_extract_mapping.items() - } - - # Run the model 3 times, so that some parameter updates are done - workspace.RunNet(cudnn_model.net.Proto().name, 3) - - ## - ## OWN LSTM MODEL EXECUTION - ## - # Map the cuDNN parameters to our own - workspace.RunNetOnce(own_model.param_init_net) - rnn_cell.InitFromLSTMParams(own_params, cudnn_lstm_params) - - # Run the model 3 times, so that some parameter updates are done - workspace.CreateNet(own_model.net) - workspace.RunNet(own_model.net.Proto().name, 3) - - ## - ## COMPARE RESULTS - ## - # Then compare that final results after 3 runs are equal - own_output_data = workspace.FetchBlob(own_output) - own_last_hidden = workspace.FetchBlob(own_last_hidden) - own_loss = workspace.FetchBlob(own_loss) - - cudnn_output_data = workspace.FetchBlob(cudnn_output) - cudnn_last_hidden = workspace.FetchBlob(cudnn_last_hidden) - cudnn_loss = workspace.FetchBlob(cudnn_loss) - - self.assertTrue(np.allclose(own_output_data, cudnn_output_data)) - self.assertTrue(np.allclose(own_last_hidden, cudnn_last_hidden)) - self.assertTrue(np.allclose(own_loss, cudnn_loss)) diff --git a/caffe2/python/operator_test/data_couple_op_test.py b/caffe2/python/operator_test/data_couple_op_test.py deleted file mode 100644 index d840207159b2..000000000000 --- a/caffe2/python/operator_test/data_couple_op_test.py +++ /dev/null @@ -1,30 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import numpy as np - - -class TestDataCoupleOp(TestCase): - - def test_data_couple_op(self): - param_array = np.random.rand(10, 10) - gradient_array = np.random.rand(10, 10) - extra_array = np.random.rand(10, 10) - workspace.FeedBlob("param", param_array) - workspace.FeedBlob("gradient", gradient_array) - workspace.FeedBlob("extraBlob", extra_array) - - workspace.RunOperatorOnce(core.CreateOperator( - "DataCouple", - ["param", "gradient", "extraBlob"], - ["param", "gradient"])) - - result1 = workspace.FetchBlob('param') - result2 = workspace.FetchBlob('gradient') - - self.assertFalse((result1 - param_array).any()) - self.assertFalse((result2 - gradient_array).any()) diff --git a/caffe2/python/operator_test/dataset_ops_test.py b/caffe2/python/operator_test/dataset_ops_test.py deleted file mode 100644 index 
7121258de127..000000000000 --- a/caffe2/python/operator_test/dataset_ops_test.py +++ /dev/null @@ -1,699 +0,0 @@ -import functools -import operator -import string - -import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt -from caffe2.python import core, dataset, workspace -from caffe2.python.dataset import Const -from caffe2.python.schema import ( - FeedRecord, - FetchRecord, - Field, - List, - Map, - NewRecord, - Scalar, - Struct, - from_blob_list, -) -from caffe2.python.test_util import TestCase -from hypothesis import given - - -def _assert_arrays_equal(actual, ref, err_msg): - if ref.dtype.kind in ("S", "O", "U"): - np.testing.assert_array_equal(actual, ref, err_msg=err_msg) - else: - np.testing.assert_allclose(actual, ref, atol=1e-4, rtol=1e-4, err_msg=err_msg) - - -def _assert_records_equal(actual, ref): - assert isinstance(actual, Field) - assert isinstance(ref, Field) - b1 = actual.field_blobs() - b2 = ref.field_blobs() - assert len(b1) == len(b2), "Records have different lengths: %d vs. %d" % ( - len(b1), - len(b2), - ) - for name, d1, d2 in zip(ref.field_names(), b1, b2): - _assert_arrays_equal(d1, d2, err_msg="Mismatch in field %s." % name) - - -@st.composite -def _sparse_features_map(draw, num_records, **kwargs): - sparse_maps_lengths = draw( - st.lists( - st.integers(min_value=1, max_value=10), - min_size=num_records, - max_size=num_records, - ) - ) - - sparse_maps_total_length = sum(sparse_maps_lengths) - - sparse_keys = draw( - st.lists( - st.integers(min_value=1, max_value=100), - min_size=sparse_maps_total_length, - max_size=sparse_maps_total_length, - unique=True, - ) - ) - - sparse_values_lengths = draw( - st.lists( - st.integers(min_value=1, max_value=10), - min_size=sparse_maps_total_length, - max_size=sparse_maps_total_length, - ) - ) - - total_sparse_values_lengths = sum(sparse_values_lengths) - - sparse_values = draw( - # max_value is max int64 - st.lists( - st.integers(min_value=1, max_value=9223372036854775807), - min_size=total_sparse_values_lengths, - max_size=total_sparse_values_lengths, - ) - ) - - return [ - sparse_maps_lengths, - sparse_keys, - sparse_values_lengths, - sparse_values, - ] - - -@st.composite -def _dense_features_map(draw, num_records, **kwargs): - float_lengths = draw( - st.lists( - st.integers(min_value=1, max_value=10), - min_size=num_records, - max_size=num_records, - ) - ) - - total_length = sum(float_lengths) - - float_keys = draw( - st.lists( - st.integers(min_value=1, max_value=100), - min_size=total_length, - max_size=total_length, - unique=True, - ) - ) - - float_values = draw( - st.lists(st.floats(), min_size=total_length, max_size=total_length) - ) - - return [float_lengths, float_keys, float_values] - - -@st.composite -def _dataset(draw, min_elements=3, max_elements=10, **kwargs): - schema = Struct( - # Dense Features Map - ("floats", Map(Scalar(np.int32), Scalar(np.float32))), - # Sparse Features Map - ( - "int_lists", - Map( - Scalar(np.int32), - List(Scalar(np.int64)), - ), - ), - # Complex Type - ("text", Scalar(str)), - ) - - num_records = draw(st.integers(min_value=min_elements, max_value=max_elements)) - - raw_dense_features_map_contents = draw(_dense_features_map(num_records)) - - raw_sparse_features_map_contents = draw(_sparse_features_map(num_records)) - - raw_text_contents = [ - draw( - st.lists( - st.text(alphabet=string.ascii_lowercase), - min_size=num_records, - max_size=num_records, - ) - ) - ] - - # Concatenate all raw contents to a single one - contents_raw = ( - 
raw_dense_features_map_contents - + raw_sparse_features_map_contents - + raw_text_contents - ) - - contents = from_blob_list(schema, contents_raw) - - return (schema, contents, num_records) - - -class TestDatasetOps(TestCase): - @given(_dataset()) - def test_pack_unpack(self, input): - """ - Tests if packing and unpacking of the whole dataset is an identity. - """ - (schema, contents, num_records) = input - - dataset_fields = schema.field_names() - - for pack_to_single_shared_ptr in (True, False): - net = core.Net("pack_unpack_net") - batch = NewRecord(net, contents) - FeedRecord(batch, contents) - - packed = net.PackRecords( - batch.field_blobs(), - 1, - fields=dataset_fields, - pack_to_single_shared_ptr=pack_to_single_shared_ptr, - ) - - unpacked = packed.UnPackRecords( - [], len(dataset_fields), fields=dataset_fields - ) - - workspace.RunNetOnce(net) - - for initial_tensor, unpacked_tensor in zip(batch.field_blobs(), unpacked): - npt.assert_array_equal( - workspace.FetchBlob(initial_tensor), - workspace.FetchBlob(unpacked_tensor), - ) - - def test_dataset_ops(self): - """ - 1. Defining the schema of our dataset. - - This example schema could represent, for example, a search query log. - """ - schema = Struct( - # fixed size vector, which will be stored as a matrix when batched - ("dense", Scalar((np.float32, 3))), - # could represent a feature map from feature ID to float value - ("floats", Map(Scalar(np.int32), Scalar(np.float32))), - # could represent a multi-valued categorical feature map - ( - "int_lists", - Map( - Scalar(np.int32), - List(Scalar(np.int64)), - ), - ), - # could represent a multi-valued, weighted categorical feature map - ( - "id_score_pairs", - Map( - Scalar(np.int32), - Map( - Scalar(np.int64), - Scalar(np.float32), - keys_name="ids", - values_name="scores", - ), - ), - ), - # additional scalar information - ( - "metadata", - Struct( - ("user_id", Scalar(np.int64)), - ("user_embed", Scalar((np.float32, 2))), - ("query", Scalar(str)), - ), - ), - ) - """ - This is what the flattened fields for this schema look like, along - with its type. Each one of these fields will be stored, read and - written as a tensor. - """ - expected_fields = [ - ("dense", (np.float32, 3)), - ("floats:lengths", np.int32), - ("floats:values:keys", np.int32), - ("floats:values:values", np.float32), - ("int_lists:lengths", np.int32), - ("int_lists:values:keys", np.int32), - ("int_lists:values:values:lengths", np.int32), - ("int_lists:values:values:values", np.int64), - ("id_score_pairs:lengths", np.int32), - ("id_score_pairs:values:keys", np.int32), - ("id_score_pairs:values:values:lengths", np.int32), - ("id_score_pairs:values:values:values:ids", np.int64), - ("id_score_pairs:values:values:values:scores", np.float32), - ("metadata:user_id", np.int64), - ("metadata:user_embed", (np.float32, 2)), - ("metadata:query", str), - ] - zipped = zip(expected_fields, schema.field_names(), schema.field_types()) - for (ref_name, ref_type), name, dtype in zipped: - self.assertEqual(ref_name, name) - self.assertEqual(np.dtype(ref_type), dtype) - """ - 2. The contents of our dataset. - - Contents as defined below could represent, for example, a log of - search queries along with dense, sparse features and metadata. - The dataset below has 3 top-level entries. 
- """ - contents_raw = [ - # dense - [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]], - # floats - [1, 2, 3], # len - [11, 21, 22, 31, 32, 33], # key - [1.1, 2.1, 2.2, 3.1, 3.2, 3.3], # value - # int lists - [2, 0, 1], # len - [11, 12, 31], # key - [2, 4, 3], # value:len - [111, 112, 121, 122, 123, 124, 311, 312, 313], # value:value - # id score pairs - [1, 2, 2], # len - [11, 21, 22, 31, 32], # key - [1, 1, 2, 2, 3], # value:len - [111, 211, 221, 222, 311, 312, 321, 322, 323], # value:ids - [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3], # val:score - # metadata - [123, 234, 456], # user_id - [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]], # user_embed - ["dog posts", "friends who like to", "posts about ca"], # query - ] - # convert the above content to ndarrays, checking against the schema - contents = from_blob_list(schema, contents_raw) - """ - 3. Creating and appending to the dataset. - We first create an empty dataset with the given schema. - Then, a Writer is used to append these entries to the dataset. - """ - ds = dataset.Dataset(schema) - net = core.Net("init") - with core.NameScope("init"): - ds.init_empty(net) - - content_blobs = NewRecord(net, contents) - FeedRecord(content_blobs, contents) - writer = ds.writer(init_net=net) - writer.write_record(net, content_blobs) - workspace.RunNetOnce(net) - """ - 4. Iterating through the dataset contents. - - If we were to iterate through the top level entries of our dataset, - this is what we should expect to see: - """ - entries_raw = [ - ( - [[1.1, 1.2, 1.3]], # dense - [1], - [11], - [1.1], # floats - [2], - [11, 12], - [2, 4], - [111, 112, 121, 122, 123, 124], # intlst - [1], - [11], - [1], - [111], - [11.1], # id score pairs - [123], - [[0.2, 0.8]], - ["dog posts"], # metadata - ), - ( - [[2.1, 2.2, 2.3]], # dense - [2], - [21, 22], - [2.1, 2.2], # floats - [0], - [], - [], - [], # int list - [2], - [21, 22], - [1, 2], - [211, 221, 222], - [21.1, 22.1, 22.2], - [234], - [[0.5, 0.5]], - ["friends who like to"], # metadata - ), - ( - [[3.1, 3.2, 3.3]], # dense - [3], - [31, 32, 33], - [3.1, 3.2, 3.3], # floats - [1], - [31], - [3], - [311, 312, 313], # int lst - [2], - [31, 32], - [2, 3], - [311, 312, 321, 322, 323], - [31.1, 31.2, 32.1, 32.2, 32.3], # id score list - [456], - [[0.7, 0.3]], - ["posts about ca"], # metadata - ), - # after the end of the dataset, we will keep getting empty vectors - ([],) * 16, - ([],) * 16, - ] - entries = [from_blob_list(schema, e) for e in entries_raw] - """ - Let's go ahead and create the reading nets. - We will run `read` net multiple times and assert that we are reading the - entries the way we stated above. - """ - read_init_net = core.Net("read_init") - read_next_net = core.Net("read_next") - reader = ds.reader(read_init_net) - should_continue, batch = reader.read_record(read_next_net) - - workspace.RunNetOnce(read_init_net) - workspace.CreateNet(read_next_net, True) - - for entry in entries: - workspace.RunNet(str(read_next_net)) - actual = FetchRecord(batch) - _assert_records_equal(actual, entry) - """ - 5. Reading/writing in a single plan - - If all of operations on the data are expressible as Caffe2 operators, - we don't need to load the data to python, iterating through the dataset - in a single Plan. - - Where we will process the dataset a little and store it in a second - dataset. We can reuse the same Reader since it supports reset. 
- """ - reset_net = core.Net("reset_net") - reader.reset(reset_net) - read_step, batch = reader.execution_step() - """ We will add the line number * 1000 to the feature ids. """ - process_net = core.Net("process") - line_no = Const(process_net, 0, dtype=np.int32) - const_one = Const(process_net, 1000, dtype=np.int32) - process_net.Add([line_no, const_one], [line_no]) - field = batch.floats.keys.get() - process_net.Print(field, []) - process_net.Add([field, line_no], field, broadcast=1, axis=0) - """ Lets create a second dataset and append to it. """ - ds2 = dataset.Dataset(schema, name="dataset2") - ds2.init_empty(reset_net) - writer = ds2.writer(reset_net) - writer.write_record(process_net, batch) - # commit is not necessary for DatasetWriter but will add it for - # generality of the example - commit_net = core.Net("commit") - writer.commit(commit_net) - """ Time to create and run a plan which will do the processing """ - plan = core.Plan("process") - plan.AddStep(core.execution_step("reset", reset_net)) - plan.AddStep(read_step.AddNet(process_net)) - plan.AddStep(core.execution_step("commit", commit_net)) - workspace.RunPlan(plan) - """ - Now we should have dataset2 populated. - """ - ds2_data = FetchRecord(ds2.content()) - field = ds2_data.floats.keys - field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000]) - _assert_records_equal(contents, ds2_data) - """ - 6. Slicing a dataset - - You can create a new schema from pieces of another schema and reuse - the same data. - """ - subschema = Struct(("top_level", schema.int_lists.values)) - int_list_contents = contents.int_lists.values.field_names() - self.assertEqual(len(subschema.field_names()), len(int_list_contents)) - """ - 7. Random Access a dataset - - """ - read_init_net = core.Net("read_init") - read_next_net = core.Net("read_next") - - idx = np.array([2, 1, 0]) - indices_blob = Const(read_init_net, idx, name="indices") - reader = ds.random_reader(read_init_net, indices_blob) - reader.computeoffset(read_init_net) - - should_stop, batch = reader.read_record(read_next_net) - - workspace.CreateNet(read_init_net, True) - workspace.RunNetOnce(read_init_net) - - workspace.CreateNet(read_next_net, True) - - for i in range(len(entries)): - k = idx[i] if i in idx else i - entry = entries[k] - workspace.RunNet(str(read_next_net)) - actual = FetchRecord(batch) - _assert_records_equal(actual, entry) - workspace.RunNet(str(read_next_net)) - self.assertEqual(True, workspace.FetchBlob(should_stop)) - """ - 8. Random Access a dataset with loop_over = true - - """ - read_init_net = core.Net("read_init") - read_next_net = core.Net("read_next") - - idx = np.array([2, 1, 0]) - indices_blob = Const(read_init_net, idx, name="indices") - reader = ds.random_reader(read_init_net, indices_blob, loop_over=True) - reader.computeoffset(read_init_net) - - should_stop, batch = reader.read_record(read_next_net) - - workspace.CreateNet(read_init_net, True) - workspace.RunNetOnce(read_init_net) - - workspace.CreateNet(read_next_net, True) - - for _ in range(len(entries) * 3): - workspace.RunNet(str(read_next_net)) - self.assertEqual(False, workspace.FetchBlob(should_stop)) - """ - 9. Sort and shuffle a dataset - - This sort the dataset using the score of a certain column, - and then shuffle within each chunk of size batch_size * shuffle_size - before shuffling the chunks. 
- - """ - read_init_net = core.Net("read_init") - read_next_net = core.Net("read_next") - - reader = ds.random_reader(read_init_net) - reader.sort_and_shuffle(read_init_net, "int_lists:lengths", 1, 2) - reader.computeoffset(read_init_net) - - should_continue, batch = reader.read_record(read_next_net) - - workspace.CreateNet(read_init_net, True) - workspace.RunNetOnce(read_init_net) - - workspace.CreateNet(read_next_net, True) - - expected_idx = np.array([2, 1, 0]) - for i in range(len(entries)): - k = expected_idx[i] if i in expected_idx else i - entry = entries[k] - workspace.RunNet(str(read_next_net)) - actual = FetchRecord(batch) - _assert_records_equal(actual, entry) - - """ - Trim a dataset - """ - trim_net = core.Net("trim_ds") - ds.trim(trim_net, multiple_of=2) - workspace.RunNetOnce(trim_net) - trimmed = FetchRecord(ds.content()) - EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2] - actual_sizes = [d.shape[0] for d in trimmed.field_blobs()] - self.assertEqual(EXPECTED_SIZES, actual_sizes) - - def test_last_n_window_ops(self): - collect_net = core.Net("collect_net") - collect_net.GivenTensorFill( - [], - "input", - shape=[3, 2], - values=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ) - input_array = np.array(list(range(1, 7)), dtype=np.float32).reshape(3, 2) - - workspace.CreateBlob("output") - workspace.FeedBlob("next", np.array(0, dtype=np.int32)) - collect_net.LastNWindowCollector( - ["output", "next", "input"], - ["output", "next"], - num_to_collect=7, - ) - plan = core.Plan("collect_data") - plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=1)) - workspace.RunPlan(plan) - reference_result = workspace.FetchBlob("output") - npt.assert_array_equal(input_array, reference_result) - - plan = core.Plan("collect_data") - plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=2)) - workspace.RunPlan(plan) - reference_result = workspace.FetchBlob("output") - npt.assert_array_equal(input_array[[1, 2, 2, 0, 1, 2, 0]], reference_result) - - plan = core.Plan("collect_data") - plan.AddStep(core.execution_step("collect_data", [collect_net], num_iter=3)) - workspace.RunPlan(plan) - reference_result = workspace.FetchBlob("output") - npt.assert_array_equal(input_array[[2, 0, 1, 2, 2, 0, 1]], reference_result) - - def test_last_n_window_ops_shape_inference(self): - collect_net = core.Net("collect_net") - collect_net.GivenTensorFill( - [], - "input", - shape=[3, 2], - values=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], - ) - - workspace.CreateBlob("output") - workspace.FeedBlob("next", np.array(0, dtype=np.int32)) - collect_net.LastNWindowCollector( - ["output", "next", "input"], - ["output", "next"], - num_to_collect=7, - ) - (shapes, types) = workspace.InferShapesAndTypes([collect_net]) - workspace.RunNetOnce(collect_net) - - self.assertTrue( - np.array_equal( - shapes["output"], np.array([7, workspace.blobs["output"].shape[1]]) - ) - ) - - def test_last_n_window_ops_shape_inference_4d_input(self): - input_shape = [3, 2, 4, 5] - collect_net = core.Net("collect_net") - collect_net.GivenTensorFill( - [], - "input", - shape=input_shape, - values=[ - float(val) for val in range(functools.reduce(operator.mul, input_shape)) - ], - ) - - workspace.CreateBlob("output") - workspace.FeedBlob("next", np.array(0, dtype=np.int32)) - collect_net.LastNWindowCollector( - ["output", "next", "input"], - ["output", "next"], - num_to_collect=7, - ) - (shapes, types) = workspace.InferShapesAndTypes([collect_net]) - workspace.RunNetOnce(collect_net) - - self.assertTrue( - 
np.array_equal( - shapes["output"], np.array([7, *list(workspace.blobs["output"].shape[1:])]) - ) - ) - - def test_collect_tensor_ops(self): - init_net = core.Net("init_net") - blobs = ["blob_1", "blob_2", "blob_3"] - bvec_map = {} - ONE = init_net.ConstantFill([], "ONE", shape=[1, 2], value=1) - for b in blobs: - init_net.ConstantFill([], [b], shape=[1, 2], value=0) - bvec_map[b] = b + "_vec" - init_net.CreateTensorVector([], [bvec_map[b]]) - - reader_net = core.Net("reader_net") - for b in blobs: - reader_net.Add([b, ONE], [b]) - - collect_net = core.Net("collect_net") - num_to_collect = 1000 - max_example_to_cover = 100000 - bvec = [bvec_map[b] for b in blobs] - collect_net.CollectTensor( - bvec + blobs, - bvec, - num_to_collect=num_to_collect, - ) - - print("Collect Net Proto: {}".format(collect_net.Proto())) - - plan = core.Plan("collect_data") - plan.AddStep(core.execution_step("collect_init", init_net)) - plan.AddStep( - core.execution_step( - "collect_data", [reader_net, collect_net], num_iter=max_example_to_cover - ) - ) - workspace.RunPlan(plan) - - # concat the collected tensors - concat_net = core.Net("concat_net") - bconcated_map = {} - bsize_map = {} - for b in blobs: - bconcated_map[b] = b + "_concated" - bsize_map[b] = b + "_size" - concat_net.ConcatTensorVector([bvec_map[b]], [bconcated_map[b]]) - concat_net.TensorVectorSize([bvec_map[b]], [bsize_map[b]]) - - workspace.RunNetOnce(concat_net) - - # check data - reference_result = workspace.FetchBlob(bconcated_map[blobs[0]]) - self.assertEqual( - reference_result.shape, (min(num_to_collect, max_example_to_cover), 2) - ) - size = workspace.FetchBlob(bsize_map[blobs[0]]) - self.assertEqual(tuple(), size.shape) - self.assertEqual(min(num_to_collect, max_example_to_cover), size.item()) - - hist, _ = np.histogram( - reference_result[:, 0], bins=10, range=(1, max_example_to_cover) - ) - print("Sample histogram: {}".format(hist)) - - self.assertTrue(all(hist > 0.6 * (num_to_collect / 10))) - for i in range(1, len(blobs)): - result = workspace.FetchBlob(bconcated_map[blobs[i]]) - self.assertEqual(reference_result.tolist(), result.tolist()) - - -if __name__ == "__main__": - import unittest - - unittest.main() diff --git a/caffe2/python/operator_test/decay_adagrad_test.py b/caffe2/python/operator_test/decay_adagrad_test.py deleted file mode 100644 index 9004c1c26b96..000000000000 --- a/caffe2/python/operator_test/decay_adagrad_test.py +++ /dev/null @@ -1,68 +0,0 @@ -import functools - -from hypothesis import given -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - - -class TestDecayAdagrad(hu.HypothesisTestCase): - - @staticmethod - def ref_decay_adagrad(param, mom1, mom2, grad, LR, ITER, - beta1, beta2, epsilon, weight_decay, bias_correction_first, output_grad=False): - t = ITER + 1 - mom1_out = (beta1 * mom1) + (1 - beta1) * grad - mom2_out = mom2 + np.square(grad) - if bias_correction_first: - c = 1 - np.power(beta1, t) - else: - c = 1.0 - grad_out = mom1_out / c / (np.sqrt(mom2_out) + epsilon) + weight_decay * param - param_out = param + LR * grad_out - - return param_out, mom1_out, mom2_out - - @given(inputs=hu.tensors(n=4), - ITER=st.integers(min_value=0, max_value=10000), - LR=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta1=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - beta2=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), 
- epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - weight_decay=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - def test_decay_adagrad(self, inputs, ITER, LR, beta1, beta2, epsilon, weight_decay, gc, dc): - bias_correction_first = True - - param, mom1, mom2, grad = inputs - mom2 = np.abs(mom2) - ITER = np.array([ITER], dtype=np.int64) - LR = np.array([LR], dtype=np.float32) - - op = core.CreateOperator( - "DecayAdagrad", - ["param", "mom1", "mom2", "grad", "lr", "iter"], - ["output_param", "output_mom1", "output_mom2"], - beta1=beta1, beta2=beta2, epsilon=epsilon, weight_decay=weight_decay, bias_correction_first=bias_correction_first) - - # Iter lives on the CPU - input_device_options = {'iter': hu.cpu_do} - - self.assertReferenceChecks( - gc, op, - [param, mom1, mom2, grad, LR, ITER], - functools.partial( - self.ref_decay_adagrad, - beta1=beta1, beta2=beta2, epsilon=epsilon, weight_decay=weight_decay, bias_correction_first=bias_correction_first), - input_device_options=input_device_options) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/deform_conv_test.py b/caffe2/python/operator_test/deform_conv_test.py deleted file mode 100644 index 67289de5e924..000000000000 --- a/caffe2/python/operator_test/deform_conv_test.py +++ /dev/null @@ -1,603 +0,0 @@ - - -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, utils, workspace -from hypothesis import assume, given - - -def _cudnn_supports(dilation=False, nhwc=False): - """Return True if cuDNN supports this configuration.""" - v = workspace.GetCuDNNVersion() - if dilation and v < 6000: - # Dilation not supported until v6 - return False - if dilation and nhwc: - # Dilation and NHWC not supported together - return False - return True - - -def _conv_1d_output_size(size, kernel, pad, dilation, stride): - return max(1, int((size + pad * 2 - (dilation * (kernel - 1) + 1)) / stride) + 1) - - -def _conv_2d_output_size(size, kernel, pad_h, pad_w, dilation, stride_h, stride_w): - return [ - _conv_1d_output_size(size, kernel, pad_h, dilation, stride_h), - _conv_1d_output_size(size, kernel, pad_w, dilation, stride_w), - ] - - -def _conv_2d_offsets_dims( - batch_size, - size, - kernel, - pad_h, - pad_w, - dilation, - stride_h, - stride_w, - deformable_group, -): - dims = [batch_size, 2 * kernel * kernel * deformable_group] - dims.extend( - _conv_2d_output_size(size, kernel, pad_h, pad_w, dilation, stride_h, stride_w) - ) - return dims - - -def _conv_2d_random_offsets(batch_size, kernel, dims, num_deformable_group): - o = [] - for y0 in range(0, kernel): - for x0 in range(0, kernel): - # stay away from integer offsets which correspond to "ridges" on the - # interpolated surface resulting in less precise estimates - x = np.random.randint(0, kernel) + np.random.uniform(0.05, 0.95) - y = np.random.randint(0, kernel) + np.random.uniform(0.05, 0.95) - o.append(y - y0) - o.append(x - x0) - o = o * num_deformable_group - e = [] - for v in o: - e.append([[v] * dims[1]] * dims[0]) - return np.array([e] * batch_size).astype(np.float32) - - -def _conv_2d_shuffle_offsets( - batch_size, kernel, dims, num_deformable_group, input_channels, output_channels -): - o = [] - w0 = [[0 for x in range(kernel)] for y in range(kernel)] - for y0 in range(0, kernel): - for x0 in range(0, 
kernel): - x = np.random.randint(0, kernel) - y = np.random.randint(0, kernel) - o.append(y - y0) - o.append(x - x0) - w0[y][x] += 1 - o = o * num_deformable_group - e = [] - for v in o: - e.append([[v] * int(dims[1])] * int(dims[0])) - w0 = [[w0] * input_channels] * output_channels - return ( - np.array([e] * batch_size).astype(np.float32), - utils.NCHW2NHWC(np.array(w0).astype(np.float32)), - ) - - -class TestConvolution(hu.HypothesisTestCase): - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given( - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), - use_bias=st.booleans(), - deformable_group=st.integers(1, 3), - **hu.gcs_gpu_only - ) - def test_null_offset_convolution( - self, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - order, - engine, - use_bias, - deformable_group, - gc, - dc, - ): - dkernel = dilation * (kernel - 1) + 1 - - if gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN": - assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC"))) - - assume(engine != "MKLDNN" or use_bias is True) - - op = core.CreateOperator( - "DeformConv", - ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - deformable_group=deformable_group, - ) - offset_dims = _conv_2d_offsets_dims( - batch_size, - size, - kernel, - pad, - pad, - dilation, - stride, - stride, - deformable_group, - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - o = np.zeros(tuple(offset_dims), np.float32) - w = ( - np.random.rand(output_channels, kernel, kernel, input_channels).astype( - np.float32 - ) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, o, w, b] if use_bias else [X, o, w] - - # Error handling path. 
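# A quick worked check of the output-size helper defined at the top of
# this file (numbers are illustrative): with size=7, kernel=3, pad=1,
# dilation=2, stride=2 the effective kernel is dilation*(kernel-1)+1 = 5,
# so the output extent is int((7 + 2*1 - 5) / 2) + 1 = 3. With all-zero
# offsets, as in this test, DeformConv reduces to the plain Conv that
# reference_conv_op runs below.
assert _conv_1d_output_size(7, 3, 1, 2, 2) == 3
assert _conv_1d_output_size(7, 3, 1, 2, 1) == 5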
- if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if input_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if output_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - def reference_conv_op(*args): - reference_op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y0"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - device_option=gc, - ) - workspace.RunOperatorOnce(reference_op) - reference_blob = workspace.FetchBlob("Y0") - return (reference_blob,) - - self.assertReferenceChecks(gc, op, inputs, reference_conv_op) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given( - stride=st.integers(1, 3), - pad=st.integers(0, 0), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), - use_bias=st.booleans(), - deformable_group=st.integers(1, 4), - **hu.gcs_gpu_only - ) - def test_flat_input_convolution( - self, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - order, - engine, - use_bias, - deformable_group, - gc, - dc, - ): - dkernel = dilation * (kernel - 1) + 1 - - if gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN": - assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC"))) - - assume(engine != "MKLDNN" or use_bias is True) - - op = core.CreateOperator( - "DeformConv", - ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - deformable_group=deformable_group, - ) - X = np.ones((batch_size, size, size, input_channels), np.float32) - 0.5 - output_size = _conv_2d_output_size( - size, kernel, pad, pad, dilation, stride, stride - ) - o = _conv_2d_random_offsets(batch_size, kernel, output_size, deformable_group) - w = np.ones((output_channels, kernel, kernel, input_channels), np.float32) - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, o, w, b] if use_bias else [X, o, w] - - # Error handling path. 
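# Worked example of the offset layout these tests rely on (illustrative
# numbers): the offset blob carries one (dy, dx) pair per kernel tap per
# deformable group, hence 2 * kernel * kernel * deformable_group channels.
dims = _conv_2d_offsets_dims(
    batch_size=2, size=8, kernel=3, pad_h=0, pad_w=0,
    dilation=1, stride_h=1, stride_w=1, deformable_group=2,
)
assert dims == [2, 36, 6, 6]  # 2*3*3*2 = 36 channels; 8 - 3 + 1 = 6 spatial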
- if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if input_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if output_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - def reference_conv_op(*args): - reference_op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y0"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - device_option=gc, - ) - workspace.RunOperatorOnce(reference_op) - reference_blob = workspace.FetchBlob("Y0") - return (reference_blob,) - - self.assertReferenceChecks(gc, op, inputs, reference_conv_op) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given( - stride=st.integers(1, 1), - pad=st.integers(0, 0), - kernel=st.integers(1, 5), - dilation=st.integers(1, 1), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), - use_bias=st.booleans(), - deformable_group=st.integers(1, 4), - **hu.gcs_gpu_only - ) - def test_shuffle_input_convolution( - self, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - order, - engine, - use_bias, - deformable_group, - gc, - dc, - ): - dkernel = dilation * (kernel - 1) + 1 - - if gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN": - assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC"))) - - assume(engine != "MKLDNN" or use_bias is True) - - op = core.CreateOperator( - "DeformConv", - ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - deformable_group=deformable_group, - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - output_size = _conv_2d_output_size( - size, kernel, pad, pad, dilation, stride, stride - ) - o, w0 = _conv_2d_shuffle_offsets( - batch_size, - kernel, - output_size, - deformable_group, - input_channels, - output_channels, - ) - w = np.ones((output_channels, kernel, kernel, input_channels), np.float32) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - w0 = utils.NHWC2NCHW(w0) - - inputs = [X, o, w, b] if use_bias else [X, o, w] - - # Error handling path. 
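# For the 4-D tensors in these tests, the NHWC -> NCHW conversion used
# above amounts to a single axis transpose. A local stand-in (sketch,
# not the utils API itself):
import numpy as np

def nhwc2nchw(t):
    return np.transpose(t, (0, 3, 1, 2))  # N, H, W, C -> N, C, H, W

x = np.zeros((2, 8, 8, 3), dtype=np.float32)
assert nhwc2nchw(x).shape == (2, 3, 8, 8)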
- if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if input_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if output_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - def reference_conv_op(*args): - with core.DeviceScope(gc): - workspace.FeedBlob("w0", w0) - reference_op = core.CreateOperator( - "Conv", - ["X", "w0", "b"] if use_bias else ["X", "w0"], - ["Y0"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - device_option=gc, - ) - workspace.RunOperatorOnce(reference_op) - reference_blob = workspace.FetchBlob("Y0") - return (reference_blob,) - - self.assertReferenceChecks(gc, op, inputs, reference_conv_op) - - # CUDNN does NOT support different padding values and we skip it - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given( - stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_h=st.integers(0, 3), - pad_w=st.integers(0, 3), - kernel=st.integers(2, 5), - size=st.integers(1, 8), - input_channels=st.integers(1, 3), - output_channels=st.integers(1, 3), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW"]), - shared_buffer=st.booleans(), - use_bias=st.booleans(), - deformable_group=st.integers(1, 3), - **hu.gcs_gpu_only - ) - def test_conv_separate_stride_pad_gradients( - self, - stride_h, - stride_w, - pad_h, - pad_w, - kernel, - size, - input_channels, - output_channels, - batch_size, - order, - shared_buffer, - use_bias, - deformable_group, - gc, - dc, - ): - op = core.CreateOperator( - "DeformConv", - ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - pad_t=pad_h, - pad_l=pad_w, - pad_b=pad_h, - pad_r=pad_w, - kernel=kernel, - order=order, - shared_buffer=int(shared_buffer), - deformable_group=deformable_group, - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - output_size = _conv_2d_output_size( - size, kernel, pad_h, pad_w, 1, stride_h, stride_w - ) - o = _conv_2d_random_offsets(batch_size, kernel, output_size, deformable_group) - w = ( - np.random.rand(output_channels, kernel, kernel, input_channels).astype( - np.float32 - ) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, o, w, b] if use_bias else [X, o, w] - - # Error handling path. 
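# The guard below rejects shapes where the padded input cannot cover the
# kernel in either dimension. With illustrative numbers: size=1, pad_h=0,
# kernel=2 gives 1 + 0*2 < 2, so the op is expected to raise; size=4,
# pad_h=1, kernel=3 gives 4 + 1*2 >= 3 and takes the normal path.
assert 1 + 0 * 2 < 2
assert not (4 + 1 * 2 < 3)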
- if size + pad_h * 2 < kernel or size + pad_w * 2 < kernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if input_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if output_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given( - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - input_channels=st.integers(1, 8), - output_channels=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), - use_bias=st.booleans(), - deformable_group=st.integers(1, 3), - **hu.gcs_gpu_only - ) - def test_conv_gradients( - self, - stride, - pad, - kernel, - dilation, - size, - input_channels, - output_channels, - batch_size, - order, - engine, - use_bias, - deformable_group, - gc, - dc, - ): - dkernel = dilation * (kernel - 1) + 1 - - if gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN": - assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == "NHWC"))) - - assume(engine != "MKLDNN" or use_bias is True) - - op = core.CreateOperator( - "DeformConv", - ["X", "o", "w", "b"] if use_bias else ["X", "o", "w"], - ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - engine=engine, - deformable_group=deformable_group, - ) - X = ( - np.random.rand(batch_size, size, size, input_channels).astype(np.float32) - - 0.5 - ) - output_size = _conv_2d_output_size( - size, kernel, pad, pad, dilation, stride, stride - ) - o = _conv_2d_random_offsets(batch_size, kernel, output_size, deformable_group) - w = ( - np.random.rand(output_channels, kernel, kernel, input_channels).astype( - np.float32 - ) - - 0.5 - ) - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, o, w, b] if use_bias else [X, o, w] - # Error handling path. 
- if size + pad + pad < dkernel or size + pad + pad < dkernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if input_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - if output_channels % deformable_group != 0: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - import unittest - - unittest.main() diff --git a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py b/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py deleted file mode 100644 index 8b6f42417fd4..000000000000 --- a/caffe2/python/operator_test/dense_vector_to_id_list_op_test.py +++ /dev/null @@ -1,66 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu - -import hypothesis.extra.numpy as hnp -import hypothesis.strategies as st -import numpy as np - - -@st.composite -def id_list_batch(draw): - batch_size = draw(st.integers(2, 2)) - values_dtype = np.float32 - inputs = [] - sample_size = draw(st.integers(5, 10)) - for _ in range(batch_size): - values = draw(hnp.arrays(values_dtype, sample_size, st.integers(0, 1))) - inputs += [values] - return [np.array(inputs)] - - -def dense_vector_to_id_list_ref(*arg): - arg = arg[0] - batch_size = len(arg) - assert batch_size > 0 - out_length = [] - out_values = [] - for row in arg: - length = 0 - for idx, entry in enumerate(row): - if entry != 0: - out_values += [idx] - length += 1 - out_length += [length] - return (out_length, out_values) - - -class TestDenseVectorToIdList(hu.HypothesisTestCase): - def test_dense_vector_to_id_list_ref(self): - # Verify that the reference implementation is correct! 
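# [Editor's note] Put differently: for every row, the reference emits the
# column indices of the nonzero entries (the id list) plus a per-row count,
# exactly what numpy's nonzero gives. An editor's illustration, not part of
# the original test:
#   import numpy as np
#   row = np.array([1, 0, 0, 1, 0, 0, 0, 1], dtype=np.float32)
#   np.nonzero(row)[0]      # -> array([0, 3, 7]), this row's id list
#   np.count_nonzero(row)   # -> 3, this row's length entry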
- dense_input = np.array( - [[1, 0, 0, 1, 0, 0, 0, 1], - [1, 0, 1, 0, 0, 0, 0, 1], - [0, 1, 0, 0, 0, 1, 0, 1]], - dtype=np.float32) - sparse_lengths, sparse_values = dense_vector_to_id_list_ref(dense_input) - expected_lengths = np.array([3, 3, 3], dtype=np.int32) - expected_values = np.array([0, 3, 7, 0, 2, 7, 1, 5, 7], dtype=np.int64) - - np.testing.assert_array_equal(sparse_lengths, expected_lengths) - np.testing.assert_array_equal(sparse_values, expected_values) - - @given(inputs=id_list_batch(), **hu.gcs_cpu_only) - def test_dense_vector_to_id_list_op(self, inputs, gc, dc): - op = core.CreateOperator( - "DenseVectorToIdList", - ["values"], - ["out_lengths", "out_values"] - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - self.assertReferenceChecks(gc, op, inputs, dense_vector_to_id_list_ref) diff --git a/caffe2/python/operator_test/depthwise_3x3_conv_test.py b/caffe2/python/operator_test/depthwise_3x3_conv_test.py deleted file mode 100644 index cdfffce288dd..000000000000 --- a/caffe2/python/operator_test/depthwise_3x3_conv_test.py +++ /dev/null @@ -1,56 +0,0 @@ - - - - - -import numpy as np -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils -from hypothesis import given, settings -import hypothesis.strategies as st - - -class Depthwise3x3ConvOpsTest(hu.HypothesisTestCase): - @given(pad=st.integers(0, 1), - kernel=st.integers(3, 3), - size=st.integers(4, 8), - channels=st.integers(2, 4), - batch_size=st.integers(1, 1), - order=st.sampled_from(["NCHW"]), - engine=st.sampled_from(["DEPTHWISE_3x3"]), - use_bias=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_convolution_gradients(self, pad, kernel, size, - channels, batch_size, - order, engine, use_bias, gc, dc): - op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - kernel=kernel, - pad=pad, - group=channels, - order=order, - engine=engine, - ) - X = np.random.rand( - batch_size, size, size, channels).astype(np.float32) - 0.5 - w = np.random.rand( - channels, kernel, kernel, 1).astype(np.float32)\ - - 0.5 - b = np.random.rand(channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - # Error handling path. - if size + pad + pad < kernel or size + pad + pad < kernel: - with self.assertRaises(RuntimeError): - self.assertDeviceChecks(dc, op, inputs, [0]) - return - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) diff --git a/caffe2/python/operator_test/detectron_keypoints.py b/caffe2/python/operator_test/detectron_keypoints.py deleted file mode 100644 index 319e8b5bbffd..000000000000 --- a/caffe2/python/operator_test/detectron_keypoints.py +++ /dev/null @@ -1,203 +0,0 @@ - - - - - -try: - import cv2 -except ImportError: - pass # skip if opencv is not available -import numpy as np - - -# === copied from utils/keypoints.py as reference === -_NUM_KEYPOINTS = -1 # cfg.KRCNN.NUM_KEYPOINTS -_INFERENCE_MIN_SIZE = 0 # cfg.KRCNN.INFERENCE_MIN_SIZE - - -def heatmaps_to_keypoints(maps, rois): - """Extracts predicted keypoint locations from heatmaps. Output has shape - (#rois, 4, #keypoints) with the 4 rows corresponding to (x, y, logit, prob) - for each keypoint. - """ - # This function converts a discrete image coordinate in a HEATMAP_SIZE x - # HEATMAP_SIZE image to a continuous keypoint coordinate. 
We maintain - # consistency with keypoints_to_heatmap_labels by using the conversion from - # Heckbert 1990: c = d + 0.5, where d is a discrete coordinate and c is a - # continuous coordinate. - offset_x = rois[:, 0] - offset_y = rois[:, 1] - - widths = rois[:, 2] - rois[:, 0] - heights = rois[:, 3] - rois[:, 1] - widths = np.maximum(widths, 1) - heights = np.maximum(heights, 1) - widths_ceil = np.ceil(widths).astype(int) - heights_ceil = np.ceil(heights).astype(int) - - num_keypoints = np.maximum(maps.shape[1], _NUM_KEYPOINTS) - - # NCHW to NHWC for use with OpenCV - maps = np.transpose(maps, [0, 2, 3, 1]) - min_size = _INFERENCE_MIN_SIZE - - xy_preds = np.zeros( - (len(rois), 4, num_keypoints), dtype=np.float32) - for i in range(len(rois)): - if min_size > 0: - roi_map_width = int(np.maximum(widths_ceil[i], min_size)) - roi_map_height = int(np.maximum(heights_ceil[i], min_size)) - else: - roi_map_width = widths_ceil[i] - roi_map_height = heights_ceil[i] - width_correction = widths[i] / roi_map_width - height_correction = heights[i] / roi_map_height - roi_map = cv2.resize( - maps[i], (roi_map_width, roi_map_height), - interpolation=cv2.INTER_CUBIC) - - # Bring back to CHW - roi_map = np.transpose(roi_map, [2, 0, 1]) - roi_map_probs = scores_to_probs(roi_map.copy()) - w = roi_map.shape[2] - for k in range(num_keypoints): - pos = roi_map[k, :, :].argmax() - x_int = pos % w - y_int = (pos - x_int) // w - assert (roi_map_probs[k, y_int, x_int] == - roi_map_probs[k, :, :].max()) - x = (x_int + 0.5) * width_correction - y = (y_int + 0.5) * height_correction - xy_preds[i, 0, k] = x + offset_x[i] - xy_preds[i, 1, k] = y + offset_y[i] - xy_preds[i, 2, k] = roi_map[k, y_int, x_int] - xy_preds[i, 3, k] = roi_map_probs[k, y_int, x_int] - - return xy_preds - - -def scores_to_probs(scores): - """Transforms CxHxW of scores to probabilities spatially.""" - channels = scores.shape[0] - for c in range(channels): - temp = scores[c, :, :] - max_score = temp.max() - temp = np.exp(temp - max_score) / np.sum(np.exp(temp - max_score)) - scores[c, :, :] = temp - return scores - - -def approx_heatmap_keypoint(heatmaps_in, bboxes_in): - ''' -Mask R-CNN uses bicubic upscaling before taking the maximum of the heat map -for keypoints. We are using bilinear upscaling, which means we can approximate -the maximum coordinate with the low-resolution maximum coordinates. We would like -to avoid bicubic upscaling, because it is computationally expensive. Brown and -Lowe (Invariant Features from Interest Point Groups, 2002) use a method for -fitting a 3D quadratic function to the local sample points to determine the -interpolated location of the maximum of scale space, and their experiments showed -that this provides a substantial improvement to matching and stability for -keypoint extraction. This approach uses the Taylor expansion (up to the -quadratic terms) of the scale-space function. It is equivalent to Newton's -method. This efficient method was used in many keypoint estimation algorithms -like SIFT, SURF, etc. - -The implementation of Newton's method with numerical analysis is straightforward -and simple, though we need a linear solver.
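Concretely, around the discrete argmax we gather a 3x3 patch `fmax` and solve
A * delta = b for the subpixel offset. An editor's sketch of that solve,
mirroring the Cramer's-rule arithmetic in the loop below (illustration only,
not original code):

    import numpy as np
    b = np.array([-(fmax[1][2] - fmax[1][0]) / 2,
                  -(fmax[2][1] - fmax[0][1]) / 2])
    A = np.array([[fmax[1][0] - 2 * fmax[1][1] + fmax[1][2],
                   (fmax[2][2] - fmax[2][0] - fmax[0][2] + fmax[0][0]) / 4],
                  [(fmax[2][2] - fmax[2][0] - fmax[0][2] + fmax[0][0]) / 4,
                   fmax[0][1] - 2 * fmax[1][1] + fmax[2][1]]])
    deltaX, deltaY = np.linalg.solve(A, b)  # then clipped to the 3x3 grid (+/-1.5)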
- - ''' - assert len(bboxes_in.shape) == 2 - N = bboxes_in.shape[0] - assert bboxes_in.shape[1] == 4 - assert len(heatmaps_in.shape) == 4 - assert heatmaps_in.shape[0] == N - keypoint_count = heatmaps_in.shape[1] - heatmap_size = heatmaps_in.shape[2] - assert heatmap_size >= 2 - assert heatmaps_in.shape[3] == heatmap_size - - keypoints_out = np.zeros((N, keypoint_count, 4)) - - for k in range(N): - x0, y0, x1, y1 = bboxes_in[k, :] - xLen = np.maximum(x1 - x0, 1) - yLen = np.maximum(y1 - y0, 1) - softmax_map = scores_to_probs(heatmaps_in[k, :, :, :].copy()) - f = heatmaps_in[k] - for j in range(keypoint_count): - f = heatmaps_in[k][j] - maxX = -1 - maxY = -1 - maxScore = -100.0 - maxProb = -100.0 - for y in range(heatmap_size): - for x in range(heatmap_size): - score = f[y, x] - prob = softmax_map[j, y, x] - if maxX < 0 or maxScore < score: - maxScore = score - maxProb = prob - maxX = x - maxY = y - - # print(maxScore, maxX, maxY) - # initialize fmax values of 3x3 grid - # when 3x3 grid going out-of-bound, mirrowing around center - fmax = [[0] * 3 for r in range(3)] - for x in range(3): - for y in range(3): - hm_x = x + maxX - 1 - hm_y = y + maxY - 1 - hm_x = hm_x - 2 * (hm_x >= heatmap_size) + 2 * (hm_x < 0) - hm_y = hm_y - 2 * (hm_y >= heatmap_size) + 2 * (hm_y < 0) - assert((hm_x < heatmap_size) and (hm_x >= 0)) - assert((hm_y < heatmap_size) and (hm_y >= 0)) - fmax[y][x] = f[hm_y][hm_x] - - # print("python fmax ", fmax) - # b = -f'(0), A = f''(0) Hessian matrix - b = [-(fmax[1][2] - fmax[1][0]) / 2, - - (fmax[2][1] - fmax[0][1]) / 2] - A = [[fmax[1][0] - 2 * fmax[1][1] + fmax[1][2], - (fmax[2][2] - fmax[2][0] - fmax[0][2] + fmax[0][0]) / 4], - [(fmax[2][2] - fmax[2][0] - fmax[0][2] + fmax[0][0]) / 4, - fmax[0][1] - 2 * fmax[1][1] + fmax[2][1]]] - # print("python A") - # print(A) - # solve Ax=b - div = A[1][1] * A[0][0] - A[0][1] * A[1][0] - if abs(div) < 0.0001: - deltaX = 0 - deltaY = 0 - deltaScore = maxScore - else: - deltaY = (b[1] * A[0][0] - b[0] * A[1][0]) / div - deltaX = (b[0] * A[1][1] - b[1] * A[0][1]) / div - # clip delta if going out-of-range of 3x3 grid - if abs(deltaX) > 1.5 or abs(deltaY) > 1.5: - scale = 1.5 / max(abs(deltaX), abs(deltaY)) - deltaX *= scale - deltaY *= scale - # score = f(0) + f'(0)*x + 1/2 * f''(0) * x^2 - # = f(0) - b*x + 1/2*x*A*x - deltaScore = ( - fmax[1][1] - (b[0] * deltaX + b[1] * deltaY) + - 0.5 * (deltaX * deltaX * A[0][0] + - deltaX * deltaY * A[1][0] + - deltaY * deltaX * A[0][1] + - deltaY * deltaY * A[1][1])) - - assert abs(deltaX) <= 1.5 - assert abs(deltaY) <= 1.5 - - # final coordinates - keypoints_out[k, j, :] = ( - x0 + (maxX + deltaX + .5) * xLen / heatmap_size, - y0 + (maxY + deltaY + .5) * yLen / heatmap_size, - deltaScore, - maxProb, - ) - - keypoints_out = np.transpose(keypoints_out, [0, 2, 1]) - - return keypoints_out diff --git a/caffe2/python/operator_test/distance_op_test.py b/caffe2/python/operator_test/distance_op_test.py deleted file mode 100644 index 5b46548e072b..000000000000 --- a/caffe2/python/operator_test/distance_op_test.py +++ /dev/null @@ -1,108 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class DistanceTest(serial.SerializedTestCase): - @serial.given(n=st.integers(1, 3), - dim=st.integers(4, 16), - **hu.gcs) - def test_cosine_similarity(self, n, dim, gc, dc): - X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - Y = 
np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Y").feed(Y) - kEps = 1e-12 - cos_op = core.CreateOperator("CosineSimilarity", ["X", "Y"], ["cos"]) - self.ws.run(cos_op) - cos = np.divide(np.multiply(X, Y).sum(axis=1), - np.multiply(np.linalg.norm(X, axis=1) + kEps, - np.linalg.norm(Y, axis=1) + kEps)) - np.testing.assert_allclose(self.ws.blobs[("cos")].fetch(), cos, - rtol=1e-4, atol=1e-4) - self.assertGradientChecks(gc, cos_op, [X, Y], 0, [0], - stepsize=1e-2, threshold=1e-2) - self.assertGradientChecks(gc, cos_op, [X, Y], 1, [0], - stepsize=1e-2, threshold=1e-2) - - @serial.given(inputs=hu.tensors(n=2, - min_dim=1, - max_dim=2, - dtype=np.float32), - **hu.gcs) - def test_dot_product(self, inputs, gc, dc): - X, Y = inputs - op = core.CreateOperator( - 'DotProduct', - ['X', 'Y'], - ['DOT'], - ) - - def dot_ref(X, Y): - return ([np.dot(x, y) for x, y in zip(X, Y)],) - - # Check against numpy dot reference - self.assertReferenceChecks(gc, op, [X, Y], dot_ref) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X, Y], [0]) - # Gradient check wrt X - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - # Gradient check wrt Y - self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - - @serial.given(n=st.integers(1, 3), - dim=st.integers(4, 16), - **hu.gcs) - def test_L1_distance(self, n, dim, gc, dc): - X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - # avoid kinks by moving away from 0 - X += 0.02 * np.sign(X - Y) - X[(X - Y) == 0.0] += 0.02 - - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Y").feed(Y) - op = core.CreateOperator( - 'L1Distance', - ['X', 'Y'], - ['l1_dist'], - ) - self.ws.run(op) - np.testing.assert_allclose(self.ws.blobs[("l1_dist")].fetch(), - [np.linalg.norm(x - y, ord=1) - for x, y in zip(X, Y)], - rtol=1e-4, atol=1e-4) - - self.assertDeviceChecks(dc, op, [X, Y], [0]) - # Gradient check wrt X - self.assertGradientChecks(gc, op, [X, Y], 0, [0], - stepsize=1e-2, threshold=1e-2) - # Gradient check wrt Y - self.assertGradientChecks(gc, op, [X, Y], 1, [0], - stepsize=1e-2, threshold=1e-2) - - @serial.given(n=st.integers(1, 3), - dim=st.integers(4, 16), - **hu.gcs) - def test_L2_distance(self, n, dim, gc, dc): - X = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - Y = np.random.uniform(-1, 1, (n, dim)).astype(np.float32) - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Y").feed(Y) - l2_op = core.CreateOperator("SquaredL2Distance", - ["X", "Y"], ["l2_dist"]) - self.ws.run(l2_op) - np.testing.assert_allclose(self.ws.blobs[("l2_dist")].fetch(), - np.square(X - Y).sum(axis=1) * 0.5, - rtol=1e-4, atol=1e-4) - self.assertGradientChecks(gc, l2_op, [X, Y], 0, [0], - stepsize=1e-2, threshold=1e-2) - self.assertGradientChecks(gc, l2_op, [X, Y], 1, [0], - stepsize=1e-2, threshold=1e-2) diff --git a/caffe2/python/operator_test/dropout_op_test.py b/caffe2/python/operator_test/dropout_op_test.py deleted file mode 100644 index 6837b013fe34..000000000000 --- a/caffe2/python/operator_test/dropout_op_test.py +++ /dev/null @@ -1,108 +0,0 @@ - - - - - -from hypothesis import assume, given, settings -import hypothesis.strategies as st -import numpy as np - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -class TestDropout(serial.SerializedTestCase): - - @serial.given(X=hu.tensor(), - 
in_place=st.booleans(), - ratio=st.floats(0, 0.999), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - def test_dropout_is_test(self, X, in_place, ratio, engine, gc, dc): - """Test with is_test=True for a deterministic reference impl.""" - # TODO(lukeyeager): enable this path when the GPU path is fixed - if in_place: - # Skip if trying in-place on GPU - assume(not (gc.device_type in {caffe2_pb2.CUDA, caffe2_pb2.HIP} and engine == '')) - # If in-place on CPU, don't compare with GPU - dc = dc[:1] - - op = core.CreateOperator("Dropout", ["X"], - ["X" if in_place else "Y"], - ratio=ratio, engine=engine, is_test=True) - - self.assertDeviceChecks(dc, op, [X], [0]) - # No sense in checking gradients for test phase - - def reference_dropout_test(x): - return x, np.ones(x.shape, dtype=bool) - self.assertReferenceChecks( - gc, op, [X], reference_dropout_test, - # The 'mask' output may be uninitialized - outputs_to_check=[0]) - - @given(X=hu.tensor(), - in_place=st.booleans(), - output_mask=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_dropout_ratio0(self, X, in_place, output_mask, engine, gc, dc): - """Test with ratio=0 for a deterministic reference impl.""" - # TODO(lukeyeager): enable this path when the op is fixed - if in_place: - # Skip if trying in-place on GPU - assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP}) - # If in-place on CPU, don't compare with GPU - dc = dc[:1] - is_test = not output_mask - op = core.CreateOperator("Dropout", ["X"], - ["X" if in_place else "Y"] + - (["mask"] if output_mask else []), - ratio=0.0, engine=engine, - is_test=is_test) - - self.assertDeviceChecks(dc, op, [X], [0]) - if not is_test: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - def reference_dropout_ratio0(x): - return (x,) if is_test else (x, np.ones(x.shape, dtype=bool)) - self.assertReferenceChecks( - gc, op, [X], reference_dropout_ratio0, - # Don't check the mask with cuDNN because it's packed data - outputs_to_check=None if engine != 'CUDNN' else [0]) - - - @given(X=hu.tensor(), - in_place=st.booleans(), - output_mask=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_dropout_ratio1(self, X, in_place, output_mask, engine, gc, dc): - """Test with ratio=1 for a deterministic reference impl.""" - if in_place: - # Skip if trying in-place on GPU - assume(gc.device_type not in {caffe2_pb2.CUDA, caffe2_pb2.HIP}) - # If in-place on CPU, don't compare with GPU - dc = dc[:1] - is_test = not output_mask - op = core.CreateOperator("Dropout", ["X"], - ["X" if in_place else "Y"] + - (["mask"] if output_mask else []), - ratio=1.0, engine=engine, - is_test=is_test) - - self.assertDeviceChecks(dc, op, [X], [0]) - if not is_test: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - def reference_dropout_ratio1(x): - return (x,) if is_test else (np.zeros(x.shape, dtype=np.float64), np.zeros(x.shape, dtype=bool)) - self.assertReferenceChecks( - gc, op, [X], reference_dropout_ratio1, - # Don't check the mask with cuDNN because it's packed data - outputs_to_check=None if engine != 'CUDNN' else [0]) diff --git a/caffe2/python/operator_test/duplicate_operands_test.py b/caffe2/python/operator_test/duplicate_operands_test.py deleted file mode 100644 index 179b42dbabc8..000000000000 --- a/caffe2/python/operator_test/duplicate_operands_test.py +++ /dev/null @@ -1,28 +0,0 @@ - - - - - -import numpy as np - -from caffe2.python import core, workspace -from
caffe2.python.test_util import TestCase - - -class TestDuplicateOperands(TestCase): - def test_duplicate_operands(self): - net = core.Net('net') - shape = (2, 4) - x_in = np.random.uniform(size=shape) - x = net.GivenTensorFill([], 'X', shape=shape, - values=x_in.flatten().tolist()) - xsq = net.Mul([x, x]) - y = net.DotProduct([xsq, xsq]) - net.AddGradientOperators([y]) - workspace.RunNetOnce(net) - self.assertTrue(np.allclose(workspace.FetchBlob('X_grad'), - 4 * x_in**3)) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/elementwise_linear_op_test.py b/caffe2/python/operator_test/elementwise_linear_op_test.py deleted file mode 100644 index 2bd85625a3d9..000000000000 --- a/caffe2/python/operator_test/elementwise_linear_op_test.py +++ /dev/null @@ -1,46 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestElementwiseLinearOp(serial.SerializedTestCase): - - @serial.given(n=st.integers(2, 100), d=st.integers(2, 10), **hu.gcs) - # @given(n=st.integers(2, 50), d=st.integers(2, 50), **hu.gcs_cpu_only) - def test(self, n, d, gc, dc): - X = np.random.rand(n, d).astype(np.float32) - a = np.random.rand(d).astype(np.float32) - b = np.random.rand(d).astype(np.float32) - - def ref_op(X, a, b): - d = a.shape[0] - return [np.multiply(X, a.reshape(1, d)) + b.reshape(1, d)] - - op = core.CreateOperator( - "ElementwiseLinear", - ["X", "a", "b"], - ["Y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, a, b], - reference=ref_op, - ) - - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X, a, b], [0]) - # Gradient check wrt X - self.assertGradientChecks(gc, op, [X, a, b], 0, [0]) - # Gradient check wrt a - self.assertGradientChecks(gc, op, [X, a, b], 1, [0]) - # # Gradient check wrt b - self.assertGradientChecks(gc, op, [X, a, b], 2, [0]) diff --git a/caffe2/python/operator_test/elementwise_logical_ops_test.py b/caffe2/python/operator_test/elementwise_logical_ops_test.py deleted file mode 100644 index 3195d969dee5..000000000000 --- a/caffe2/python/operator_test/elementwise_logical_ops_test.py +++ /dev/null @@ -1,137 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -def mux(select, left, right): - return [np.vectorize(lambda c, x, y: x if c else y)(select, left, right)] - - -def rowmux(select_vec, left, right): - select = [[s] * len(left) for s in select_vec] - return mux(select, left, right) - - -class TestWhere(serial.SerializedTestCase): - - def test_reference(self): - self.assertTrue(( - np.array([1, 4]) == mux([True, False], - [1, 2], - [3, 4])[0] - ).all()) - self.assertTrue(( - np.array([[1], [4]]) == mux([[True], [False]], - [[1], [2]], - [[3], [4]])[0] - ).all()) - - @given(N=st.integers(min_value=1, max_value=10), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_where(self, N, gc, dc, engine): - C = np.random.rand(N).astype(bool) - X = np.random.rand(N).astype(np.float32) - Y = np.random.rand(N).astype(np.float32) - op = core.CreateOperator("Where", ["C", "X", "Y"], ["Z"], engine=engine) - self.assertDeviceChecks(dc, op, [C, X, Y], [0]) 
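# [Editor's note] For these inputs `mux` matches numpy's built-in select; an
# editor's illustration, not part of the original test:
#   C = np.array([True, False]); X = np.array([1., 2.]); Y = np.array([3., 4.])
#   assert (mux(C, X, Y)[0] == np.where(C, X, Y)).all()  # both give [1., 4.]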
- self.assertReferenceChecks(gc, op, [C, X, Y], mux) - - @given(N=st.integers(min_value=1, max_value=10), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_where_dim2(self, N, gc, dc, engine): - C = np.random.rand(N, N).astype(bool) - X = np.random.rand(N, N).astype(np.float32) - Y = np.random.rand(N, N).astype(np.float32) - op = core.CreateOperator("Where", ["C", "X", "Y"], ["Z"], engine=engine) - self.assertDeviceChecks(dc, op, [C, X, Y], [0]) - self.assertReferenceChecks(gc, op, [C, X, Y], mux) - - -class TestRowWhere(hu.HypothesisTestCase): - - def test_reference(self): - self.assertTrue(( - np.array([1, 2]) == rowmux([True], - [1, 2], - [3, 4])[0] - ).all()) - self.assertTrue(( - np.array([[1, 2], [7, 8]]) == rowmux([True, False], - [[1, 2], [3, 4]], - [[5, 6], [7, 8]])[0] - ).all()) - - @given(N=st.integers(min_value=1, max_value=10), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - def test_rowwhere(self, N, gc, dc, engine): - C = np.random.rand(N).astype(bool) - X = np.random.rand(N).astype(np.float32) - Y = np.random.rand(N).astype(np.float32) - op = core.CreateOperator( - "Where", - ["C", "X", "Y"], - ["Z"], - broadcast_on_rows=True, - engine=engine, - ) - self.assertDeviceChecks(dc, op, [C, X, Y], [0]) - self.assertReferenceChecks(gc, op, [C, X, Y], mux) - - @given(N=st.integers(min_value=1, max_value=10), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - def test_rowwhere_dim2(self, N, gc, dc, engine): - C = np.random.rand(N).astype(bool) - X = np.random.rand(N, N).astype(np.float32) - Y = np.random.rand(N, N).astype(np.float32) - op = core.CreateOperator( - "Where", - ["C", "X", "Y"], - ["Z"], - broadcast_on_rows=True, - engine=engine, - ) - self.assertDeviceChecks(dc, op, [C, X, Y], [0]) - self.assertReferenceChecks(gc, op, [C, X, Y], rowmux) - - -class TestIsMemberOf(serial.SerializedTestCase): - - @given(N=st.integers(min_value=1, max_value=10), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_is_member_of(self, N, gc, dc, engine): - X = np.random.randint(10, size=N).astype(np.int64) - values = [0, 3, 4, 6, 8] - op = core.CreateOperator( - "IsMemberOf", - ["X"], - ["Y"], - value=np.array(values), - engine=engine, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - values = set(values) - - def test(x): - return [np.vectorize(lambda x: x in values)(x)] - self.assertReferenceChecks(gc, op, [X], test) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py deleted file mode 100644 index 2d8222b59c9f..000000000000 --- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py +++ /dev/null @@ -1,427 +0,0 @@ - - - - - -import unittest - -from hypothesis import given, assume, settings -import hypothesis.strategies as st -import numpy as np -import operator - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -# TODO(jiayq): make them hypothesis tests for better coverage. 
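# [Editor's note] A standalone numpy sketch of the `broadcast=1, axis=k`
# convention the cases below exercise (editor's illustration, not original
# code): Y is aligned to X's shape starting at dimension k, with trailing
# singleton axes appended as needed.
import numpy as np
X = np.random.rand(2, 3, 4, 5).astype(np.float32)
Y = np.random.rand(3, 4).astype(np.float32)
# Add(["X", "Y"], broadcast=1, axis=1) computes the same values as:
out = X + Y[np.newaxis, :, :, np.newaxis]
assert out.shape == (2, 3, 4, 5)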
-class TestElementwiseBroadcast(serial.SerializedTestCase): - - def __generate_test_cases(self, allow_broadcast_fastpath: bool): - """ - generates a set of test cases - - For each iteration, generates X, Y, args, X_out, Y_out - where - X, Y are test input tensors - args is a dictionary of arguments to be passed to - core.CreateOperator() - X_out, Y_out are reshaped versions of X and Y - which can be used to calculate the expected - result with the operator to be tested - """ - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(4, 5).astype(np.float32) - args = dict(broadcast=1, allow_broadcast_fastpath=allow_broadcast_fastpath) - yield X, Y, args, X, Y - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(3, 4).astype(np.float32) - args = dict(broadcast=1, axis=1, allow_broadcast_fastpath=allow_broadcast_fastpath) - yield X, Y, args, X, Y[:, :, np.newaxis] - - # broadcasting the first dimension - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(2).astype(np.float32) - args = dict(broadcast=1, axis=0, allow_broadcast_fastpath=allow_broadcast_fastpath) - yield X, Y, args, X, Y[:, np.newaxis, np.newaxis, np.newaxis] - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1, 4, 1).astype(np.float32) - args = dict(broadcast=1, axis=1, allow_broadcast_fastpath=allow_broadcast_fastpath) - yield X, Y, args, X, Y - - def __test_binary_op( - self, gc, dc, caffe2_op, op_function, allow_broadcast_fastpath: bool = False - ): - """ - Args: - caffe2_op: A string. Name of the caffe operator to test. - op_function: an actual python operator (e.g. operator.add) - path_prefix: A string. Optional param used to construct db name or path - where checkpoint files are stored. 
- """ - - for X, Y, op_args, X_out, Y_out in self.__generate_test_cases(allow_broadcast_fastpath): - op = core.CreateOperator(caffe2_op, ["X", "Y"], "out", **op_args) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - np.testing.assert_array_almost_equal(out, op_function(X_out, Y_out)) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 1, [0]) - - @given(allow_broadcast_fastpath=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_broadcast_Add(self, allow_broadcast_fastpath: bool, gc, dc): - self.__test_binary_op( - gc, dc, "Add", operator.add, allow_broadcast_fastpath=allow_broadcast_fastpath - ) - - @given(allow_broadcast_fastpath=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_broadcast_Mul(self, allow_broadcast_fastpath: bool, gc, dc): - self.__test_binary_op( - gc, dc, "Mul", operator.mul, allow_broadcast_fastpath=allow_broadcast_fastpath - ) - - @given(allow_broadcast_fastpath=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_broadcast_Sub(self, allow_broadcast_fastpath: bool, gc, dc): - self.__test_binary_op( - gc, dc, "Sub", operator.sub, allow_broadcast_fastpath=allow_broadcast_fastpath - ) - - @given(**hu.gcs) - @settings(deadline=None) - def test_broadcast_powt(self, gc, dc): - np.random.seed(101) - - #operator - def powt_op(X, Y): - return [np.power(X, Y)] - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - def powt_grad(g_out, outputs, fwd_inputs): - [X, Y] = fwd_inputs - Z = outputs[0] - return ([Y * np.power(X, Y - 1), Z * np.log(X)] * g_out) - - #1. Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0 - Y = np.random.rand(4, 5).astype(np.float32) + 2.0 - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is summed over 1 and 0 dims to account for broadcast - def powt_grad_broadcast(g_out, outputs, fwd_inputs): - [GX, GY] = powt_grad(g_out, outputs, fwd_inputs) - return ([GX, np.sum(np.sum(GY, 1), 0)]) - - op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1) - self.assertReferenceChecks(device_option=gc, - op=op, - inputs=[X, Y], - reference=powt_op, - output_to_grad="Z", - grad_reference=powt_grad_broadcast) - - #2. broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0 - Y = np.random.rand(3, 4).astype(np.float32) + 2.0 - - #pow op with the latter array increased by one dim - def powt_op_axis1(X, Y): - return powt_op(X, Y[:, :, np.newaxis]) - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is summed over 3 and 0 dims to account for broadcast - def powt_grad_axis1(g_out, outputs, fwd_inputs): - [X, Y] = fwd_inputs - [GX, GY] = powt_grad(g_out, outputs, [X, Y[:, :, np.newaxis]]) - return ([GX, np.sum(np.sum(GY, 3), 0)]) - - op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1) - self.assertReferenceChecks(device_option=gc, - op=op, - inputs=[X, Y], - reference=powt_op_axis1, - output_to_grad="Z", - grad_reference=powt_grad_axis1) - - #3. 
broadcasting the first dimension - X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0 - Y = np.random.rand(2).astype(np.float32) + 2.0 - - #pow op with the latter array increased by one dim - def powt_op_axis0(X, Y): - return powt_op(X, Y[:, np.newaxis, np.newaxis, np.newaxis]) - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is summed over 3, 2 and 1 dims to account for broadcast - def powt_grad_axis0(g_out, outputs, fwd_inputs): - [X, Y] = fwd_inputs - [GX, GY] = powt_grad(g_out, - outputs, - [X, Y[:, np.newaxis, np.newaxis, np.newaxis]]) - return ([GX, np.sum(np.sum(np.sum(GY, 3), 2), 1)]) - - op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=0) - self.assertReferenceChecks(device_option=gc, - op=op, - inputs=[X, Y], - reference=powt_op_axis0, - output_to_grad="Z", - grad_reference=powt_grad_axis0) - - #4. broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float32) + 1.0 - Y = np.random.rand(1, 4, 1).astype(np.float32) + 2.0 - - #pow op with the latter array increased by one dim - def powt_op_mixed(X, Y): - return powt_op(X, Y[np.newaxis, :, :, :]) - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - #latter gradient is summed over 0 and 1 dims to account for broadcast - def powt_grad_mixed(g_out, outputs, fwd_inputs): - [X, Y] = fwd_inputs - [GX, GY] = powt_grad(g_out, outputs, [X, Y[np.newaxis, :, :, :]]) - return ([GX, np.reshape(np.sum(np.sum(np.sum(GY, 3), 1), 0), - (1, 4, 1))]) - - op = core.CreateOperator("Pow", ["X", "Y"], "Z", broadcast=1, axis=1) - self.assertReferenceChecks(device_option=gc, - op=op, - inputs=[X, Y], - reference=powt_op_mixed, - output_to_grad="Z", - grad_reference=powt_grad_mixed) - - @given(allow_broadcast_fastpath=st.booleans(), **hu.gcs) - def test_broadcast_scalar(self, allow_broadcast_fastpath: bool, gc, dc): - # broadcasting constant - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1).astype(np.float32) - op = core.CreateOperator( - "Add", ["X", "Y"], "out", broadcast=1, allow_broadcast_fastpath=allow_broadcast_fastpath - ) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - np.testing.assert_array_almost_equal( - out, X + Y) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting scalar - X = np.random.rand(1).astype(np.float32) - Y = np.random.rand(1).astype(np.float32).reshape([]) - op = core.CreateOperator( - "Add", ["X", "Y"], "out", broadcast=1, allow_broadcast_fastpath=allow_broadcast_fastpath - ) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - np.testing.assert_array_almost_equal( - out, X + Y) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - @given(allow_broadcast_fastpath=st.booleans(), **hu.gcs) - def test_semantic_broadcast(self, allow_broadcast_fastpath: bool, gc, dc): - # NCHW as default - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(3).astype(np.float32) - op = core.CreateOperator( - "Add", ["X", "Y"], "out", broadcast=1, axis_str="C", - allow_broadcast_fastpath=allow_broadcast_fastpath) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - np.testing.assert_array_almost_equal( - out, X + Y[:, np.newaxis, np.newaxis]) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # NHWC - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(5).astype(np.float32) - op = 
core.CreateOperator( - "Add", ["X", "Y"], "out", broadcast=1, axis_str="C", order="NHWC", - allow_broadcast_fastpath=allow_broadcast_fastpath) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - np.testing.assert_array_almost_equal(out, X + Y) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - @given(**hu.gcs) - def test_sum_reduce_empty_blob(self, gc, dc): - net = core.Net('test') - - with core.DeviceScope(gc): - net.GivenTensorFill([], ["X"], values=[], shape=[2, 0, 5]) - net.GivenTensorFill([], ["Y"], values=[], shape=[2, 0]) - net.SumReduceLike(["X", "Y"], "out", axis=0) - workspace.RunNetOnce(net) - - @given(**hu.gcs) - def test_sum_reduce(self, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(4, 5).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(2, 3).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res, decimal=3) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(3, 4).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 500).astype(np.float64) - Y = np.random.rand(1).astype(np.float64) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.array(np.sum(X)) - np.testing.assert_array_almost_equal(out, res, decimal=0) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - Y = np.random.rand(1, 3, 4, 1).astype(np.float32) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(op) - out = workspace.FetchBlob("out") - res = np.sum(X, axis=0) - res = np.sum(res, axis=2).reshape(Y.shape) - np.testing.assert_array_almost_equal(out, res) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - - # fp64 is not supported with the CUDA op - dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA] - self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs) - def 
test_sum_reduce_fp16(self, gc, dc): - assume(core.IsGPUDeviceType(gc.device_type)) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(4, 5).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, device_option=gc) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=0) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(2, 3).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=0) - - def ref_op(X, Y): - res = np.sum(X, axis=3) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting intermediate dimensions - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(3, 4).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1, axis=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - - # broadcasting with single elem dimensions at both ends - X = np.random.rand(2, 3, 4, 5).astype(np.float16) - Y = np.random.rand(1, 3, 4, 1).astype(np.float16) - op = core.CreateOperator( - "SumReduceLike", ["X", "Y"], "out", broadcast=1) - - def ref_op(X, Y): - res = np.sum(X, axis=0) - res = np.sum(res, axis=2) - return [res.reshape(Y.shape)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=ref_op, - threshold=1e-3) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/elementwise_ops_test.py b/caffe2/python/operator_test/elementwise_ops_test.py deleted file mode 100644 index 2fa753f27a3e..000000000000 --- a/caffe2/python/operator_test/elementwise_ops_test.py +++ /dev/null @@ -1,1048 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import given, assume, settings -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - -import unittest - -class TestElementwiseOps(hu.HypothesisTestCase): - - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) - @settings(deadline=10000) - def test_abs(self, X, gc, dc): - op = core.CreateOperator( - "Abs", - ["X"], - ["Y"], - ) - - def abs_ref(X): - return [np.absolute(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=abs_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(dtype=np.float32), inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_exp(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Exp", - ["X"], - ["X"] if inplace else ["Y"], - ) - - def exp_ref(X): - return [np.exp(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=exp_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @given(n=st.integers(0, 6), 
m=st.integers(4, 6), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_log(self, n, m, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m).astype(np.float32) + 1.0 - - def log_op(X): - return [np.log(X)] - - op = core.CreateOperator( - "Log", - ["X"], - ["Z"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=log_op, - ensure_outputs_are_inferred=True, - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2, - ensure_outputs_are_inferred=True) - - @given(n=st.integers(0, 10), m=st.integers(4, 6), - d=st.integers(2, 3), seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_powt(self, n, m, d, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m, d).astype(np.float32) + 1.0 - Y = np.random.rand(n, m, d).astype(np.float32) + 2.0 - - def powt_op(X, Y): - return [np.power(X, Y)] - - #two gradients Y*X^(Y-1) and X^Y * ln(X) - def powt_grad(g_out, outputs, fwd_inputs): - [X, Y] = fwd_inputs - Z = outputs[0] - return ([Y * np.power(X, Y - 1), Z * np.log(X)] * g_out) - - op = core.CreateOperator( - "Pow", - ["X", "Y"], - ["Z"] - ) - - self.assertReferenceChecks(device_option=gc, - op=op, - inputs=[X, Y], - reference=powt_op, - output_to_grad="Z", - grad_reference=powt_grad, - ensure_outputs_are_inferred=True) - - @given(n=st.integers(0, 6), m=st.integers(4, 6), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_sqr(self, n, m, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m).astype(np.float32) - - def sqr_op(X): - return [np.square(X)] - - op = core.CreateOperator( - "Sqr", - ["X"], - ["Z"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sqr_op, - ensure_outputs_are_inferred=True, - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2, - ensure_outputs_are_inferred=True) - - @given( - X=hu.tensor( - elements=hu.floats(min_value=0.1, max_value=10), - # allow empty tensor - min_value=0), - inplace=st.booleans(), - **hu.gcs - ) - @settings(deadline=10000) - def test_sqrt(self, X, inplace, gc, dc): - def sqrt_op(X): - return [np.sqrt(X)] - - op = core.CreateOperator( - "Sqrt", - ["X"], - ["X"] if inplace else ["Y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sqrt_op, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - # stepsize need to be smaller than the possible minimum X, so the - # sqrt is well defined - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-2, ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(dtype=np.float32), inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_softsign(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Softsign", - ["X"], - ["X"] if inplace else ["Y"], - ) - - def softsign_ref(X): - return [X / (1.0 + np.absolute(X))] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=softsign_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - if not inplace: - self.assertGradientChecks( - gc, op, [X], 0, [0], - ensure_outputs_are_inferred=True, - ) - - @given(X=hu.tensor(elements=hu.floats(min_value=0.1, max_value=10.0), dtype=np.float32), - inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_rsqrt(self, X, inplace, gc, dc): - op = core.CreateOperator( - "Rsqrt", - ["X"], - ["X"] if inplace else ["Y"], 
- ) - - def rsqrt_ref(X): - return [1.0 / np.sqrt(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=rsqrt_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=5e-3, - ensure_outputs_are_inferred=True, - ) - - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) - @settings(deadline=10000) - def test_cube(self, X, gc, dc): - op = core.CreateOperator( - "Cube", - ["X"], - ["Y"], - ) - - def cube_ref(X): - return [np.power(X, 3)] - - def cube_grad_ref(g_out, outputs, fwd_inputs): - dY = g_out - [X] = fwd_inputs - return [dY * np.square(X) * 3] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=cube_ref, - output_to_grad="Y", - grad_reference=cube_grad_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_cbrt(self, X, in_place, gc, dc): - op = core.CreateOperator( - "Cbrt", - ["X"], - ["X"] if in_place else ["Y"], - ) - - def cbrt_ref(X): - return [np.cbrt(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=cbrt_ref, - ensure_outputs_are_inferred=True, - ) - - @given(X=hu.tensor(elements=hu.floats(min_value=1.0, max_value=10.0), dtype=np.float32), - in_place=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_cbrt_grad(self, X, in_place, gc, dc): - op = core.CreateOperator( - "Cbrt", - ["X"], - ["X"] if in_place else ["Y"], - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], - ensure_outputs_are_inferred=True, - ) - self.assertGradientChecks( - gc, op, [-X], 0, [0], - ensure_outputs_are_inferred=True, - ) - - - @given(n=st.integers(0, 6), m=st.integers(4, 6), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_swish(self, n, m, gc, dc, seed): - np.random.seed(seed) - X = np.random.rand(n, m).astype(np.float32) - - def swish(X): - return [np.divide(X, (1. + np.exp(-X)))] - - op = core.CreateOperator( - "Swish", - ["X"], - ["Z"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=swish, - ensure_outputs_are_inferred=True, - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2, - ensure_outputs_are_inferred=True) - - @given(n=st.integers(0, 6), m=st.integers(4, 6), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_swish_gradient_inplace(self, n, m, gc, dc, seed): - np.random.seed(seed) - - def swish(X): - return [np.divide(X, (1. + np.exp(-X)))] - - def swish_gradient(X, Y, dY): - return [dY * (Y + np.divide(1. - Y, 1. 
+ np.exp(-X)))] - - X = np.random.rand(n, m).astype(np.float32) - Y = swish(X)[0] - dY = np.random.rand(n, m).astype(np.float32) - op = core.CreateOperator( - "SwishGradient", - ["X", "Y", "grad"], - "grad" - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y, dY], - reference=swish_gradient, - ) - - @given(n=st.integers(1, 6), - m=st.integers(4, 6), - inplace=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_mul_gradient_inplace_or_broadcast( - self, - n: int, - m: int, - inplace: bool, - allow_broadcast_fastpath: bool, - gc, - dc, - seed: int, - ): - broadcast = not inplace - np.random.seed(seed) - - def mul_gradient(dC, A, B): - dA = B * dC - dB = A * dC - if broadcast: - dB = np.sum(dB, axis=0) - return [dA, dB] - - A = np.random.rand(n, m).astype(np.float32) - if broadcast: - B = np.random.rand(m).astype(np.float32) - else: - B = np.random.rand(n, m).astype(np.float32) - dC = np.random.rand(n, m).astype(np.float32) - op_dA_inplace = core.CreateOperator( - "MulGradient", - ["dC", "A", "B"], - ["dC" if inplace else "dA", "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - op_dB_inplace = core.CreateOperator( - "MulGradient", - ["dC", "A", "B"], - ["dA", "dC" if inplace else "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dA_inplace, - inputs=[dC, A, B], - reference=mul_gradient, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dB_inplace, - inputs=[dC, A, B], - reference=mul_gradient, - ) - - @given(n=st.integers(1, 6), - m=st.integers(4, 6), - inplace=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_div_gradient_inplace_or_broadcast( - self, - n: int, - m: int, - inplace: bool, - allow_broadcast_fastpath: bool, - gc, - dc, - seed: int, - ): - broadcast = not inplace - np.random.seed(seed) - - def div_gradient(dC, _A, B, C): - dA = dC / B - dB = -dC * C / B - if broadcast: - dB = np.sum(dB, axis=0) - return [dA, dB] - - A = np.random.rand(n, m).astype(np.float32) - if broadcast: - B = np.random.rand(m).astype(np.float32) + 1.0 - else: - B = np.random.rand(n, m).astype(np.float32) + 1.0 - C = A / B - dC = np.random.rand(n, m).astype(np.float32) - op = core.CreateOperator( - "DivGradient", - ["dC", "A", "B", "C"], - ["dC" if inplace else "dA", "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[dC, A, B, C], - reference=div_gradient, - ) - - @given(n=st.integers(1, 6), - m=st.integers(4, 6), - inplace=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_add_gradient_inplace_or_broadcast( - self, - n: int, - m: int, - inplace: bool, - allow_broadcast_fastpath: bool, - gc, - dc, - seed: int, - ): - broadcast = not inplace - np.random.seed(seed) - - def add_gradient(dC, _A, _B): - dA, dB = dC, dC - if broadcast: - dB = np.sum(dB, axis=0) - return [dA, dB] - - A = np.random.rand(n, m).astype(np.float32) - if broadcast: - B = np.random.rand(m).astype(np.float32) - else: - B = np.random.rand(n, m).astype(np.float32) - dC = np.random.rand(n, m).astype(np.float32) - op_dA_inplace = core.CreateOperator( - "AddGradient", - ["dC", "A", "B"], - ["dC" if inplace else "dA", "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - 
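# [Editor's note] Why the broadcast case reduces over axis 0: in the forward
# pass C = A + B, a B of shape (m,) is replicated across all n rows of A, so
# every row of dC flows back into the same m entries of B and
# dB = dC.sum(axis=0). An editor's illustration, not part of the original test:
#   dC = np.ones((3, 4), dtype=np.float32)
#   dC.sum(axis=0)  # -> shape (4,), every entry equals 3.0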
op_dB_inplace = core.CreateOperator( - "AddGradient", - ["dC", "A", "B"], - ["dA", "dC" if inplace else "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dA_inplace, - inputs=[dC, A, B], - reference=add_gradient, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dB_inplace, - inputs=[dC, A, B], - reference=add_gradient, - ) - - @given(n=st.integers(1, 6), - m=st.integers(4, 6), - inplace=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - seed=st.integers(0, 1000), **hu.gcs) - @settings(deadline=10000) - def test_sub_gradient_inplace_or_broadcast( - self, - n: int, - m: int, - inplace: bool, - allow_broadcast_fastpath: bool, - gc, - dc, - seed: int, - ): - broadcast = not inplace - np.random.seed(seed) - - def sub_gradient(dC, _A, _B): - dA, dB = dC, -dC - if broadcast: - dB = np.sum(dB, axis=0) - return [dA, dB] - - A = np.random.rand(n, m).astype(np.float32) - if broadcast: - B = np.random.rand(m).astype(np.float32) - else: - B = np.random.rand(n, m).astype(np.float32) - dC = np.random.rand(n, m).astype(np.float32) - op_dA_inplace = core.CreateOperator( - "SubGradient", - ["dC", "A", "B"], - ["dC" if inplace else "dA", "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - op_dB_inplace = core.CreateOperator( - "SubGradient", - ["dC", "A", "B"], - ["dA", "dC" if inplace else "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dA_inplace, - inputs=[dC, A, B], - reference=sub_gradient, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op_dB_inplace, - inputs=[dC, A, B], - reference=sub_gradient, - ) - - @given(X=hu.tensor(dtype=np.float32), inplace=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) - @settings(deadline=10000) - def test_sigmoid(self, X, inplace, engine, gc, dc): - op = core.CreateOperator( - "Sigmoid", - ["X"], - ["X"] if inplace else ["Y"], - engine=engine, - ) - - def sigmoid_ref(X): - return [1.0 / (1.0 + np.exp(-X))] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sigmoid_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(dtype=np.float32), inplace=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) - @settings(deadline=10000) - def test_tanh(self, X, inplace, engine, gc, dc): - op = core.CreateOperator( - "Tanh", - ["X"], - ["X"] if inplace else ["Y"], - engine=engine, - ) - - def tanh_ref(X): - return [np.tanh(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=tanh_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(dtype=np.float32), - inplace=st.booleans(), - alpha=hu.floats(min_value=-100.0, max_value=100.0), - beta=hu.floats(min_value=-100.0, max_value=100.0), - engine=st.sampled_from([""]), - **hu.gcs) - @settings(deadline=10000) - def test_hard_sigmoid(self, X, inplace, alpha, beta, engine, gc, dc): - # Prevent alpha and beta from mutually being 0 to avoid a division - # error when adjusting our inputs - assume(alpha != 0.0 or beta != 0.0) - op = core.CreateOperator( - "HardSigmoid", - ["X"], - ["X"] if inplace else ["Y"], - alpha=alpha, - beta=beta, - engine=engine, - ) - - def 
hard_sigmoid_ref(X): - return [np.minimum(1.0, np.maximum(0.0, X * alpha + beta))] - - # Adjust inputs to avoid differentiating at inflection points - if abs(alpha) > 0.001: - Y = X * alpha + beta - Y += 0.04 * np.sign(Y) - Y[Y == 0.0] += 0.1 - Y[Y == 1.0] -= 0.1 - X = (Y - beta) / alpha - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=hard_sigmoid_ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2, - ensure_outputs_are_inferred=True) - - @given(n=st.integers(0, 6), m=st.integers(4, 6), **hu.gcs) - @settings(deadline=10000) - def test_eq(self, n, m, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.randint(2, size=(n, m)) - Y = np.random.randint(2, size=(n, m)) - op = core.CreateOperator("EQ", ["X", "Y"], "out", broadcast=1) - - def eq(X, Y): - return [X == Y] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=eq, - ensure_outputs_are_inferred=True, - ) - - workspace.FeedBlob('X', X) - workspace.FeedBlob('Y', Y) - - net = core.Net("eq_test") - result = net.EQ(["X", "Y"], 1) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - - self.assertEqual(shapes[result], list(workspace.blobs[result].shape)) - self.assertEqual(shapes[result], list(X.shape)) - self.assertEqual(types[result], core.DataType.BOOL) - - @given(n=st.integers(0, 6), m=st.integers(4, 6), **hu.gcs) - @settings(deadline=10000) - def test_eq_bcast(self, n, m, gc, dc): - # Set broadcast and no axis, i.e. broadcasting last dimensions. - X = np.random.randint(2, size=(n, m)) - Y = np.random.randint(2, size=(m,)) - op = core.CreateOperator("EQ", ["X", "Y"], "out", broadcast=1) - - def eq(X, Y): - return [X == Y] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, Y], - reference=eq, - ensure_outputs_are_inferred=True, - ) - - workspace.FeedBlob('X', X) - workspace.FeedBlob('Y', Y) - - net = core.Net("eq_bcast") - result = net.EQ(["X", "Y"], 1, broadcast=1) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - self.assertTrue(str(result) in shapes) - self.assertEqual(shapes[result], list(workspace.blobs[result].shape)) - self.assertEqual(shapes[result], list(X.shape)) - self.assertEqual(types[result], core.DataType.BOOL) - - net_2 = core.Net("eq_bcast_invalid") - result_2 = net_2.EQ(["X", "Y"], 1) - (shapes, types) = workspace.InferShapesAndTypes([net_2]) - self.assertTrue(str(result_2) not in shapes) - - def _run_single_test( - self, op, ref, A, B, reverse_inputs, test_grad, gc, dc): - inputs = [A, B] - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - if test_grad: - for i in range(len(inputs)): - self.assertGradientChecks( - gc, op, inputs, i, [0], - ensure_outputs_are_inferred=True, - ) - - if reverse_inputs: - inputs = [B, A] - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - if test_grad: - for i in range(len(inputs)): - self.assertGradientChecks( - gc, op, inputs, i, [0], - ensure_outputs_are_inferred=True, - ) - - def _test_binary_op( - self, op_name, np_ref, n, m, k, t, bias, test_grad, gc, dc): - op = core.CreateOperator( - op_name, 
- ["A", "B"], - ["C"], - ) - - def ref(A, B): - return [np_ref(A, B)] - - A = np.random.rand(n, m, k, t).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(1).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(k, t).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(n, m, 1, 1).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(1, m, k, 1).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(m, 1, t).astype(np.float32) + bias - B = np.random.rand(n, m, k, t).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - A = np.random.rand(1, m, 1, t).astype(np.float32) + bias - B = np.random.rand(n, 1, k, 1).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, True, test_grad, gc, dc) - - def _test_binary_op_in_place( - self, op_name, np_ref, n, m, bias, test_grad, in_place_2nd, gc, dc): - def ref(A, B): - return [np_ref(A, B)] - - op = core.CreateOperator( - op_name, - ["A", "B"], - ["A"], - ) - A = np.random.rand(n, m).astype(np.float32) + bias - B = np.random.rand(m).astype(np.float32) + bias - - self._run_single_test(op, ref, A, B, False, test_grad, gc, dc) - A = np.random.rand(n, m).astype(np.float32) + bias - B = np.random.rand(n, 1).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, False, test_grad, gc, dc) - - if in_place_2nd: - op = core.CreateOperator( - op_name, - ["A", "B"], - ["B"], - ) - A = np.random.rand(m).astype(np.float32) + bias - B = np.random.rand(n, m).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, False, test_grad, gc, dc) - A = np.random.rand(n, 1).astype(np.float32) + bias - B = np.random.rand(n, m).astype(np.float32) + bias - self._run_single_test(op, ref, A, B, False, test_grad, gc, dc) - - @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5), - t=st.integers(0, 5), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_add(self, n, m, k, t, gc, dc): - self._test_binary_op("Add", np.add, n, m, k, t, -0.5, True, gc, dc) - self._test_binary_op_in_place( - "Add", np.add, n, m, -0.5, True, True, gc, dc) - - @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5), - t=st.integers(0, 5), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_sub(self, n, m, k, t, gc, dc): - self._test_binary_op("Sub", np.subtract, n, m, - k, t, -0.5, True, gc, dc) - self._test_binary_op_in_place( - "Sub", np.subtract, n, m, -0.5, True, True, gc, dc) - - @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5), - t=st.integers(0, 5), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_mul(self, n, m, k, t, gc, dc): - self._test_binary_op("Mul", np.multiply, n, m, - k, t, -0.5, True, gc, dc) - - @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5), - t=st.integers(0, 5), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_div(self, n, m, k, t, gc, dc): - self._test_binary_op("Div", np.divide, n, m, k, t, 1.0, True, gc, 
dc) - self._test_binary_op_in_place( - "Div", np.divide, n, m, 1.0, True, False, gc, dc) - - @given(n=st.integers(1, 5), m=st.integers(1, 5), broadcast=st.booleans(), - allow_broadcast_fastpath=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_div_legacy_grad( - self, - n: int, - m: int, - broadcast: bool, - allow_broadcast_fastpath: bool, - gc, - dc - ): - op = core.CreateOperator( - "DivGradient", - ["B", "C", "dC"], - ["dA", "dB"], - allow_broadcast_fastpath=allow_broadcast_fastpath, - ) - - def div_grad_ref(B, C, dC): - dA = dC / B - dB = -dC * C / B - if broadcast: - dB = np.sum(dB, axis=0) - return [dA, dB] - - if broadcast: - B = np.random.rand(m).astype(np.float32) + 1.0 - else: - B = np.random.rand(n, m).astype(np.float32) + 1.0 - C = np.random.randn(n, m).astype(np.float32) - dC = np.random.randn(n, m).astype(np.float32) - inputs = [B, C, dC] - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=div_grad_ref, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1]) - - def _test_bitwise_binary_op(self, op_name, np_ref, n, m, k, t, gc, dc): - op = core.CreateOperator( - op_name, - ["A", "B"], - ["C"], - ) - - def ref(A, B): - return [np_ref(A, B)] - - A = np.random.randint(128, size=(n, m, k, t)) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=1) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=(k, t)) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=(n, m, 1, 1)) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=(1, m, k, 1)) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=(m, 1, t)) - B = np.random.randint(128, size=(n, m, k, t)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - A = np.random.randint(128, size=(1, m, 1, t)) - B = np.random.randint(128, size=(n, 1, k, 1)) - self._run_single_test(op, ref, A, B, True, False, gc, dc) - - @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), - t=st.integers(1, 5), **hu.gcs) - @settings(deadline=10000) - def test_bitwise_and(self, n, m, k, t, gc, dc): - self._test_bitwise_binary_op( - "BitwiseAnd", np.bitwise_and, n, m, k, t, gc, dc) - - @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), - t=st.integers(1, 5), **hu.gcs) - @settings(deadline=10000) - def test_bitwise_or(self, n, m, k, t, gc, dc): - self._test_bitwise_binary_op( - "BitwiseOr", np.bitwise_or, n, m, k, t, gc, dc) - - @given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), - t=st.integers(1, 5), **hu.gcs) - @settings(deadline=10000) - def test_bitwise_xor(self, n, m, k, t, gc, dc): - self._test_bitwise_binary_op( - "BitwiseXor", np.bitwise_xor, n, m, k, t, gc, dc) - - @given(X=hu.tensor(elements=hu.floats(min_value=0.5, max_value=2), dtype=np.float32), - inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_reciprocal(self, X, inplace, gc, dc): - def reciprocal_op(X): - return [np.reciprocal(X)] - - op = core.CreateOperator( - "Reciprocal", - ["X"], - ["X"] if inplace else ["Y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=reciprocal_op, 
- ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-3, threshold=0.05, - ensure_outputs_are_inferred=True) - - @given(X=hu.tensor(dtype=bool), **hu.gcs) - @settings(deadline=10000) - def test_not(self, X, gc, dc): - def not_op(X): - return [np.logical_not(X)] - - op = core.CreateOperator( - "Not", - ["X"], - ["Y"], - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=not_op, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(X=hu.tensor(dtype=np.float32), **hu.gcs) - @settings(deadline=10000) - def test_log1p(self, X, gc, dc): - op = core.CreateOperator( - "Log1p", - ["X"], - ["Y"] - ) - - def ref_log1p(input): - result = np.log1p(input) - return (result,) - - def ref_log1p_grad(g_out, outputs, fwd_inputs): - result = g_out / (fwd_inputs[0] + 1) - return (result,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ref_log1p, - output_to_grad="Y", - grad_reference=ref_log1p_grad, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/emptysample_ops_test.py b/caffe2/python/operator_test/emptysample_ops_test.py deleted file mode 100644 index 0f728b723163..000000000000 --- a/caffe2/python/operator_test/emptysample_ops_test.py +++ /dev/null @@ -1,63 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import numpy as np - - -lengths = [[0], [1, 2], [1, 0, 2, 0]] -features1 = [[], - [1, 2, 2], - [[1, 1], [2, 2], [2, 2]] - ] -features2 = [[], - [2, 4, 4], - [[2, 2], [4, 4], [4, 4]] - ] - -lengths_exp = [[1], [1, 2], [1, 1, 2, 1]] -features1_exp = [[0], - [1, 2, 2], - [[1, 1], [0, 0], [2, 2], [2, 2], [0, 0]]] -features2_exp = [[0], - [2, 4, 4], - [[2, 2], [0, 0], [4, 4], [4, 4], [0, 0]]] - - -class TestEmptySampleOps(TestCase): - def test_emptysample(self): - for i in range(0, 3): - PadEmptyTest = core.CreateOperator( - 'PadEmptySamples', - ['lengths', 'features1', 'features2'], - ['out_lengths', 'out_features1', 'out_features2'], - ) - workspace.FeedBlob( - 'lengths', - np.array(lengths[i], dtype=np.int32)) - workspace.FeedBlob( - 'features1', - np.array(features1[i], dtype=np.int64)) - workspace.FeedBlob( - 'features2', - np.array(features2[i], dtype=np.int64)) - workspace.RunOperatorOnce(PadEmptyTest) - np.testing.assert_allclose( - lengths_exp[i], - workspace.FetchBlob('out_lengths'), - atol=1e-4, rtol=1e-4, err_msg='Mismatch in lengths') - np.testing.assert_allclose( - features1_exp[i], - workspace.FetchBlob('out_features1'), - atol=1e-4, rtol=1e-4, err_msg='Mismatch in features1') - np.testing.assert_allclose( - features2_exp[i], - workspace.FetchBlob('out_features2'), - atol=1e-4, rtol=1e-4, err_msg='Mismatch in features2') - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/enforce_finite_op_test.py b/caffe2/python/operator_test/enforce_finite_op_test.py deleted file mode 100644 index 8150977945a2..000000000000 --- a/caffe2/python/operator_test/enforce_finite_op_test.py +++ /dev/null @@ -1,53 +0,0 @@ - - - - - -from hypothesis import given, settings -import numpy as np - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - - -class TestEnforceFinite(hu.HypothesisTestCase): - @given( - X=hu.tensor( - # allow empty 
- min_value=0, - elements=hu.floats(allow_nan=True, allow_infinity=True), - ), - **hu.gcs - ) - @settings(deadline=10000) - def test_enforce_finite(self, X, gc, dc): - - def all_finite_value(X): - if X.size <= 0: - return True - - return np.isfinite(X).all() - - net = core.Net('test_net') - net.Const(array=X, blob_out="X") - net.EnforceFinite("X", []) - - if all_finite_value(X): - self.assertTrue(workspace.RunNetOnce(net)) - else: - with self.assertRaises(RuntimeError): - workspace.RunNetOnce(net) - - @given( - X=hu.tensor( - elements=hu.floats(min_value=0, max_value=10, allow_nan=False, allow_infinity=False), - ), - **hu.gcs - ) - def test_enforce_finite_device_check(self, X, gc, dc): - op = core.CreateOperator( - "EnforceFinite", - ["X"], - [], - ) - self.assertDeviceChecks(dc, op, [X], []) diff --git a/caffe2/python/operator_test/ensure_clipped_test.py b/caffe2/python/operator_test/ensure_clipped_test.py deleted file mode 100644 index a89718745b1c..000000000000 --- a/caffe2/python/operator_test/ensure_clipped_test.py +++ /dev/null @@ -1,43 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt -from caffe2.python import core, workspace -from hypothesis import given - - -class TestEnsureClipped(hu.HypothesisTestCase): - @given( - X=hu.arrays(dims=[5, 10], elements=hu.floats(min_value=-1.0, max_value=1.0)), - in_place=st.booleans(), - sparse=st.booleans(), - indices=hu.arrays(dims=[5], elements=st.booleans()), - **hu.gcs_cpu_only - ) - def test_ensure_clipped(self, X, in_place, sparse, indices, gc, dc): - if (not in_place) and sparse: - return - param = X.astype(np.float32) - m, n = param.shape - indices = np.array(np.nonzero(indices)[0], dtype=np.int64) - grad = np.random.rand(len(indices), n) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("grad", grad) - workspace.FeedBlob("param", param) - input = ["param", "indices", "grad"] if sparse else ["param"] - output = "param" if in_place else "output" - op = core.CreateOperator("EnsureClipped", input, output, min=0.0) - workspace.RunOperatorOnce(op) - - def ref(): - return ( - np.array( - [np.clip(X[i], 0, None) if i in indices else X[i] for i in range(m)] - ) - if sparse - else np.clip(X, 0, None) - ) - - npt.assert_allclose(workspace.blobs[output], ref(), rtol=1e-3) diff --git a/caffe2/python/operator_test/ensure_cpu_output_op_test.py b/caffe2/python/operator_test/ensure_cpu_output_op_test.py deleted file mode 100644 index 4812ee3042e0..000000000000 --- a/caffe2/python/operator_test/ensure_cpu_output_op_test.py +++ /dev/null @@ -1,51 +0,0 @@ - - - - - -from hypothesis import given -import numpy as np -import hypothesis.strategies as st - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu - - -@st.composite -def _dev_options(draw): - op_dev = draw(st.sampled_from(hu.device_options)) - if op_dev == hu.cpu_do: - # the CPU op can only handle CPU tensor - input_blob_dev = hu.cpu_do - else: - input_blob_dev = draw(st.sampled_from(hu.device_options)) - - return op_dev, input_blob_dev - - -class TestEnsureCPUOutputOp(hu.HypothesisTestCase): - - @given( - input=hu.tensor(dtype=np.float32), - dev_options=_dev_options() - ) - def test_ensure_cpu_output(self, input, dev_options): - op_dev, input_blob_dev = dev_options - net = core.Net('test_net') - data = net.GivenTensorFill( - [], - ["data"], - values=input, - shape=input.shape, - device_option=input_blob_dev - ) - - data_cpu = net.EnsureCPUOutput( - 
[data], - ["data_cpu"], - device_option=op_dev - ) - workspace.RunNetOnce(net) - - data_cpu_value = workspace.FetchBlob(data_cpu) - np.testing.assert_allclose(input, data_cpu_value) diff --git a/caffe2/python/operator_test/erf_op_test.py b/caffe2/python/operator_test/erf_op_test.py deleted file mode 100644 index a4ed0d5fb23e..000000000000 --- a/caffe2/python/operator_test/erf_op_test.py +++ /dev/null @@ -1,30 +0,0 @@ - - - - - -import math - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -import numpy as np -import unittest - - -class TestErfOp(serial.SerializedTestCase): - @given( - X=hu.tensor(elements=hu.floats(min_value=-0.7, max_value=0.7)), - **hu.gcs) - @settings(deadline=10000) - def test_erf(self, X, gc, dc): - op = core.CreateOperator('Erf', ["X"], ["Y"]) - self.assertReferenceChecks(gc, op, [X], lambda x: (np.vectorize(math.erf)(X),)) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/expand_op_test.py b/caffe2/python/operator_test/expand_op_test.py deleted file mode 100644 index bd608f6fcc24..000000000000 --- a/caffe2/python/operator_test/expand_op_test.py +++ /dev/null @@ -1,65 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestExpandOp(serial.SerializedTestCase): - def _rand_shape(self, X_shape, max_length): - length = np.random.randint(max_length) - shape = np.ones(length, dtype=np.int64) - i = len(X_shape) - 1 - for j in reversed(range(length)): - if i >= 0: - k = np.random.choice([1, X_shape[i]]) - i -= 1 - else: - k = np.random.randint(3) + 1 - shape[j] = k - return shape - - def _run_expand_op_test(self, X, shape, gc, dc): - shape = np.array(shape) - op = core.CreateOperator( - 'Expand', - ["X", "shape"], - ["Y"], - ) - def ref(X, shape): - return (X * np.ones(abs(shape)),) - - self.assertReferenceChecks(gc, op, [X, shape], ref) - self.assertDeviceChecks(dc, op, [X, shape], [0]) - self.assertGradientChecks(gc, op, [X, shape], 0, [0]) - - @serial.given(X=hu.tensor(max_dim=5, dtype=np.float32), - **hu.gcs) - def test_expand_rand_shape(self, X, gc, dc): - shape = self._rand_shape(X.shape, 5) - self._run_expand_op_test(X, shape, gc, dc) - - @given(X=st.sampled_from([np.ones([1, 3, 1]), - np.ones([3, 1, 3]), - np.ones([1, 3])]), - **hu.gcs) - def test_expand_nonrand_shape1(self, X, gc, dc): - self._run_expand_op_test(X, [3, 1, 3], gc, dc) - self._run_expand_op_test(X, [3, -1, 3], gc, dc) - - - @given(X=st.sampled_from([np.ones([4, 4, 2, 1]), - np.ones([1, 4, 1, 2]), - np.ones([4, 1, 2])]), - **hu.gcs) - @settings(deadline=10000) - def test_expand_nonrand_shape2(self, X, gc, dc): - self._run_expand_op_test(X, [4, 1, 2, 2], gc, dc) - self._run_expand_op_test(X, [4, -1, 2, 2], gc, dc) diff --git a/caffe2/python/operator_test/fc_operator_test.py b/caffe2/python/operator_test/fc_operator_test.py deleted file mode 100644 index bd203b7c84a6..000000000000 --- a/caffe2/python/operator_test/fc_operator_test.py +++ /dev/null @@ -1,108 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -from hypothesis import assume, given, settings, HealthCheck -import 
caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestFcOperator(serial.SerializedTestCase): - def _run_test(self, n, m, k, transposed, multi_dim, dtype, engine, gc, dc): - if dtype == np.float16: - # fp16 only supported with CUDA/HIP - assume(core.IsGPUDeviceType(gc.device_type)) - dc = [d for d in dc if core.IsGPUDeviceType(d.device_type)] - - if engine == 'TENSORCORE': - # TensorCore only makes sense with CUDA - assume(gc.device_type == caffe2_pb2.CUDA) - # ensures TensorCore kernels can be called - m *= 8 - k *= 8 - n *= 8 - - X = np.random.rand(m, k).astype(dtype) - 0.5 - if multi_dim: - if transposed: - W = np.random.rand(k, n, 1, 1).astype(dtype) - 0.5 - else: - W = np.random.rand(n, k, 1, 1).astype(dtype) - 0.5 - else: - if transposed: - W = np.random.rand(k, n).astype(dtype) - 0.5 - else: - W = np.random.rand(n, k).astype(dtype) - 0.5 - b = np.random.rand(n).astype(dtype) - 0.5 - - def fc_op(X, W, b): - return [np.dot(X, W.reshape(n, k).transpose()) + b.reshape(n)] - - def fc_transposed_op(X, W, b): - return [np.dot(X, W.reshape(k, n)) + b.reshape(n)] - - op = core.CreateOperator( - 'FCTransposed' if transposed else 'FC', - ['X', 'W', 'b'], - 'out', - engine=engine, - ) - - if dtype == np.float16 and core.IsGPUDeviceType(gc.device_type): - a = caffe2_pb2.Argument() - a.i = 1 - a.name = "float16_compute" - op.arg.extend([a]) - - # Check against numpy reference - # ReferenceChecks is flaky, Relaxing to 1e-3. - threshold = 1e-3 - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, W, b], - reference=fc_transposed_op if transposed else fc_op, - threshold=threshold - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X, W, b], [0]) - - # Gradient checks - threshold = 0.5 if dtype == np.float16 else 0.005 - stepsize = 0.5 if dtype == np.float16 else 0.05 - for i in range(3): - self.assertGradientChecks(gc, op, [X, W, b], i, [0], - threshold=threshold, stepsize=stepsize) - - @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @serial.given(n=st.integers(1, 5), - m=st.integers(0, 5), - k=st.integers(1, 5), - multi_dim=st.sampled_from([True, False]), - dtype=st.sampled_from([np.float32, np.float16]), - engine=st.sampled_from(['', 'TENSORCORE']), - **hu.gcs) - def test_fc(self, **kwargs): - self._run_test(transposed=False, **kwargs) - - @settings(max_examples=50, suppress_health_check=[HealthCheck.filter_too_much]) - @given(n=st.integers(1, 5), - m=st.integers(0, 5), - k=st.integers(1, 5), - multi_dim=st.sampled_from([True, False]), - dtype=st.sampled_from([np.float32, np.float16]), - engine=st.sampled_from(['', 'TENSORCORE']), - **hu.gcs) - def test_fc_transposed(self, **kwargs): - self._run_test(transposed=True, **kwargs) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/feature_maps_ops_test.py b/caffe2/python/operator_test/feature_maps_ops_test.py deleted file mode 100644 index aed2168c0f53..000000000000 --- a/caffe2/python/operator_test/feature_maps_ops_test.py +++ /dev/null @@ -1,705 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import numpy as np - - -class TestFeatureMapsOps(TestCase): - - def test_merge_dense_feature_tensors(self): - op = core.CreateOperator( - "MergeDenseFeatureTensors", - [ - "in1", "in1_presence", - ], - [ - "out_lengths", "out_keys", 
"out_values", - ], - feature_ids=[11, 12, 13, 14] - ) - # Input 1. - workspace.FeedBlob( - "in1", - np.array([[11.1, 12.1, 13.1, 14.1], [11.2, 12.2, 13.2, 14.2]], dtype=np.float64) - ) - workspace.FeedBlob( - "in1_presence", - np.array([[True, False, False, True], [False, True, True, False]], dtype=bool) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([2, 2], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 14, 12, 13], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values"), - np.array([11.1, 14.1, 12.2, 13.2], dtype=np.float64) - ) - - - def test_merge_single_scalar_feature_tensors(self): - op = core.CreateOperator( - "MergeSingleScalarFeatureTensors", - [ - "in1", "in1_presence", - "in2", "in2_presence", - ], - [ - "out_lengths", "out_keys", "out_values", - ], - feature_ids=[11, 12] - ) - - # Input 1. - workspace.FeedBlob( - "in1", - np.array([11.1, 0.0], dtype=np.float64) - ) - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - # Input 2. - workspace.FeedBlob( - "in2", - np.array([12.1, 12.2], dtype=np.float64) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([2, 1], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 12, 12], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values"), - np.array([11.1, 12.1, 12.2], dtype=np.float64) - ) - - def test_merge_single_scalar_feature_tensors_gradient(self): - op = core.CreateOperator( - "MergeSingleScalarFeatureTensorsGradient", - [ - "in1_presence", - "in2_presence", - "in3_presence", - "out_values_grad", - ], - [ - "in1_grad", "in2_grad", "in3_grad", - ], - ) - - # Inputs 1, 2 & 3. - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - workspace.FeedBlob( - "in3_presence", - np.array([False, True], dtype=bool) - ) - # Input 4. - workspace.FeedBlob( - "out_values_grad", - np.array([0.1, 1.1, 1.2, 2.3], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("in1_grad"), - np.array([0.1, 0], dtype=np.float64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in2_grad"), - np.array([1.1, 1.2], dtype=np.float64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in3_grad"), - np.array([0, 2.3], dtype=np.float64) - ) - - def test_merge_single_scalar_feature_tensors_gradient_with_strings(self): - op = core.CreateOperator( - "MergeSingleScalarFeatureTensorsGradient", - [ - "in1_presence", - "in2_presence", - "in3_presence", - "out_values_grad", - ], - [ - "in1_grad", "in2_grad", "in3_grad", - ], - ) - - # Inputs 1, 2 & 3. - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - workspace.FeedBlob( - "in3_presence", - np.array([False, True], dtype=bool) - ) - # Input 4. 
- workspace.FeedBlob( - "out_values_grad", - np.array(["0.1", "1.1", "1.2", "2.3"], dtype=np.unicode_) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("in1_grad"), - np.array(["0.1", ""], dtype=np.bytes_) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in2_grad"), - np.array(["1.1", "1.2"], dtype=np.bytes_) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in3_grad"), - np.array(["", "2.3"], dtype=np.bytes_) - ) - - def test_merge_single_list_feature_tensors(self): - op = core.CreateOperator( - "MergeSingleListFeatureTensors", - [ - "in1_lengths", "in1_values", "in1_presence", - "in2_lengths", "in2_values", "in2_presence", - ], - [ - "out_lengths", "out_keys", "out_values_lengths", - "out_values_values", - ], - feature_ids=[11, 12] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([2, 0], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_values", - np.array([11.1, 11.2], dtype=np.float64) - ) - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - # Input 2. - workspace.FeedBlob( - "in2_lengths", - np.array([2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_values", - np.array([12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([2, 1], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 12, 12], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_lengths"), - np.array([2, 2, 2], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_values"), - np.array([11.1, 11.2, 12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - - def test_merge_single_list_feature_tensors_gradient(self): - self._test_merge_single_list_or_map_feature_tensors_gradient( - "MergeSingleListFeatureTensorsGradient" - ) - - def test_merge_single_map_feature_tensors_gradient(self): - self._test_merge_single_list_or_map_feature_tensors_gradient( - "MergeSingleMapFeatureTensorsGradient" - ) - - def _test_merge_single_list_or_map_feature_tensors_gradient(self, op_name): - op = core.CreateOperator( - op_name, - [ - "in1_lengths", "in1_presence", - "in2_lengths", "in2_presence", - "out_values_values_grad", - ], - [ - "in1_values_grad", - "in2_values_grad", - ], - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([2, 0], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - # Input 2. 
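The list-valued variant above adds one level of indirection: each present feature contributes a lengths-delimited sublist. A sketch under the same assumptions (illustrative helper, not the operator):

# Each input is a (lengths, values) pair in the usual Caffe2
# lengths-delimited layout; present sublists are concatenated and
# described by out_values_lengths.
def merge_single_list(lengths_list, values_list, presences, feature_ids):
    out_lengths, out_keys, out_vlens, out_vvals = [], [], [], []
    offsets = [0] * len(values_list)
    for ex in range(len(presences[0])):
        count = 0
        for f, fid in enumerate(feature_ids):
            l = lengths_list[f][ex]
            if presences[f][ex]:
                out_keys.append(fid)
                out_vlens.append(l)
                out_vvals.extend(values_list[f][offsets[f]:offsets[f] + l])
                count += 1
            offsets[f] += l
        out_lengths.append(count)
    return out_lengths, out_keys, out_vlens, out_vvals

res = merge_single_list(
    [[2, 0], [2, 2]],
    [[11.1, 11.2], [12.1, 12.2, 12.3, 12.4]],
    [[True, False], [True, True]],
    [11, 12],
)
assert res[0] == [2, 1] and res[1] == [11, 12, 12] and res[2] == [2, 2, 2]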
- workspace.FeedBlob( - "in2_lengths", - np.array([2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - workspace.FeedBlob( - "out_values_values_grad", - np.array([11.1, 11.2, 12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("in1_values_grad"), - np.array([11.1, 11.2], dtype=np.float64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in2_values_grad"), - np.array([12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - - def test_merge_single_map_feature_tensors(self): - op = core.CreateOperator( - "MergeSingleMapFeatureTensors", - [ - "in1_lengths", "in1_keys", "in1_values", "in1_presence", - "in2_lengths", "in2_keys", "in2_values", "in2_presence", - ], - [ - "out_lengths", "out_keys", "out_values_lengths", - "out_values_keys", "out_values_values", - ], - feature_ids=[11, 12] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([2, 0], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_keys", - np.array([111, 112], dtype=np.int64) - ) - workspace.FeedBlob( - "in1_values", - np.array([11.1, 11.2], dtype=np.float64) - ) - workspace.FeedBlob( - "in1_presence", - np.array([True, False], dtype=bool) - ) - # Input 2. - workspace.FeedBlob( - "in2_lengths", - np.array([2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_keys", - np.array([121, 122, 123, 124], dtype=np.int64) - ) - workspace.FeedBlob( - "in2_values", - np.array([12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - workspace.FeedBlob( - "in2_presence", - np.array([True, True], dtype=bool) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([2, 1], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 12, 12], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_lengths"), - np.array([2, 2, 2], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_keys"), - np.array([111, 112, 121, 122, 123, 124], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_values"), - np.array([11.1, 11.2, 12.1, 12.2, 12.3, 12.4], dtype=np.float64) - ) - - def test_merge_multi_scalar_feature_tensors(self): - op = core.CreateOperator( - "MergeMultiScalarFeatureTensors", - [ - "in1_lengths", "in1_keys", "in1_values", - "in2_lengths", "in2_keys", "in2_values", - ], - [ - "out_lengths", "out_keys", "out_values", - ] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([1, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_keys", - np.array([11, 12, 13], dtype=np.int64) - ) - workspace.FeedBlob( - "in1_values", - np.array([11.0, 12.0, 13.0], dtype=np.float64) - ) - # Input 2. 
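The gradient direction of these merges is a routing problem: the merged values gradient is walked in merge order and handed back to each input. A pure-Python sketch of that routing (presence handling omitted, since the absent feature already carries length 0 in the test data):

def route_grad(in_lengths, out_grad):
    # in_lengths: one lengths list per input; out_grad: merged gradient.
    grads = [[] for _ in in_lengths]
    pos = 0
    for ex in range(len(in_lengths[0])):
        for f, lengths in enumerate(in_lengths):
            l = lengths[ex]
            grads[f].extend(out_grad[pos:pos + l])
            pos += l
    return grads

g1, g2 = route_grad(
    [[2, 0], [2, 2]],
    [11.1, 11.2, 12.1, 12.2, 12.3, 12.4],
)
assert g1 == [11.1, 11.2]
assert g2 == [12.1, 12.2, 12.3, 12.4]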
- workspace.FeedBlob( - "in2_lengths", - np.array([2, 1], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_keys", - np.array([14, 15, 16], dtype=np.int64) - ) - workspace.FeedBlob( - "in2_values", - np.array([14.0, 15.0, 16.0], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([3, 3], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 14, 15, 12, 13, 16], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values"), - np.array([11.0, 14.0, 15.0, 12.0, 13.0, 16.0], dtype=np.float64) - ) - - def test_merge_multi_scalar_feature_tensors_gradient(self): - op = core.CreateOperator( - "MergeMultiScalarFeatureTensorsGradient", - [ - "in1_lengths", - "in2_lengths", - "out_values_grad" - ], - [ - "in1_values_grad", - "in2_values_grad", - ] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([1, 2, 0], dtype=np.int32) - ) - # Input 2. - workspace.FeedBlob( - "in2_lengths", - np.array([2, 1, 1], dtype=np.int32) - ) - # Grad input. - workspace.FeedBlob( - "out_values_grad", - np.array([11.0, 14.0, 15.0, 12.0, 13.0, 16.0, 17.0], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("in1_values_grad"), - np.array([11.0, 12.0, 13.0], dtype=np.float64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in2_values_grad"), - np.array([14.0, 15.0, 16.0, 17.0], dtype=np.float64) - ) - - def test_merge_multi_list_feature_tensors(self): - op = core.CreateOperator( - "MergeMultiListFeatureTensors", - [ - "in1_lengths", "in1_keys", "in1_values_lengths", - "in1_values_values", - "in2_lengths", "in2_keys", "in2_values_lengths", - "in2_values_values", - ], - [ - "out_lengths", "out_keys", "out_values_lengths", - "out_values_values" - ] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([1, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_keys", - np.array([11, 12, 13], dtype=np.int64) - ) - workspace.FeedBlob( - "in1_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_values_values", - np.array([11.1, 11.2, 12.1, 12.2, 13.1, 13.2], dtype=np.float64) - ) - # Input 2. 
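The multi-feature merge interleaves per-example key/value segments rather than single values; a sketch matching the expected output of the test above (illustrative names):

def merge_multi_scalar(lengths_in, keys_in, values_in):
    # Per example, concatenate each input's segment (delimited by its
    # lengths); out_lengths is the per-example sum of input lengths.
    out_lengths, out_keys, out_values = [], [], []
    offsets = [0] * len(lengths_in)
    for ex in range(len(lengths_in[0])):
        total = 0
        for f in range(len(lengths_in)):
            l = lengths_in[f][ex]
            out_keys.extend(keys_in[f][offsets[f]:offsets[f] + l])
            out_values.extend(values_in[f][offsets[f]:offsets[f] + l])
            offsets[f] += l
            total += l
        out_lengths.append(total)
    return out_lengths, out_keys, out_values

out = merge_multi_scalar(
    [[1, 2], [2, 1]],
    [[11, 12, 13], [14, 15, 16]],
    [[11.0, 12.0, 13.0], [14.0, 15.0, 16.0]],
)
assert out[0] == [3, 3] and out[1] == [11, 14, 15, 12, 13, 16]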
- workspace.FeedBlob( - "in2_lengths", - np.array([2, 1], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_keys", - np.array([14, 15, 16], dtype=np.int64) - ) - workspace.FeedBlob( - "in2_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_values_values", - np.array([14.1, 14.2, 15.1, 15.2, 16.1, 16.2], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([3, 3], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 14, 15, 12, 13, 16], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_lengths"), - np.array([2, 2, 2, 2, 2, 2], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_values"), - np.array( - [ - 11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2, - 16.1, 16.2 - ], - dtype=np.float64 - ) - ) - - def test_merge_multi_map_feature_tensors(self): - op = core.CreateOperator( - "MergeMultiMapFeatureTensors", - [ - "in1_lengths", "in1_keys", "in1_values_lengths", - "in1_values_keys", "in1_values_values", - "in2_lengths", "in2_keys", "in2_values_lengths", - "in2_values_keys", "in2_values_values", - ], - [ - "out_lengths", "out_keys", "out_values_lengths", - "out_values_keys", "out_values_values" - ] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([1, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_keys", - np.array([11, 12, 13], dtype=np.int64) - ) - workspace.FeedBlob( - "in1_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_values_keys", - np.array([111, 112, 121, 122, 131, 132], dtype=np.int64) - ) - workspace.FeedBlob( - "in1_values_values", - np.array([11.1, 11.2, 12.1, 12.2, 13.1, 13.2], dtype=np.float64) - ) - # Input 2. 
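For the list-valued multi-feature merge, each merged key drags its own sublist of values along; a sketch of that nesting, checked against the expected arrays above (again an illustrative reimplementation):

def merge_multi_list(lengths_in, keys_in, vlens_in, vvals_in):
    out_keys, out_vlens, out_vvals = [], [], []
    key_off = [0] * len(lengths_in)   # per-input cursor over keys
    val_off = [0] * len(lengths_in)   # per-input cursor over values
    for ex in range(len(lengths_in[0])):
        for f in range(len(lengths_in)):
            for _ in range(lengths_in[f][ex]):
                out_keys.append(keys_in[f][key_off[f]])
                l = vlens_in[f][key_off[f]]
                out_vlens.append(l)
                out_vvals.extend(vvals_in[f][val_off[f]:val_off[f] + l])
                key_off[f] += 1
                val_off[f] += l
    return out_keys, out_vlens, out_vvals

keys, vlens, vvals = merge_multi_list(
    [[1, 2], [2, 1]],
    [[11, 12, 13], [14, 15, 16]],
    [[2, 2, 2], [2, 2, 2]],
    [[11.1, 11.2, 12.1, 12.2, 13.1, 13.2],
     [14.1, 14.2, 15.1, 15.2, 16.1, 16.2]],
)
assert keys == [11, 14, 15, 12, 13, 16] and vlens == [2] * 6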
- workspace.FeedBlob( - "in2_lengths", - np.array([2, 1], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_keys", - np.array([14, 15, 16], dtype=np.int64) - ) - workspace.FeedBlob( - "in2_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_values_keys", - np.array([141, 142, 151, 152, 161, 162], dtype=np.int64) - ) - workspace.FeedBlob( - "in2_values_values", - np.array([14.1, 14.2, 15.1, 15.2, 16.1, 16.2], dtype=np.float64) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("out_lengths"), - np.array([3, 3], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_keys"), - np.array([11, 14, 15, 12, 13, 16], dtype=np.int64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_lengths"), - np.array([2, 2, 2, 2, 2, 2], dtype=np.int32) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_keys"), - np.array( - [111, 112, 141, 142, 151, 152, 121, 122, 131, 132, 161, 162], - dtype=np.int64 - ) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("out_values_values"), - np.array( - [ - 11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2, - 16.1, 16.2 - ], - dtype=np.float64 - ) - ) - - def test_merge_multi_list_feature_tensors_gradient(self): - self._test_merge_multi_list_or_map_feature_tensors_gradient( - "MergeMultiListFeatureTensorsGradient" - ) - - def test_merge_multi_map_feature_tensors_gradient(self): - self._test_merge_multi_list_or_map_feature_tensors_gradient( - "MergeMultiMapFeatureTensorsGradient" - ) - - def _test_merge_multi_list_or_map_feature_tensors_gradient(self, op_name): - op = core.CreateOperator( - op_name, - [ - "in1_lengths", "in1_values_lengths", - "in2_lengths", "in2_values_lengths", - "out_values_values_grad" - ], - [ - "in1_values_values_grad", - "in2_values_values_grad", - ] - ) - - # Input 1. - workspace.FeedBlob( - "in1_lengths", - np.array([1, 2], dtype=np.int32) - ) - workspace.FeedBlob( - "in1_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - # Input 2. - workspace.FeedBlob( - "in2_lengths", - np.array([2, 1], dtype=np.int32) - ) - workspace.FeedBlob( - "in2_values_lengths", - np.array([2, 2, 2], dtype=np.int32) - ) - # Grad Input. 
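Routing gradients back through the nested merges needs both levels of lengths: the outer lengths say how many keys each input contributed per example, and values_lengths says how many values each of those keys owns. A sketch (illustrative reimplementation, checked against the expected arrays below):

def route_multi_grad(lengths_in, values_lengths_in, out_grad):
    grads = [[] for _ in lengths_in]
    key_off = [0] * len(lengths_in)   # per-input position in values_lengths
    pos = 0                           # position in the merged gradient
    for ex in range(len(lengths_in[0])):
        for f in range(len(lengths_in)):
            nkeys = lengths_in[f][ex]
            nvals = sum(values_lengths_in[f][key_off[f]:key_off[f] + nkeys])
            grads[f].extend(out_grad[pos:pos + nvals])
            key_off[f] += nkeys
            pos += nvals
    return grads

g1, g2 = route_multi_grad(
    [[1, 2], [2, 1]],
    [[2, 2, 2], [2, 2, 2]],
    [11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2, 16.1, 16.2],
)
assert g1 == [11.1, 11.2, 12.1, 12.2, 13.1, 13.2]
assert g2 == [14.1, 14.2, 15.1, 15.2, 16.1, 16.2]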
- workspace.FeedBlob( - "out_values_values_grad", - np.array( - [ - 11.1, 11.2, 14.1, 14.2, 15.1, 15.2, 12.1, 12.2, 13.1, 13.2, - 16.1, 16.2 - ], - dtype=np.float64 - ) - ) - - workspace.RunOperatorOnce(op) - - np.testing.assert_array_equal( - workspace.FetchBlob("in1_values_values_grad"), - np.array([11.1, 11.2, 12.1, 12.2, 13.1, 13.2], dtype=np.float64) - ) - np.testing.assert_array_equal( - workspace.FetchBlob("in2_values_values_grad"), - np.array([14.1, 14.2, 15.1, 15.2, 16.1, 16.2], dtype=np.float64) - ) diff --git a/caffe2/python/operator_test/filler_ops_test.py b/caffe2/python/operator_test/filler_ops_test.py deleted file mode 100644 index 442f5866cb09..000000000000 --- a/caffe2/python/operator_test/filler_ops_test.py +++ /dev/null @@ -1,271 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -def _fill_diagonal(shape, value): - result = np.zeros(shape) - np.fill_diagonal(result, value) - return (result,) - - -class TestFillerOperator(serial.SerializedTestCase): - - @given(**hu.gcs) - @settings(deadline=10000) - def test_shape_error(self, gc, dc): - op = core.CreateOperator( - 'GaussianFill', - [], - 'out', - shape=32, # illegal parameter - mean=0.0, - std=1.0, - ) - exception = False - try: - workspace.RunOperatorOnce(op) - except Exception: - exception = True - self.assertTrue(exception, "Did not throw exception on illegal shape") - - op = core.CreateOperator( - 'ConstantFill', - [], - 'out', - shape=[], # scalar - value=2.0, - ) - exception = False - self.assertTrue(workspace.RunOperatorOnce(op)) - self.assertEqual(workspace.FetchBlob('out'), [2.0]) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_int64_shape(self, gc, dc): - large_dim = 2 ** 31 + 1 - net = core.Net("test_shape_net") - net.UniformFill( - [], - 'out', - shape=[0, large_dim], - min=0.0, - max=1.0, - ) - self.assertTrue(workspace.CreateNet(net)) - self.assertTrue(workspace.RunNet(net.Name())) - self.assertEqual(workspace.blobs['out'].shape, (0, large_dim)) - - @given( - shape=hu.dims().flatmap( - lambda dims: hu.arrays( - [dims], dtype=np.int64, - elements=st.integers(min_value=0, max_value=20) - ) - ), - a=st.integers(min_value=0, max_value=100), - b=st.integers(min_value=0, max_value=100), - **hu.gcs - ) - @settings(deadline=10000) - def test_uniform_int_fill_op_blob_input(self, shape, a, b, gc, dc): - net = core.Net('test_net') - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - shape_blob = net.Const(shape, dtype=np.int64) - a_blob = net.Const(a, dtype=np.int32) - b_blob = net.Const(b, dtype=np.int32) - uniform_fill = net.UniformIntFill([shape_blob, a_blob, b_blob], - 1, input_as_shape=1) - - workspace.RunNetOnce(net) - - blob_out = workspace.FetchBlob(uniform_fill) - if b < a: - new_shape = shape[:] - new_shape[0] = 0 - np.testing.assert_array_equal(new_shape, blob_out.shape) - else: - np.testing.assert_array_equal(shape, blob_out.shape) - self.assertTrue((blob_out >= a).all()) - self.assertTrue((blob_out <= b).all()) - - @given( - **hu.gcs - ) - def test_uniform_fill_using_arg(self, gc, dc): - net = core.Net('test_net') - shape = [2**3, 5] - # uncomment this to test filling large blob - # shape = [2**30, 5] - min_v = -100 - max_v = 100 - output_blob = net.UniformIntFill( - [], - ['output_blob'], - shape=shape, - min=min_v, - 
max=max_v, - ) - - workspace.RunNetOnce(net) - output_data = workspace.FetchBlob(output_blob) - - np.testing.assert_array_equal(shape, output_data.shape) - min_data = np.min(output_data) - max_data = np.max(output_data) - - self.assertGreaterEqual(min_data, min_v) - self.assertLessEqual(max_data, max_v) - - self.assertNotEqual(min_data, max_data) - - @serial.given( - shape=st.sampled_from( - [ - [3, 3], - [5, 5, 5], - [7, 7, 7, 7], - ] - ), - **hu.gcs - ) - def test_diagonal_fill_op_float(self, shape, gc, dc): - value = 2.5 - op = core.CreateOperator( - 'DiagonalFill', - [], - 'out', - shape=shape, - value=value, - ) - - for device_option in dc: - op.device_option.CopyFrom(device_option) - # Check against numpy reference - self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - - @given(**hu.gcs) - def test_diagonal_fill_op_int(self, gc, dc): - value = 2 - shape = [3, 3] - op = core.CreateOperator( - 'DiagonalFill', - [], - 'out', - shape=shape, - dtype=core.DataType.INT32, - value=value, - ) - - # Check against numpy reference - self.assertReferenceChecks(gc, op, [shape, value], _fill_diagonal) - - @serial.given(lengths=st.lists(st.integers(min_value=0, max_value=10), - min_size=0, - max_size=10), - **hu.gcs) - def test_lengths_range_fill(self, lengths, gc, dc): - op = core.CreateOperator( - "LengthsRangeFill", - ["lengths"], - ["increasing_seq"]) - - def _len_range_fill(lengths): - sids = [] - for l in lengths: - sids.extend(list(range(l))) - return (np.array(sids, dtype=np.int32), ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[np.array(lengths, dtype=np.int32)], - reference=_len_range_fill) - - @given(**hu.gcs) - def test_gaussian_fill_op(self, gc, dc): - op = core.CreateOperator( - 'GaussianFill', - [], - 'out', - shape=[17, 3, 3], # sample odd dimensions - mean=0.0, - std=1.0, - ) - - for device_option in dc: - op.device_option.CopyFrom(device_option) - assert workspace.RunOperatorOnce(op), "GaussianFill op did not run successfully" - - blob_out = workspace.FetchBlob('out') - assert np.count_nonzero(blob_out) > 0, "All generated elements are zeros. Is the random generator functioning correctly?" - - @given(**hu.gcs) - def test_msra_fill_op(self, gc, dc): - op = core.CreateOperator( - 'MSRAFill', - [], - 'out', - shape=[15, 5, 3], # sample odd dimensions - ) - for device_option in dc: - op.device_option.CopyFrom(device_option) - assert workspace.RunOperatorOnce(op), "MSRAFill op did not run successfully" - - blob_out = workspace.FetchBlob('out') - assert np.count_nonzero(blob_out) > 0, "All generated elements are zeros. Is the random generator functioning correctly?"
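The _len_range_fill reference above is compact enough to state standalone; a sketch of the LengthsRangeFill contract:

import numpy as np

# LengthsRangeFill emits range(l) for every entry l of the lengths input,
# concatenated: [2, 0, 3] -> [0, 1, 0, 1, 2].
def lengths_range_fill(lengths):
    out = []
    for l in lengths:
        out.extend(range(l))
    return np.array(out, dtype=np.int32)

assert lengths_range_fill([2, 0, 3]).tolist() == [0, 1, 0, 1, 2]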
- - @given( - min=st.integers(min_value=0, max_value=5), - range=st.integers(min_value=1, max_value=10), - emb_size=st.sampled_from((10000, 20000, 30000)), - dim_size=st.sampled_from((16, 32, 64)), - **hu.gcs) - @settings(deadline=None) - def test_fp16_uniformfill_op(self, min, range, emb_size, dim_size, gc, dc): - op = core.CreateOperator( - 'Float16UniformFill', - [], - 'out', - shape=[emb_size, dim_size], - min=float(min), - max=float(min + range), - ) - for device_option in dc: - op.device_option.CopyFrom(device_option) - assert workspace.RunOperatorOnce(op), "Float16UniformFill op did not run successfully" - - self.assertEqual(workspace.blobs['out'].shape, (emb_size, dim_size)) - - blob_out = workspace.FetchBlob('out') - - expected_type = "float16" - expected_mean = min + range / 2.0 - expected_var = range * range / 12.0 - expected_min = min - expected_max = min + range - - self.assertEqual(blob_out.dtype.name, expected_type) - self.assertAlmostEqual(np.mean(blob_out, dtype=np.float32), expected_mean, delta=0.1) - self.assertAlmostEqual(np.var(blob_out, dtype=np.float32), expected_var, delta=0.1) - self.assertGreaterEqual(np.min(blob_out), expected_min) - self.assertLessEqual(np.max(blob_out), expected_max) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/find_op_test.py b/caffe2/python/operator_test/find_op_test.py deleted file mode 100644 index fc25913d8744..000000000000 --- a/caffe2/python/operator_test/find_op_test.py +++ /dev/null @@ -1,51 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np - - -class TestFindOperator(serial.SerializedTestCase): - - @given(n=st.sampled_from([1, 4, 8, 31, 79, 150]), - idxsize=st.sampled_from([2, 4, 8, 1000, 5000]), - **hu.gcs) - @settings(deadline=10000) - def test_find(self, n, idxsize, gc, dc): - maxval = 10 - - def findop(idx, X): - res = [] - for j in list(X.flatten()): - i = np.where(idx == j)[0] - if len(i) == 0: - res.append(-1) - else: - res.append(i[-1]) - - print("Idx: {} X: {}".format(idx, X)) - print("Res: {}".format(res)) - return [np.array(res).astype(np.int32)] - - X = (np.random.rand(n) * maxval).astype(np.int32) - idx = (np.random.rand(idxsize) * maxval).astype(np.int32) - - op = core.CreateOperator( - "Find", - ["idx", "X"], - ["y"], - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[idx, X], - reference=findop, - ) diff --git a/caffe2/python/operator_test/flatten_op_test.py b/caffe2/python/operator_test/flatten_op_test.py deleted file mode 100644 index 2e0340c68779..000000000000 --- a/caffe2/python/operator_test/flatten_op_test.py +++ /dev/null @@ -1,38 +0,0 @@ - - - - - -from hypothesis import given -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - - -class TestFlatten(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=2, max_dim=4), - **hu.gcs) - def test_flatten(self, X, gc, dc): - for axis in range(X.ndim + 1): - op = core.CreateOperator( - "Flatten", - ["X"], - ["Y"], - axis=axis) - - def flatten_ref(X): - shape = X.shape - outer = np.prod(shape[:axis]).astype(int) - inner = np.prod(shape[axis:]).astype(int) - return np.copy(X).reshape(outer, inner), - - self.assertReferenceChecks(gc, op, [X], flatten_ref) - - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], 
[0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/flexible_top_k_test.py b/caffe2/python/operator_test/flexible_top_k_test.py deleted file mode 100644 index 0cccabb5f2e9..000000000000 --- a/caffe2/python/operator_test/flexible_top_k_test.py +++ /dev/null @@ -1,77 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from collections import OrderedDict -from hypothesis import given, settings -import numpy as np - - -class TestFlexibleTopK(serial.SerializedTestCase): - def flexible_top_k_ref(self, X, k): - X_flat = X.reshape((-1, X.shape[-1])) - indices_ref = np.ndarray(shape=sum(k), dtype=np.int32) - values_ref = np.ndarray(shape=sum(k), dtype=np.float32) - offset = 0 - for i in range(X_flat.shape[0]): - od = OrderedDict() - for j in range(X_flat.shape[1]): - val = X_flat[i, j] - if val not in od: - od[val] = [] - od[val].append(j) - k_ = 0 - for val, idxs in sorted(od.items(), reverse=True): - for idx in idxs: - indices_ref[offset + k_] = idx - values_ref[offset + k_] = val - k_ += 1 - if k_ >= k[i]: - break - if k_ >= k[i]: - break - offset += k[i] - - return (values_ref, indices_ref) - - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_flexible_top_k(self, X, gc, dc): - X = X.astype(dtype=np.float32) - k_shape = (int(X.size / X.shape[-1]), ) - k = np.random.randint(1, high=X.shape[-1] + 1, size=k_shape) - - output_list = ["Values", "Indices"] - op = core.CreateOperator("FlexibleTopK", ["X", "k"], output_list, - device_option=gc) - - def bind_ref(X_loc, k): - ret = self.flexible_top_k_ref(X_loc, k) - return ret - - self.assertReferenceChecks(gc, op, [X, k], bind_ref) - - @given(X=hu.tensor(min_dim=2), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_flexible_top_k_grad(self, X, gc, dc): - X = X.astype(np.float32) - k_shape = (int(X.size / X.shape[-1]), ) - k = np.random.randint(1, high=X.shape[-1] + 1, size=k_shape) - - # this try to make sure adding stepsize (0.05) - # will not change TopK selections at all - # since dims max_value = 5 as defined in - # caffe2/caffe2/python/hypothesis_test_util.py - for i in range(X.shape[-1]): - X[..., i] = i * 1.0 / X.shape[-1] - - op = core.CreateOperator( - "FlexibleTopK", ["X", "k"], ["Values", "Indices"], device_option=gc - ) - - self.assertGradientChecks(gc, op, [X, k], 0, [0]) diff --git a/caffe2/python/operator_test/floor_op_test.py b/caffe2/python/operator_test/floor_op_test.py deleted file mode 100644 index 8c0974bb8579..000000000000 --- a/caffe2/python/operator_test/floor_op_test.py +++ /dev/null @@ -1,39 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestFloor(serial.SerializedTestCase): - - @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_floor(self, X, gc, dc, engine): - op = core.CreateOperator("Floor", ["X"], ["Y"], engine=engine) - - def floor_ref(X): - return (np.floor(X),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=floor_ref) - - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - unittest.main() diff 
--git a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py b/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py deleted file mode 100644 index d2e794da0651..000000000000 --- a/caffe2/python/operator_test/fused_nbit_rowwise_conversion_ops_test.py +++ /dev/null @@ -1,354 +0,0 @@ - - -import math -import struct - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from caffe2.python.operator_test.fused_nbit_rowwise_test_helper import ( - _compress_uniform_simplified, - param_search_greedy, -) -from hypothesis import assume, given, settings - - -# Eigen/Python round 0.5 away from 0, Numpy rounds to even -round_to_nearest = np.vectorize(round) - - -def bytes_to_half_floats(byte_matrix): - floats = np.empty([np.shape(byte_matrix)[0], 1], dtype=np.float16) - for i, byte_values in enumerate(byte_matrix): - (floats[i],) = np.frombuffer( - memoryview(byte_values).tobytes(), dtype=np.float16 - ) - return floats - - -def half_floats_to_bytes(floats): - byte_matrix = np.empty([np.shape(floats)[0], 2], dtype=np.uint8) - for i, value in enumerate(floats): - assert isinstance(value, np.float16), (value, floats) - byte_matrix[i] = np.frombuffer( - memoryview(np.array([value])).tobytes(), dtype=np.uint8 - ) - return byte_matrix - - -def int8_to_bytes(int8s): - byte_matrix = np.empty([np.shape(int8s)[0], 1], dtype=np.uint8) - for i, value in enumerate(int8s): - assert isinstance(value, np.int8), (value, int8s) - as_bytes = struct.pack("b", value) - # In Python3 bytes will be a list of int, in Python2 a list of string - if isinstance(as_bytes[0], int): - byte_matrix[i] = list(as_bytes) - else: - byte_matrix[i] = [ord(i) for i in as_bytes] - return byte_matrix - - -def fused_rowwise_nbit_quantize_reference(data, bit): - minimum = np.min(data, axis=1).astype(np.float16).astype(np.float32) - maximum = np.max(data, axis=1) - span = maximum - minimum - qmax = (1 << bit) - 1 - scale = (span / qmax).astype(np.float16).astype(np.float32) - bias = np.zeros(data.shape[0]) - quantized_data = np.zeros(data.shape).astype(np.uint8) - - for i in range(data.shape[0]): - bias[i] = minimum[i] - inverse_scale = 1.0 if scale[i] == 0.0 else 1 / scale[i] - if scale[i] == 0.0 or math.isinf(inverse_scale): - scale[i] = 1.0 - inverse_scale = 1.0 - quantized_data[i] = np.clip( - np.round((data[i, :] - minimum[i]) * inverse_scale), 0, qmax - ) - - # pack - assert 8 % bit == 0 - num_elem_per_byte = 8 // bit - packed_dim = (data.shape[1] + num_elem_per_byte - 1) // num_elem_per_byte - packed_data = np.zeros([data.shape[0], packed_dim]).astype(np.uint8) - for i in range(data.shape[0]): - for j in range(data.shape[1]): - if j % num_elem_per_byte == 0: - packed_data[i, j // num_elem_per_byte] = quantized_data[i, j] - else: - packed_data[i, j // num_elem_per_byte] += quantized_data[i, j] << ( - (j % num_elem_per_byte) * bit - ) - - scale_bytes = half_floats_to_bytes(scale.astype(np.float16)) - bias_bytes = half_floats_to_bytes(bias.astype(np.float16)) - return np.concatenate([packed_data, scale_bytes, bias_bytes], axis=1) - - -def fused_rowwise_nbit_quantize_dequantize_reference(data, bit): - fused_quantized = fused_rowwise_nbit_quantize_reference(data, bit) - scale = bytes_to_half_floats(fused_quantized[:, -4:-2].astype(np.uint8)).astype( - np.float32 - ) - bias = bytes_to_half_floats(fused_quantized[:, -2:].astype(np.uint8)).astype( - np.float32 - ) - quantized_data = fused_quantized[:, :-4] - - # unpack - 
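The nibble-packing arithmetic in fused_rowwise_nbit_quantize_reference, and its inverse in the dequantize reference that follows, is easiest to see in isolation. A self-contained sketch for bit == 4 with made-up codes:

bit = 4
per_byte = 8 // bit              # two 4-bit codes per byte
codes = [3, 15, 7, 0]            # already-quantized 4-bit values
packed = [0] * ((len(codes) + per_byte - 1) // per_byte)
for j, c in enumerate(codes):
    # low nibble first within each byte, as in the reference above
    packed[j // per_byte] |= c << ((j % per_byte) * bit)
assert packed == [0xF3, 0x07]

# Unpacking inverts the shifts and masks each code back out.
unpacked = [(packed[j // per_byte] >> ((j % per_byte) * bit)) & ((1 << bit) - 1)
            for j in range(len(codes))]
assert unpacked == codes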
packed_dim = fused_quantized.shape[1] - 4 - assert 8 % bit == 0 - num_elem_per_byte = 8 // bit - assert packed_dim == ((data.shape[1] + num_elem_per_byte - 1) // num_elem_per_byte) - unpacked_data = np.zeros(data.shape).astype(np.uint8) - for i in range(data.shape[0]): - for j in range(data.shape[1]): - unpacked_data[i, j] = ( - quantized_data[i, j // num_elem_per_byte] - >> ((j % num_elem_per_byte) * bit) - ) & ((1 << bit) - 1) - - return scale * unpacked_data + bias - - -class TestFusedNBitRowwiseQuantizationConversion(hu.HypothesisTestCase): - @given(input_data=hu.tensor(min_dim=2, max_dim=2), bit_rate=st.sampled_from([2, 4])) - def test_quantize_op(self, input_data, bit_rate): - assert 8 % bit_rate == 0 - num_elem_per_byte = 8 // bit_rate - assume(input_data.shape[1] % num_elem_per_byte == 0) - quantize = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - ["input_data"], - ["quantized_data"], - ) - workspace.FeedBlob("input_data", input_data) - workspace.RunOperatorOnce(quantize) - - quantized_data = workspace.FetchBlob("quantized_data") - - reference = fused_rowwise_nbit_quantize_reference( - input_data.astype(np.float32), bit_rate - ) - - interleaved_dim = input_data.shape[1] // num_elem_per_byte - # compare quantized data - np.testing.assert_array_equal( - quantized_data[:, :interleaved_dim], reference[:, :interleaved_dim] - ) - # compare scales - np.testing.assert_array_almost_equal( - bytes_to_half_floats( - quantized_data[:, interleaved_dim : interleaved_dim + 2] - ), - bytes_to_half_floats(reference[:, interleaved_dim : interleaved_dim + 2]), - ) - # compare zero points - np.testing.assert_array_equal( - quantized_data[:, interleaved_dim + 2], reference[:, interleaved_dim + 2] - ) - - @given( - batch_size=st.integers(1, 100), - block_size=st.integers(1, 100), - bit_rate=st.sampled_from([2, 4]), - ) - def test_quantize_and_dequantize_op(self, batch_size, block_size, bit_rate): - assert 8 % bit_rate == 0 - num_elem_per_byte = 8 // bit_rate - input_data = np.random.rand(batch_size, block_size).astype(np.float32) - assume(input_data.shape[1] % num_elem_per_byte == 0) - quantize = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - ["input_data"], - ["quantized_data"], - ) - workspace.FeedBlob("input_data", input_data) - workspace.RunOperatorOnce(quantize) - - quantized_data = workspace.FetchBlob("quantized_data") - - dequantize = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - ["quantized_data"], - ["dequantized_data"], - ) - workspace.FeedBlob("quantized_data", quantized_data) - workspace.RunOperatorOnce(dequantize) - - dequantized_data = workspace.FetchBlob("dequantized_data") - - reference = fused_rowwise_nbit_quantize_dequantize_reference( - input_data, bit_rate - ) - np.testing.assert_array_almost_equal(dequantized_data, reference) - - -def ErrorThresholdRow(X, bit_rate): - # minimum representable error in bit_rate per row - min_elem = np.min(X, axis=1) - max_elem = np.max(X, axis=1) - - bias = np.float16(min_elem) - scale = np.float16((max_elem - bias) / ((1 << bit_rate) - 1)) - - max_round_error = scale / 2 - max_clip_error = np.maximum( - np.abs(min_elem - bias), np.abs(scale * ((1 << bit_rate) - 1) + bias - max_elem) - ) - thres = np.maximum(max_round_error, max_clip_error) * 1.1 - return thres - - -class TestNBitFakeFused(hu.HypothesisTestCase): - @given(bit_rate=st.sampled_from([2, 4])) - @settings(deadline=10000) - def testNBit(self, bit_rate): - # uncomment for debugging - # 
np.random.seed(0) - net = core.Net("bench") - batchsize = np.random.randint(2, 1000) - blocksize = np.random.randint(2, 1000) - input_data = np.random.rand(batchsize, blocksize).astype(np.float32) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitFakeRowwiseQuantized", - "input_data", - "minmax_quantized_data", - ) - net.Proto().op.extend([op]) - net.Fused8BitRowwiseQuantizedToFloat( - "minmax_quantized_data", "minmax_dequantized_data" - ) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitFakeRowwiseQuantized", - "input_data", - "greedy_quantized_data", - engine="GREEDY", - ) - net.Proto().op.extend([op]) - net.Fused8BitRowwiseQuantizedToFloat( - "greedy_quantized_data", "greedy_dequantized_data" - ) - workspace.FeedBlob("input_data", input_data) - - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.RunNetOnce(net) - - minmax_dequantized_data = workspace.FetchBlob("minmax_dequantized_data") - greedy_dequantized_data = workspace.FetchBlob("greedy_dequantized_data") - - err_thres = ErrorThresholdRow(input_data, bit_rate) - diff_minmax = np.abs(input_data - minmax_dequantized_data) - diff_greedy = np.abs(input_data - greedy_dequantized_data) - for i in range(err_thres.size): - # Check error from minmax quantization is within the bound derived from the range - assert ( - np.sum(diff_minmax[i, :] > err_thres[i]) == 0 - ), "error at row {} too high (diff_minmax[i, :] {} diff_minmax[i, :] > err_thres[i] {} err_thres[i] {}".format( - i, diff_minmax[i, :], diff_minmax[i, :] > err_thres[i], err_thres[i] - ) - - # Check that the error from greedy quantization is no larger than from minmax quantization. - # Multiply by a margin 1.03 to account for the inexactness of - # floating-point operations and of binning (in exact math, - # l2_greedy should be no greater than l2_minmax). 
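# (param_search_greedy starts from the full [xmin, xmax] range and only moves
# an endpoint when the loss improves, so in exact arithmetic the greedy error
# can only match or beat the minmax error.)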
- l2_minmax_err = np.linalg.norm(diff_minmax[i, :]) - l2_greedy_err = np.linalg.norm(diff_greedy[i, :]) - assert ( - l2_greedy_err <= l2_minmax_err * 1.03 - ), "L2 quantization error using greedy algorithm {} at row {} is bigger than error using minmax {} (input_data[i,:] {} minmax_dequantized_data[i,:] {} greedy_dequantized_data[i,:] {}".format( # noqa - l2_greedy_err, - i, - l2_minmax_err, - input_data[i, :], - minmax_dequantized_data[i, :], - greedy_dequantized_data[i, :], - ) - - -class TestNBitGreedyFused(hu.HypothesisTestCase): - @given(bit_rate=st.sampled_from([2, 4])) - @settings(deadline=None, max_examples=50) - def testNBit(self, bit_rate): - # uncomment for debugging - # np.random.seed(0) - net = core.Net("bench") - batchsize = np.random.randint(2, 1000) - assert 8 % bit_rate == 0 - num_elem_per_byte = 8 // bit_rate - blocksize = np.random.randint(2, 500) * num_elem_per_byte - input_data = np.random.rand(batchsize, blocksize).astype(np.float32) - - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "minmax_quantized_data", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - "minmax_quantized_data", - "minmax_dequantized_data", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "greedy_quantized_data", - engine="GREEDY", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - "greedy_quantized_data", - "greedy_dequantized_data", - ) - net.Proto().op.extend([op]) - workspace.FeedBlob("input_data", input_data) - - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.RunNetOnce(net) - - minmax_dequantized_data = workspace.FetchBlob("minmax_dequantized_data") - greedy_dequantized_data = workspace.FetchBlob("greedy_dequantized_data") - - diff_minmax = np.abs(input_data - minmax_dequantized_data) - l2_minmax = np.linalg.norm(input_data - minmax_dequantized_data, axis=1) - diff_greedy = np.abs(input_data - greedy_dequantized_data) - l2_greedy = np.linalg.norm(input_data - greedy_dequantized_data, axis=1) - - for i in range(input_data.shape[0]): - # Compare with Python reference greedy search implementation - xmin, xmax = param_search_greedy( - input_data[i, :], bit_rate, n_bins=200, ratio=0.16 - ) - X_q_ref, l2_greedy_ref = _compress_uniform_simplified( - input_data[i, :], bit_rate, xmin, xmax, fp16_scale_bias=True - ) - l2_discrepancy = np.abs(l2_greedy[i] - l2_greedy_ref) / input_data.shape[1] - # C++ implementation has a different accumulation order when - # computing the norm in compress_uniform_simplified_, so we shouldn't - # use too small a tolerance. - assert ( - l2_discrepancy < 1e-5 - ), "l2_discrepancy between C++ and Python greedy algorithm {} at row {} is too high (actual l2 err {} ref l2 err {} actual {} ref {})".format( # noqa - l2_discrepancy, - i, - l2_greedy[i], - l2_greedy_ref, - greedy_dequantized_data[i, :], - X_q_ref, - ) - - # Check that the error from greedy quantization is no larger than from minmax quantization. - # Multiply by a margin 1.03 to account for the inexactness of - # floating-point operations and of binning (in exact math, - # l2_greedy should be no greater than l2_minmax). 
- assert ( - l2_greedy[i] <= l2_minmax[i] * 1.03 - ), "L2 quantization error using greedy algorithm {} at row {} is bigger than error using minmax {}".format( - l2_greedy[i], i, l2_minmax[i] - ) diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test.cc b/caffe2/python/operator_test/fused_nbit_rowwise_test.cc deleted file mode 100644 index d34f43612e88..000000000000 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test.cc +++ /dev/null @@ -1,110 +0,0 @@ -#include "caffe2/core/logging.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/operator_schema.h" -#include "caffe2/utils/proto_utils.h" - -#include <gtest/gtest.h> - -namespace caffe2 { - -TEST(OperatorSchemaTest, TensorInferenceNbit) { - for (int bit_rate : {2, 4}) { - const OpSchema* schema = OpSchemaRegistry::Schema( - "FloatToFused" + std::to_string(bit_rate) + "BitRowwiseQuantized"); - EXPECT_TRUE(schema != nullptr); - - OperatorDef def = CreateOperatorDef( - "FloatToFused" + std::to_string(bit_rate) + "BitRowwiseQuantized", - "", - vector<string>{"in"}, - vector<string>{"out"}); - vector<TensorShape> in_shapes(1); - in_shapes[0].set_data_type(TensorProto::FLOAT); - in_shapes[0].add_dims(20000); - in_shapes[0].add_dims(64); - - vector<TensorShape> out = schema->InferTensor(def, in_shapes); - EXPECT_EQ(out.size(), 1); - EXPECT_EQ(out[0].data_type(), TensorProto::UINT8); - EXPECT_EQ(out[0].dims_size(), 2); - EXPECT_EQ(out[0].dims(0), 20000); - EXPECT_EQ(out[0].dims(1), 64 / (8 / bit_rate) + 4); - } -} - -TEST(OperatorSchemaTest, TensorInferenceNbitHalf) { - for (int bit_rate : {2, 4}) { - const OpSchema* schema = OpSchemaRegistry::Schema( - "HalfToFused" + std::to_string(bit_rate) + "BitRowwiseQuantized"); - EXPECT_TRUE(schema != nullptr); - - OperatorDef def = CreateOperatorDef( - "HalfToFused" + std::to_string(bit_rate) + "BitRowwiseQuantized", - "", - vector<string>{"in"}, - vector<string>{"out"}); - vector<TensorShape> in_shapes(1); - in_shapes[0].set_data_type(TensorProto::FLOAT16); - in_shapes[0].add_dims(20000); - in_shapes[0].add_dims(64); - - vector<TensorShape> out = schema->InferTensor(def, in_shapes); - EXPECT_EQ(out.size(), 1); - EXPECT_EQ(out[0].data_type(), TensorProto::UINT8); - EXPECT_EQ(out[0].dims_size(), 2); - EXPECT_EQ(out[0].dims(0), 20000); - EXPECT_EQ(out[0].dims(1), 64 / (8 / bit_rate) + 4); - } -} - -TEST(OperatorSchemaTest, TensorInferenceNbitBack) { - for (int bit_rate : {2, 4}) { - const OpSchema* schema = OpSchemaRegistry::Schema( - "Fused" + std::to_string(bit_rate) + "BitRowwiseQuantizedToFloat"); - EXPECT_TRUE(schema != nullptr); - - OperatorDef def = CreateOperatorDef( - "Fused" + std::to_string(bit_rate) + "BitRowwiseQuantizedToFloat", - "", - vector<string>{"in"}, - vector<string>{"out"}); - vector<TensorShape> in_shapes(1); - in_shapes[0].set_data_type(TensorProto::UINT8); - in_shapes[0].add_dims(20000); - in_shapes[0].add_dims(36); - - vector<TensorShape> out = schema->InferTensor(def, in_shapes); - EXPECT_EQ(out.size(), 1); - EXPECT_EQ(out[0].data_type(), TensorProto::FLOAT); - EXPECT_EQ(out[0].dims_size(), 2); - EXPECT_EQ(out[0].dims(0), 20000); - EXPECT_EQ(out[0].dims(1), (36 - 4) * (8 / bit_rate)); - } -} - -TEST(OperatorSchemaTest, TensorInferenceNbitHalfBack) { - for (int bit_rate : {2, 4}) { - const OpSchema* schema = OpSchemaRegistry::Schema( - "Fused" + std::to_string(bit_rate) + "BitRowwiseQuantizedToHalf"); - EXPECT_TRUE(schema != nullptr); - - OperatorDef def = CreateOperatorDef( - "Fused" + std::to_string(bit_rate) + "BitRowwiseQuantizedToHalf", - "", - vector<string>{"in"}, - vector<string>{"out"}); - vector<TensorShape> in_shapes(1); - in_shapes[0].set_data_type(TensorProto::UINT8); - in_shapes[0].add_dims(20000); - 
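// (A 36-byte packed row is 32 payload bytes plus a 2-byte fp16 scale and a
// 2-byte fp16 bias, so the dequantized width below should come out to
// (36 - 4) * (8 / bit_rate) columns.)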
in_shapes[0].add_dims(36); - - vector<TensorShape> out = schema->InferTensor(def, in_shapes); - EXPECT_EQ(out.size(), 1); - EXPECT_EQ(out[0].data_type(), TensorProto::FLOAT16); - EXPECT_EQ(out[0].dims_size(), 2); - EXPECT_EQ(out[0].dims(0), 20000); - EXPECT_EQ(out[0].dims(1), (36 - 4) * (8 / bit_rate)); - } -} - -} // namespace caffe2 diff --git a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py b/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py deleted file mode 100644 index e9af40a128a6..000000000000 --- a/caffe2/python/operator_test/fused_nbit_rowwise_test_helper.py +++ /dev/null @@ -1,73 +0,0 @@ - - -import numpy as np - - -# Note we explicitly cast variables to np.float32 in a couple of places to avoid -# the default casting in Python often resulting in double precision and to make -# sure we're doing the same numerics as C++ code. -def param_search_greedy(x, bit_rate, n_bins=200, ratio=0.16): - xmin, xmax = np.min(x), np.max(x) - stepsize = (xmax - xmin) / np.float32(n_bins) - min_bins = np.float32(n_bins) * (np.float32(1) - np.float32(ratio)) - xq, loss = _compress_uniform_simplified(x, bit_rate, xmin, xmax) - - solutions = [] # [(left, right, loss)] # local optima solutions - - cur_min, cur_max, cur_loss = xmin, xmax, loss - thr = min_bins * stepsize - while cur_min + thr < cur_max: - # move left - xq, loss1 = _compress_uniform_simplified( - x, bit_rate, cur_min + stepsize, cur_max - ) - # move right - xq, loss2 = _compress_uniform_simplified( - x, bit_rate, cur_min, cur_max - stepsize - ) - - if cur_loss < loss1 and cur_loss < loss2: - # found a local optimum - solutions.append((cur_min, cur_max, cur_loss)) - if loss1 < loss2: - cur_min, cur_max, cur_loss = cur_min + stepsize, cur_max, loss1 - else: - cur_min, cur_max, cur_loss = cur_min, cur_max - stepsize, loss2 - if len(solutions): - best = solutions[0] - for solution in solutions: - if solution[-1] < best[-1]: - best = solution - return best[0], best[1] - return xmin, xmax - - -def _compress_uniform_simplified(X, bit_rate, xmin, xmax, fp16_scale_bias=True): - # affine transform to put Xq in [0, 2**bit_rate - 1] - # Xq = (2 ** bit_rate - 1) * (X - xmin) / data_range - if fp16_scale_bias: - xmin = xmin.astype(np.float16).astype(np.float32) - data_range = xmax - xmin - scale = np.where( - data_range == 0, np.float32(1), data_range / np.float32(2 ** bit_rate - 1) - ) - if fp16_scale_bias: - scale = scale.astype(np.float16).astype(np.float32) - inverse_scale = np.float32(1) / scale - Xq = np.clip(np.round((X - xmin) * inverse_scale), 0, np.float32(2 ** bit_rate - 1)) - Xq = Xq * scale + xmin - - # Manually compute the loss instead of using np.linalg.norm to use the same - # accumulation order used by the C++ code - vlen = 8 - loss_v = np.zeros(vlen).astype(np.float32) - for i in range(len(Xq) // vlen * vlen): - loss_v[i % vlen] += (X[i] - Xq[i]) * (X[i] - Xq[i]) - loss = np.float32(0) - for i in range(vlen): - loss += loss_v[i] - for i in range(len(Xq) // vlen * vlen, len(Xq)): - loss += (X[i] - Xq[i]) * (X[i] - Xq[i]) - loss = np.sqrt(loss) - - return Xq, loss diff --git a/caffe2/python/operator_test/gather_ops_test.py b/caffe2/python/operator_test/gather_ops_test.py deleted file mode 100644 index b0d64506e4c7..000000000000 --- a/caffe2/python/operator_test/gather_ops_test.py +++ /dev/null @@ -1,242 +0,0 @@ - - - - -import numpy as np - -from caffe2.python import core, workspace -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as 
serial -import hypothesis.strategies as st -import hypothesis.extra.numpy as hnp - -# Basic implementation of gather for axis == 0, which is a lookup of indices -# in the outer dimension. Keeping it for reference here, although it is similar -# to the more general function below. -def ref_gather_axis0(): - def inner(data, ind): - if ind.size == 0 or data.shape[0] == 0: - return [np.zeros((0, 10, 20)).astype(np.float32)] - output = [data[i] for i in ind] - return [output] - return inner - -# Returns axis-based lookup. We just use numpy take() which handles different -# axis values as we want. -def ref_gather(axis): - def inner(data, ind): - if ind.size == 0 or data.shape[axis] == 0: - shape = list(data.shape) - shape[0] = 0 - return [np.zeros(tuple(shape)).astype(np.float32)] - # np.take() does axis lookup the same as gather - output = data.take(ind, axis).astype(np.float32) - return [output] - return inner - -# Gather(..., match_outer==True) -def ref_gather_match_outer(axis=1): - def inner(data, ind): - if ind.size == 0 or data.shape[axis] == 0: - shape = list(data.shape) - shape[0] = 0 - return [np.zeros(tuple(shape)).astype(np.float32)] - input_shape = list(data.shape) - output_shape = input_shape[:axis] + list(ind.shape[axis:]) + input_shape[axis + 1:] - output = np.zeros(tuple(output_shape)).astype(np.float32) - if axis == 1: - for i in range(data.shape[0]): - output[i] = data[i, ind[i], ] - elif axis == 2: - for i in range(data.shape[0]): - for j in range(data.shape[1]): - output[i, j] = data[i, j, ind[i, j], ] - else: - raise NotImplementedError - return [output] - return inner - -class TestGatherOps(serial.SerializedTestCase): - @given(rows_num=st.integers(0, 10000), - index_num=st.integers(0, 5000), - **hu.gcs) - @settings(deadline=10000) - def test_gather_ops(self, rows_num, index_num, gc, dc): - data = np.random.random((rows_num, 10, 20)).astype(np.float32) - - if rows_num > 0: - ind = np.random.randint(rows_num, size=(index_num, )).astype('int32') - else: - ind = np.random.randint(10, size=(index_num, )).astype('int32') - op = core.CreateOperator( - 'Gather', - ['data', 'ind'], - ['output']) - - self.assertReferenceChecks(gc, op, [data, ind], ref_gather_axis0()) - self.assertDeviceChecks(dc, op, [data, ind], [0]) - return - - # Test axis == 2, this keeps the outer dimension but replaces data - # within the axis by a lookup of the index array (repeated for each outer entry) - @given(batch_num=st.integers(1, 4000), - rows_num=st.integers(1, 6), - index_num=st.integers(1, 20), - **hu.gcs) - def test_gather_ops_axis2(self, batch_num, rows_num, index_num, gc, dc): - data = np.random.random((batch_num, rows_num, 5)).astype(np.float32) - ind = np.random.randint(5, size=(index_num, )).astype('int32') - op = core.CreateOperator( - 'Gather', - ['data', 'ind'], - ['output'], - axis=2) - - self.assertReferenceChecks(gc, op, [data, ind], ref_gather(axis=2)) - self.assertDeviceChecks(dc, op, [data, ind], [0]) - return - - # Test match_outer == true, the indices have the same outer dimensions as data - @given(batch_num=st.integers(1, 40), - rows_num=st.integers(1, 6), - index_num=st.integers(1, 20), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_gather_ops_match_outer(self, batch_num, rows_num, index_num, gc, dc): - data = np.random.random((batch_num, rows_num, 5)).astype(np.float32) - ind = np.random.randint(rows_num, size=(batch_num, index_num)).astype('int32') - op = core.CreateOperator( - 'Gather', - ['data', 'ind'], - ['output'], - axis=1, - match_outer=True) - - self.assertReferenceChecks(gc, op, 
[data, ind], ref_gather_match_outer()) - self.assertDeviceChecks(dc, op, [data, ind], [0]) - self.assertGradientChecks(gc, op, [data, ind], 0, [0]) - return - - # Test BatchGather with match_outer == true, the indices have the same outer dimensions as data - # Note BatchGather is equivalent to Gather(..., axis=1) - @given(batch_num=st.integers(1, 40), - rows_num=st.integers(1, 6), - index_num=st.integers(1, 20), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_batch_gather_op_match_outer(self, batch_num, rows_num, index_num, gc, dc): - data = np.random.random((batch_num, rows_num, 5)).astype(np.float32) - ind = np.random.randint(rows_num, size=(batch_num, index_num)).astype('int32') - op = core.CreateOperator( - 'BatchGather', - ['data', 'ind'], - ['output'], - match_outer=True) - - self.assertReferenceChecks(gc, op, [data, ind], ref_gather_match_outer()) - self.assertDeviceChecks(dc, op, [data, ind], [0]) - self.assertGradientChecks(gc, op, [data, ind], 0, [0]) - return - - # When the data is larger, this test sometimes passed and sometimes failed; - # test log here: https://fb.quip.com/SeiyAVWQXvsN (second run failed). - # After some digging, this turned out to be numerical error: - # the failed run has max|grad - estimated_grad| = 0.009, - # so we changed the gradient checking threshold to 0.02 for this test to pass. - @given(batch_num=st.integers(1, 30), - rows_num=st.integers(1, 6), - index_num=st.integers(1, 10), - index_num2=st.integers(1, 10), - axis2_num=st.integers(1, 10), - **hu.gcs_cpu_only) - @settings(deadline=None, max_examples=50) - def test_gather_op_match_outer_axis2_data4D_ind4D( - self, batch_num, rows_num, axis2_num, index_num, index_num2, gc, dc - ): - data = np.random.random((batch_num, rows_num, axis2_num, 5)).astype(np.float32) - ind = np.random.randint(axis2_num, size=(batch_num, rows_num, index_num, index_num2)).astype('int32') - op = core.CreateOperator( - 'Gather', - ['data', 'ind'], - ['output'], - axis=2, - match_outer=True) - - self.assertReferenceChecks(gc, op, [data, ind], ref_gather_match_outer(axis=2)) - self.assertDeviceChecks(dc, op, [data, ind], [0]) - self.assertGradientChecks(gc, op, [data, ind], 0, [0], threshold=0.02) - return - - -# Generates data arrays of max dims 10x100x2 and an indexing array up to rows_num -@st.composite -def _inputs(draw): - batch_size = draw(st.integers(2, 10)) - rows_num = draw(st.integers(1, 100)) - block_size = draw(st.integers(1, 2)) - index_num = draw(st.integers(1, 10)) - return ( - draw(hnp.arrays( - np.float32, - (batch_size, rows_num, block_size), - elements=hu.floats(-10.0, 10.0), - )), - draw(hnp.arrays( - np.int32, - (index_num, 1), - elements=st.integers(0, rows_num - 1), - )), - ) - -class TestBatchGatherOps(hu.HypothesisTestCase): - @given(inputs=_inputs(), - **hu.gcs) - @settings(deadline=10000) - def test_batch_gather_ops(self, inputs, gc, dc): - data, ind = inputs - op = core.CreateOperator( - 'BatchGather', - ['data', 'ind'], - ['output']) - self.assertReferenceChecks(gc, op, [data, ind], ref_gather(axis=1)) - self.assertGradientChecks(gc, op, [data, ind], 0, [0]) - - -class TestGatherFused8BitRowwise(hu.HypothesisTestCase): - @given(rows_num=st.integers(1, 10000), - cols_num=st.integers(1, 128), - index_num=st.integers(0, 5000), - **hu.gcs) - @settings(deadline=10000) - def test_batch_gather_ops(self, rows_num, cols_num, index_num, gc, dc): - data = np.random.random((rows_num, cols_num)).astype(np.float32) - ind = np.random.randint(rows_num, size=(index_num, )).astype('int32') - - net = 
core.Net("bench") - - quantized_data = net.FloatToFused8BitRowwiseQuantized( - 'data', 'quantized_data') - dequantized_data = net.Fused8BitRowwiseQuantizedToFloat( - quantized_data, 'dequantized_data') - - net.Gather( - [dequantized_data, 'ind'], 'gather_reference') - net.GatherFused8BitRowwise( - [quantized_data, 'ind'], 'gather_quantized') - - workspace.FeedBlob('data', data) - workspace.FeedBlob('ind', ind) - workspace.CreateNet(net) - workspace.RunNetOnce(net) - - gather_reference = workspace.FetchBlob('gather_reference') - gather_quantized = workspace.FetchBlob('gather_quantized') - np.testing.assert_array_almost_equal(gather_reference, gather_quantized) - - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/gather_ranges_op_test.py b/caffe2/python/operator_test/gather_ranges_op_test.py deleted file mode 100644 index b6ec8823f4dd..000000000000 --- a/caffe2/python/operator_test/gather_ranges_op_test.py +++ /dev/null @@ -1,277 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given, settings, strategies as st - - -def batched_boarders_and_data( - data_min_size=5, - data_max_size=10, - examples_min_number=1, - examples_max_number=4, - example_min_size=1, - example_max_size=3, - dtype=np.float32, - elements=None, -): - dims_ = st.tuples( - st.integers(min_value=data_min_size, max_value=data_max_size), - st.integers(min_value=examples_min_number, max_value=examples_max_number), - st.integers(min_value=example_min_size, max_value=example_max_size), - ) - return dims_.flatmap( - lambda dims: st.tuples( - hu.arrays( - [dims[1], dims[2], 2], - dtype=np.int32, - elements=st.integers(min_value=0, max_value=dims[0]), - ), - hu.arrays([dims[0]], dtype, elements), - ) - ) - - -@st.composite -def _tensor_splits(draw): - lengths = draw(st.lists(st.integers(1, 5), min_size=1, max_size=10)) - batch_size = draw(st.integers(1, 5)) - element_pairs = [ - (batch, r) for batch in range(batch_size) for r in range(len(lengths)) - ] - perm = draw(st.permutations(element_pairs)) - perm = perm[:-1] # skip one range - ranges = [[(0, 0)] * len(lengths) for _ in range(batch_size)] - offset = 0 - for pair in perm: - ranges[pair[0]][pair[1]] = (offset, lengths[pair[1]]) - offset += lengths[pair[1]] - - data = draw( - st.lists( - st.floats(min_value=-1.0, max_value=1.0), min_size=offset, max_size=offset - ) - ) - - key = draw(st.permutations(range(offset))) - - return ( - np.array(data).astype(np.float32), - np.array(ranges), - np.array(lengths), - np.array(key).astype(np.int64), - ) - - -@st.composite -def _bad_tensor_splits(draw): - lengths = draw(st.lists(st.integers(4, 6), min_size=4, max_size=4)) - batch_size = 4 - element_pairs = [ - (batch, r) for batch in range(batch_size) for r in range(len(lengths)) - ] - perm = draw(st.permutations(element_pairs)) - ranges = [[(0, 0)] * len(lengths) for _ in range(batch_size)] - offset = 0 - - # Inject some bad samples depending on the batch. - # Batch 2: length is set to 0. This way, 25% of the samples are empty. - # Batch 0-1: length is set to half the original length. This way, 50% of the - # samples are of mismatched length. 
- for pair in perm: - if pair[0] == 2: - length = 0 - elif pair[0] <= 1: - length = lengths[pair[1]] // 2 - else: - length = lengths[pair[1]] - ranges[pair[0]][pair[1]] = (offset, length) - offset += length - - data = draw( - st.lists( - st.floats(min_value=-1.0, max_value=1.0), min_size=offset, max_size=offset - ) - ) - - key = draw(st.permutations(range(offset))) - - return ( - np.array(data).astype(np.float32), - np.array(ranges), - np.array(lengths), - np.array(key).astype(np.int64), - ) - - -def gather_ranges(data, ranges): - lengths = [] - output = [] - for example_ranges in ranges: - length = 0 - for range in example_ranges: - assert len(range) == 2 - output.extend(data[range[0] : range[0] + range[1]]) - length += range[1] - lengths.append(length) - return output, lengths - - -def gather_ranges_to_dense(data, ranges, lengths): - outputs = [] - assert len(ranges) - batch_size = len(ranges) - assert len(ranges[0]) - num_ranges = len(ranges[0]) - assert ranges.shape[2] == 2 - for i in range(num_ranges): - out = [] - for j in range(batch_size): - start, length = ranges[j][i] - if not length: - out.append([0] * lengths[i]) - else: - assert length == lengths[i] - out.append(data[start : start + length]) - outputs.append(np.array(out)) - return outputs - - -def gather_ranges_to_dense_with_key(data, ranges, key, lengths): - outputs = [] - assert len(ranges) - batch_size = len(ranges) - assert len(ranges[0]) - num_ranges = len(ranges[0]) - assert ranges.shape[2] == 2 - for i in range(num_ranges): - out = [] - for j in range(batch_size): - start, length = ranges[j][i] - if not length: - out.append([0] * lengths[i]) - else: - assert length == lengths[i] - key_data_list = zip( - key[start : start + length], data[start : start + length] - ) - sorted_key_data_list = sorted(key_data_list, key=lambda x: x[0]) - sorted_data = [d for (k, d) in sorted_key_data_list] - out.append(sorted_data) - outputs.append(np.array(out)) - return outputs - - -class TestGatherRanges(serial.SerializedTestCase): - @given(boarders_and_data=batched_boarders_and_data(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_gather_ranges(self, boarders_and_data, gc, dc): - boarders, data = boarders_and_data - - def boarders_to_range(boarders): - assert len(boarders) == 2 - boarders = sorted(boarders) - return [boarders[0], boarders[1] - boarders[0]] - - ranges = np.apply_along_axis(boarders_to_range, 2, boarders) - - self.assertReferenceChecks( - device_option=gc, - op=core.CreateOperator( - "GatherRanges", ["data", "ranges"], ["output", "lengths"] - ), - inputs=[data, ranges], - reference=gather_ranges, - ) - - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_gather_ranges_split(self, tensor_splits, gc, dc): - data, ranges, lengths, _ = tensor_splits - - self.assertReferenceChecks( - device_option=gc, - op=core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges"], - ["X_{}".format(i) for i in range(len(lengths))], - lengths=lengths, - ), - inputs=[data, ranges, lengths], - reference=gather_ranges_to_dense, - ) - - @given(tensor_splits=_tensor_splits(), **hu.gcs_cpu_only) - def test_gather_ranges_with_key_split(self, tensor_splits, gc, dc): - data, ranges, lengths, key = tensor_splits - - self.assertReferenceChecks( - device_option=gc, - op=core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges", "key"], - ["X_{}".format(i) for i in range(len(lengths))], - lengths=lengths, - ), - inputs=[data, ranges, key, lengths], - 
reference=gather_ranges_to_dense_with_key, - ) - - def test_shape_and_type_inference(self): - with hu.temp_workspace("shape_type_inf_int32"): - net = core.Net("test_net") - net.ConstantFill([], "ranges", shape=[3, 5, 2], dtype=core.DataType.INT32) - net.ConstantFill([], "values", shape=[64], dtype=core.DataType.INT64) - net.GatherRanges(["values", "ranges"], ["values_output", "lengths_output"]) - (shapes, types) = workspace.InferShapesAndTypes([net], {}) - - self.assertEqual(shapes["values_output"], [64]) - self.assertEqual(types["values_output"], core.DataType.INT64) - self.assertEqual(shapes["lengths_output"], [3]) - self.assertEqual(types["lengths_output"], core.DataType.INT32) - - @given(tensor_splits=_bad_tensor_splits(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_empty_range_check(self, tensor_splits, gc, dc): - data, ranges, lengths, key = tensor_splits - - workspace.FeedBlob("data", data) - workspace.FeedBlob("ranges", ranges) - workspace.FeedBlob("key", key) - - def getOpWithThreshold( - min_observation=2, max_mismatched_ratio=0.5, max_empty_ratio=None - ): - return core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges", "key"], - ["X_{}".format(i) for i in range(len(lengths))], - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - - workspace.RunOperatorOnce(getOpWithThreshold()) - - workspace.RunOperatorOnce( - getOpWithThreshold(max_mismatched_ratio=0.3, min_observation=50) - ) - - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce( - getOpWithThreshold(max_mismatched_ratio=0.3, min_observation=5) - ) - - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce( - getOpWithThreshold(min_observation=50, max_empty_ratio=0.01) - ) - - -if __name__ == "__main__": - import unittest - - unittest.main() diff --git a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py b/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py deleted file mode 100644 index 7dea8f308783..000000000000 --- a/caffe2/python/operator_test/given_tensor_byte_string_to_uint8_fill_op_test.py +++ /dev/null @@ -1,53 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import numpy as np - -import unittest - - -class TestGivenTensorByteStringToUInt8FillOps(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=1, max_dim=4, dtype=np.int32), - **hu.gcs) - def test_given_tensor_byte_string_to_uint8_fill(self, X, gc, dc): - X = X.astype(np.uint8) - print('X: ', str(X)) - op = core.CreateOperator( - "GivenTensorByteStringToUInt8Fill", - [], ["Y"], - shape=X.shape, - dtype=core.DataType.STRING, - values=[X.tobytes()], - ) - - def constant_fill(*args, **kw): - return [X] - - self.assertReferenceChecks(gc, op, [], constant_fill) - self.assertDeviceChecks(dc, op, [], [0]) - - @given(**hu.gcs) - def test_empty_given_tensor_byte_string_to_uint8_fill(self, gc, dc): - X = np.array([], dtype=np.uint8) - print('X: ', str(X)) - op = core.CreateOperator( - "GivenTensorByteStringToUInt8Fill", - [], ["Y"], - shape=X.shape, - values=[X.tobytes()], - ) - - def constant_fill(*args, **kw): - return [X] - - self.assertReferenceChecks(gc, op, [], constant_fill) - self.assertDeviceChecks(dc, op, [], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/given_tensor_fill_op_test.py b/caffe2/python/operator_test/given_tensor_fill_op_test.py 
deleted file mode 100644 index 3d929ce5c0ee..000000000000 --- a/caffe2/python/operator_test/given_tensor_fill_op_test.py +++ /dev/null @@ -1,47 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import hypothesis.strategies as st -import caffe2.python.hypothesis_test_util as hu -import numpy as np - -import unittest - - -class TestGivenTensorFillOps(hu.HypothesisTestCase): - @given(X=hu.tensor(min_dim=1, max_dim=4, dtype=np.int32), - t=st.sampled_from([ - (core.DataType.BOOL, np.bool_, "GivenTensorFill"), - (core.DataType.INT32, np.int32, "GivenTensorFill"), - (core.DataType.FLOAT, np.float32, "GivenTensorFill"), - (core.DataType.INT16, np.int16, "GivenTensorInt16Fill"), - (core.DataType.INT32, np.int32, "GivenTensorIntFill"), - (core.DataType.INT64, np.int64, "GivenTensorInt64Fill"), - (core.DataType.BOOL, np.bool_, "GivenTensorBoolFill"), - (core.DataType.DOUBLE, np.double, "GivenTensorDoubleFill"), - (core.DataType.INT32, np.double, "GivenTensorDoubleFill"), - ]), - **hu.gcs) - def test_given_tensor_fill(self, X, t, gc, dc): - X = X.astype(t[1]) - print('X: ', str(X)) - op = core.CreateOperator( - t[2], [], ["Y"], - shape=X.shape, - dtype=t[0], - values=X.reshape((1, X.size)), - ) - - def constant_fill(*args, **kw): - return [X] - - self.assertReferenceChecks(gc, op, [], constant_fill) - self.assertDeviceChecks(dc, op, [], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/glu_op_test.py b/caffe2/python/operator_test/glu_op_test.py deleted file mode 100644 index 7b7a33dcd90a..000000000000 --- a/caffe2/python/operator_test/glu_op_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - -import unittest - - -@st.composite -def _glu_old_input(draw): - dims = draw(st.lists(st.integers(min_value=1, max_value=5), min_size=1, max_size=3)) - axis = draw(st.integers(min_value=0, max_value=len(dims))) - # The axis dimension must be divisible by two - axis_dim = 2 * draw(st.integers(min_value=1, max_value=2)) - dims.insert(axis, axis_dim) - X = draw(hu.arrays(dims, np.float32, None)) - return (X, axis) - - -class TestGlu(serial.SerializedTestCase): - @given( - X_axis=_glu_old_input(), - **hu.gcs - ) - @settings(deadline=10000) - def test_glu_old(self, X_axis, gc, dc): - X, axis = X_axis - - def glu_ref(X): - x1, x2 = np.split(X, [X.shape[axis] // 2], axis=axis) - Y = x1 * (1. / (1. 
+ np.exp(-x2))) - return [Y] - - op = core.CreateOperator("Glu", ["X"], ["Y"], dim=axis) - self.assertReferenceChecks(gc, op, [X], glu_ref) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py deleted file mode 100644 index 8e864bb42152..000000000000 --- a/caffe2/python/operator_test/group_conv_test.py +++ /dev/null @@ -1,85 +0,0 @@ - - - - -import numpy as np -from hypothesis import assume, given, settings -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, utils -import caffe2.python.hip_test_util as hiputl -import caffe2.python.hypothesis_test_util as hu - -import unittest - -class TestGroupConvolution(hu.HypothesisTestCase): - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 10), - group=st.integers(1, 4), - input_channels_per_group=st.integers(1, 8), - output_channels_per_group=st.integers(1, 8), - batch_size=st.integers(1, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - # Note: Eigen does not support group convolution, but it should - # fall back to the default engine without failing. - engine=st.sampled_from(["", "CUDNN", "EIGEN"]), - use_bias=st.booleans(), - **hu.gcs) - @settings(max_examples=2, deadline=None) - def test_group_convolution( - self, stride, pad, kernel, size, group, - input_channels_per_group, output_channels_per_group, batch_size, - order, engine, use_bias, gc, dc): - assume(size >= kernel) - - if hiputl.run_in_hip(gc, dc): - if order == "NHWC": - assume(group == 1 and engine != "CUDNN") - else: - # TODO: Group conv in NHWC not implemented for GPU yet. - assume(group == 1 or order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - - if group != 1 and order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - # Group conv not implemented with EIGEN engine. 
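# (That is what the assume below enforces: grouped cases are skipped for
# EIGEN rather than relying on an engine fallback. Channel counts are
# per-group, so the totals are input_channels_per_group * group and
# output_channels_per_group * group.)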
- assume(group == 1 or engine != "EIGEN") - - input_channels = input_channels_per_group * group - output_channels = output_channels_per_group * group - - op = core.CreateOperator( - "Conv", - ["X", "w", "b"] if use_bias else ["X", "w"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - order=order, - engine=engine, - group=group, - ) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - 0.5 - w = np.random.rand( - output_channels, kernel, kernel, - input_channels_per_group).astype(np.float32)\ - - 0.5 - b = np.random.rand(output_channels).astype(np.float32) - 0.5 - if order == "NCHW": - X = utils.NHWC2NCHW(X) - w = utils.NHWC2NCHW(w) - - inputs = [X, w, b] if use_bias else [X, w] - - self.assertDeviceChecks(dc, op, inputs, [0]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/group_norm_op_test.py b/caffe2/python/operator_test/group_norm_op_test.py deleted file mode 100644 index 14300beed3f9..000000000000 --- a/caffe2/python/operator_test/group_norm_op_test.py +++ /dev/null @@ -1,153 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestGroupNormOp(serial.SerializedTestCase): - def group_norm_nchw_ref(self, X, gamma, beta, group, epsilon): - dims = X.shape - N = dims[0] - C = dims[1] - G = group - D = int(C / G) - X = X.reshape(N, G, D, -1) - mu = np.mean(X, axis=(2, 3), keepdims=True) - std = np.sqrt((np.var(X, axis=(2, 3), keepdims=True) + epsilon)) - gamma = gamma.reshape(G, D, 1) - beta = beta.reshape(G, D, 1) - Y = gamma * (X - mu) / std + beta - return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - - def group_norm_nhwc_ref(self, X, gamma, beta, group, epsilon): - dims = X.shape - N = dims[0] - C = dims[-1] - G = group - D = int(C / G) - X = X.reshape(N, -1, G, D) - mu = np.mean(X, axis=(1, 3), keepdims=True) - std = np.sqrt((np.var(X, axis=(1, 3), keepdims=True) + epsilon)) - gamma = gamma.reshape(G, D) - beta = beta.reshape(G, D) - Y = gamma * (X - mu) / std + beta - return [Y.reshape(dims), mu.reshape(N, G), (1.0 / std).reshape(N, G)] - - @serial.given( - N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(1, 5), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - def test_group_norm_2d( - self, N, G, D, H, W, epsilon, order, gc, dc): - op = core.CreateOperator( - "GroupNorm", - ["X", "gamma", "beta"], - ["Y", "mean", "inv_std"], - group=G, - epsilon=epsilon, - order=order, - ) - - C = G * D - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) + 1.0 - else: - X = np.random.randn(N, H, W, C).astype(np.float32) + 1.0 - gamma = np.random.randn(C).astype(np.float32) - beta = np.random.randn(C).astype(np.float32) - inputs = [X, gamma, beta] - - def ref_op(X, gamma, beta): - if order == "NCHW": - return self.group_norm_nchw_ref(X, gamma, beta, G, epsilon) - else: - return self.group_norm_nhwc_ref(X, gamma, beta, G, epsilon) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref_op, - threshold=5e-3, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - - @given(N=st.integers(1, 5), G=st.integers(1, 3), 
D=st.integers(2, 3), - T=st.integers(2, 4), H=st.integers(2, 4), W=st.integers(2, 4), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - def test_group_norm_3d( - self, N, G, D, T, H, W, epsilon, order, gc, dc): - op = core.CreateOperator( - "GroupNorm", - ["X", "gamma", "beta"], - ["Y", "mean", "inv_std"], - group=G, - epsilon=epsilon, - order=order, - ) - - C = G * D - if order == "NCHW": - X = np.random.randn(N, C, T, H, W).astype(np.float32) + 1.0 - else: - X = np.random.randn(N, T, H, W, C).astype(np.float32) + 1.0 - gamma = np.random.randn(C).astype(np.float32) - beta = np.random.randn(C).astype(np.float32) - inputs = [X, gamma, beta] - - def ref_op(X, gamma, beta): - if order == "NCHW": - return self.group_norm_nchw_ref(X, gamma, beta, G, epsilon) - else: - return self.group_norm_nhwc_ref(X, gamma, beta, G, epsilon) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref_op, - threshold=5e-3, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - - @given(N=st.integers(1, 5), G=st.integers(1, 5), D=st.integers(2, 2), - H=st.integers(2, 5), W=st.integers(2, 5), - epsilon=st.floats(min_value=1e-5, max_value=1e-4), - order=st.sampled_from(["NCHW", "NHWC"]), **hu.gcs) - @settings(deadline=10000) - def test_group_norm_grad( - self, N, G, D, H, W, epsilon, order, gc, dc): - op = core.CreateOperator( - "GroupNorm", - ["X", "gamma", "beta"], - ["Y", "mean", "inv_std"], - group=G, - epsilon=epsilon, - order=order, - ) - - C = G * D - X = np.arange(N * C * H * W).astype(np.float32) - np.random.shuffle(X) - if order == "NCHW": - X = X.reshape((N, C, H, W)) - else: - X = X.reshape((N, H, W, C)) - gamma = np.random.randn(C).astype(np.float32) - beta = np.random.randn(C).astype(np.float32) - inputs = [X, gamma, beta] - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/gru_test.py b/caffe2/python/operator_test/gru_test.py deleted file mode 100644 index 1a7db2634989..000000000000 --- a/caffe2/python/operator_test/gru_test.py +++ /dev/null @@ -1,389 +0,0 @@ - - - - - -from caffe2.python import workspace, core, scope, gru_cell -from caffe2.python.model_helper import ModelHelper -from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from caffe2.proto import caffe2_pb2 - -from functools import partial -from hypothesis import given -from hypothesis import settings as ht_settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -def gru_unit(*args, **kwargs): - ''' - Implements one GRU unit, for one time step - - Shapes: - hidden_t_prev.shape = (1, N, D) - gates_out_t.shape = (1, N, G) - seq_lenths.shape = (N,) - ''' - - drop_states = kwargs.get('drop_states', False) - sequence_lengths = kwargs.get('sequence_lengths', True) - - if sequence_lengths: - hidden_t_prev, gates_out_t, seq_lengths, timestep = args - else: - hidden_t_prev, gates_out_t, timestep = args - - N = hidden_t_prev.shape[1] - D = hidden_t_prev.shape[2] - G = gates_out_t.shape[2] - t = (timestep * np.ones(shape=(N, D))).astype(np.int32) - assert t.shape == (N, D) - assert G == 3 * D - # Calculate reset, update, and output gates separately - # because output gate depends on reset gate. 
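# (As implemented below: r = sigmoid(g_r), z = sigmoid(g_z), h_cand = tanh(g_o),
# h_t = z * h_prev + (1 - z) * h_cand, and elements past seq_lengths keep
# h_prev, optionally zeroed when drop_states is set.)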
- gates_out_t = gates_out_t.reshape(N, 3, D) - reset_gate_t = gates_out_t[:, 0, :].reshape(N, D) - update_gate_t = gates_out_t[:, 1, :].reshape(N, D) - output_gate_t = gates_out_t[:, 2, :].reshape(N, D) - - # Calculate gate outputs. - reset_gate_t = sigmoid(reset_gate_t) - update_gate_t = sigmoid(update_gate_t) - output_gate_t = tanh(output_gate_t) - - if sequence_lengths: - seq_lengths = (np.ones(shape=(N, D)) * - seq_lengths.reshape(N, 1)).astype(np.int32) - assert seq_lengths.shape == (N, D) - valid = (t < seq_lengths).astype(np.int32) - else: - valid = np.ones(shape=(N, D)) - assert valid.shape == (N, D) - hidden_t = update_gate_t * hidden_t_prev + \ - (1 - update_gate_t) * output_gate_t - hidden_t = hidden_t * valid + hidden_t_prev * \ - (1 - valid) * (1 - drop_states) - hidden_t = hidden_t.reshape(1, N, D) - - return (hidden_t, ) - - -def gru_reference(input, hidden_input, - reset_gate_w, reset_gate_b, - update_gate_w, update_gate_b, - output_gate_w, output_gate_b, - seq_lengths, drop_states=False, - linear_before_reset=False): - D = hidden_input.shape[hidden_input.ndim - 1] - T = input.shape[0] - N = input.shape[1] - G = input.shape[2] - print("Dimensions: T= ", T, " N= ", N, " G= ", G, " D= ", D) - hidden = np.zeros(shape=(T + 1, N, D)) - hidden[0, :, :] = hidden_input - - for t in range(T): - input_t = input[t].reshape(1, N, G) - hidden_t_prev = hidden[t].reshape(1, N, D) - - # Split input contributions for three gates. - input_t = input_t.reshape(N, 3, D) - input_reset = input_t[:, 0, :].reshape(N, D) - input_update = input_t[:, 1, :].reshape(N, D) - input_output = input_t[:, 2, :].reshape(N, D) - - reset_gate = np.dot(hidden_t_prev, reset_gate_w.T) + reset_gate_b - reset_gate = reset_gate + input_reset - - update_gate = np.dot(hidden_t_prev, update_gate_w.T) + update_gate_b - update_gate = update_gate + input_update - - if linear_before_reset: - with_linear = np.dot( - hidden_t_prev, output_gate_w.T) + output_gate_b - output_gate = sigmoid(reset_gate) * with_linear - else: - with_reset = hidden_t_prev * sigmoid(reset_gate) - output_gate = np.dot(with_reset, output_gate_w.T) + output_gate_b - output_gate = output_gate + input_output - - gates_out_t = np.concatenate( - (reset_gate, update_gate, output_gate), - axis=2, - ) - print(reset_gate, update_gate, output_gate, gates_out_t, sep="\n") - - (hidden_t, ) = gru_unit( - hidden_t_prev, - gates_out_t, - seq_lengths, - t, - drop_states=drop_states - ) - hidden[t + 1] = hidden_t - - return ( - hidden[1:], - hidden[-1].reshape(1, N, D), - ) - - -def gru_unit_op_input(): - ''' - Create input tensor where each dimension is from 1 to 4, ndim=3 and - last dimension size is a factor of 3 - - hidden_t_prev.shape = (1, N, D) - ''' - dims_ = st.tuples( - st.integers(min_value=1, max_value=1), # 1, one timestep - st.integers(min_value=1, max_value=4), # n - st.integers(min_value=1, max_value=4), # d - ) - - def create_input(dims): - dims = list(dims) - dims[2] *= 3 - return hu.arrays(dims) - - return dims_.flatmap(create_input) - - -def gru_input(): - ''' - Create input tensor where each dimension is from 1 to 4, ndim=3 and - last dimension size is a factor of 3 - ''' - dims_ = st.tuples( - st.integers(min_value=1, max_value=4), # t - st.integers(min_value=1, max_value=4), # n - st.integers(min_value=1, max_value=4), # d - ) - - def create_input(dims): - dims = list(dims) - dims[2] *= 3 - return hu.arrays(dims) - - return dims_.flatmap(create_input) - - -def _prepare_gru_unit_op(gc, n, d, outputs_with_grads, - forward_only=False, 
drop_states=False, - sequence_lengths=False, - two_d_initial_states=None): - print("Dims: (n,d) = ({},{})".format(n, d)) - - def generate_input_state(n, d): - if two_d_initial_states: - return np.random.randn(n, d).astype(np.float32) - else: - return np.random.randn(1, n, d).astype(np.float32) - - model = ModelHelper(name='external') - - with scope.NameScope("test_name_scope"): - if sequence_lengths: - hidden_t_prev, gates_t, seq_lengths, timestep = \ - model.net.AddScopedExternalInputs( - "hidden_t_prev", - "gates_t", - 'seq_lengths', - "timestep", - ) - else: - hidden_t_prev, gates_t, timestep = \ - model.net.AddScopedExternalInputs( - "hidden_t_prev", - "gates_t", - "timestep", - ) - workspace.FeedBlob( - hidden_t_prev, - generate_input_state(n, d).astype(np.float32), - device_option=gc - ) - workspace.FeedBlob( - gates_t, - generate_input_state(n, 3 * d).astype(np.float32), - device_option=gc - ) - - if sequence_lengths: - inputs = [hidden_t_prev, gates_t, seq_lengths, timestep] - else: - inputs = [hidden_t_prev, gates_t, timestep] - - hidden_t = model.net.GRUUnit( - inputs, - ['hidden_t'], - forget_bias=0.0, - drop_states=drop_states, - sequence_lengths=sequence_lengths, - ) - model.net.AddExternalOutputs(hidden_t) - workspace.RunNetOnce(model.param_init_net) - - if sequence_lengths: - # 10 is used as a magic number to simulate some reasonable timestep - # and generate some reasonable seq. lengths - workspace.FeedBlob( - seq_lengths, - np.random.randint(1, 10, size=(n,)).astype(np.int32), - device_option=gc - ) - - workspace.FeedBlob( - timestep, - np.random.randint(1, 10, size=(1,)).astype(np.int32), - device_option=core.DeviceOption(caffe2_pb2.CPU), - ) - print("Feed {}".format(timestep)) - - return hidden_t, model.net - - -class GRUCellTest(serial.SerializedTestCase): - - # Test just for GRUUnitOp - @serial.given( - seed=st.integers(0, 2**32 - 1), - input_tensor=gru_unit_op_input(), - fwd_only=st.booleans(), - drop_states=st.booleans(), - sequence_lengths=st.booleans(), - **hu.gcs - ) - def test_gru_unit_op(self, seed, input_tensor, fwd_only, - drop_states, sequence_lengths, gc, dc): - np.random.seed(seed) - outputs_with_grads = [0] - ref = gru_unit - ref = partial(ref) - - t, n, d = input_tensor.shape - assert d % 3 == 0 - d = d // 3 - ref = partial(ref, drop_states=drop_states, - sequence_lengths=sequence_lengths) - - with core.DeviceScope(gc): - net = _prepare_gru_unit_op(gc, n, d, - outputs_with_grads=outputs_with_grads, - forward_only=fwd_only, - drop_states=drop_states, - sequence_lengths=sequence_lengths)[1] - # here we don't provide a real input for the net but just for one of - # its ops (RecurrentNetworkOp). 
So have to hardcode this name - workspace.FeedBlob("test_name_scope/external/recurrent/i2h", - input_tensor, - device_option=gc) - print(str(net.Proto())) - op = net._net.op[-1] - inputs = [workspace.FetchBlob(name) for name in op.input] - - self.assertReferenceChecks( - gc, - op, - inputs, - ref, - input_device_options={"test_name_scope/timestep": hu.cpu_do}, - outputs_to_check=[0], - ) - - # Checking for hidden_prev and gates gradients - if not fwd_only: - for param in range(2): - print("Check param {}".format(param)) - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=inputs, - outputs_to_check=param, - outputs_with_grads=outputs_with_grads, - threshold=0.0001, - stepsize=0.005, - input_device_options={ - "test_name_scope/timestep": hu.cpu_do}, - ) - - @given( - seed=st.integers(0, 2**32 - 1), - input_tensor=gru_input(), - fwd_only=st.booleans(), - drop_states=st.booleans(), - linear_before_reset=st.booleans(), - **hu.gcs - ) - @ht_settings(max_examples=20, deadline=None) - def test_gru_main(self, seed, **kwargs): - np.random.seed(seed) - for outputs_with_grads in [[0], [1], [0, 1]]: - self.gru_base(gru_cell.GRU, gru_reference, - outputs_with_grads=outputs_with_grads, - **kwargs) - - def gru_base(self, create_rnn, ref, outputs_with_grads, - input_tensor, fwd_only, drop_states, linear_before_reset, gc, dc): - - print("GRU test parameters: ", locals()) - t, n, d = input_tensor.shape - assert d % 3 == 0 - d = d // 3 - ref = partial(ref, - drop_states=drop_states, - linear_before_reset=linear_before_reset) - with core.DeviceScope(gc): - net = _prepare_rnn( - t, n, d, create_rnn, - outputs_with_grads=outputs_with_grads, - memory_optim=False, - forget_bias=0.0, - forward_only=fwd_only, - drop_states=drop_states, - linear_before_reset=linear_before_reset, - num_states=1, - )[1] - # here we don't provide a real input for the net but just for one of - # its ops (RecurrentNetworkOp). 
So have to hardcode this name - workspace.FeedBlob("test_name_scope/external/recurrent/i2h", - input_tensor, - device_option=gc) - op = net._net.op[-1] - inputs = [workspace.FetchBlob(name) for name in op.input] - - self.assertReferenceChecks( - gc, - op, - inputs, - ref, - input_device_options={"test_name_scope/timestep": hu.cpu_do}, - outputs_to_check=list(range(2)), - ) - - # Checking for input, gates_t_w and gates_t_b gradients - if not fwd_only: - for param in range(2): - print("Check param {}".format(param)) - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=inputs, - outputs_to_check=param, - outputs_with_grads=outputs_with_grads, - threshold=0.001, - stepsize=0.005, - input_device_options={ - "test_name_scope/timestep": hu.cpu_do}, - ) - - -if __name__ == "__main__": - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - ]) - unittest.main() diff --git a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py b/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py deleted file mode 100644 index 56fc8e81e199..000000000000 --- a/caffe2/python/operator_test/heatmap_max_keypoint_op_test.py +++ /dev/null @@ -1,145 +0,0 @@ - - - - - -import numpy as np -import torch -import sys -import unittest -from scipy import interpolate - -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core, utils -from caffe2.proto import caffe2_pb2 - -import caffe2.python.operator_test.detectron_keypoints as keypoint_utils - -NUM_TEST_ROI = 14 -NUM_KEYPOINTS = 19 -HEATMAP_SIZE = 56 - - -def heatmap_FAIR_keypoint_ref(maps, rois): - return [keypoint_utils.heatmaps_to_keypoints(maps, rois)] - - -def heatmap_approx_keypoint_ref(maps, rois): - return [keypoint_utils.approx_heatmap_keypoint(maps, rois)] - - -def c10_op_ref(maps, rois): - keypoints = torch.ops._caffe2.HeatmapMaxKeypoint( - torch.tensor(maps), - torch.tensor(rois), - should_output_softmax=True, - ) - return [keypoints.numpy()] - - -class TestHeatmapMaxKeypointOp(hu.HypothesisTestCase): - def setUp(self): - super().setUp() - np.random.seed(0) - - # initial coordinates and interpolate HEATMAP_SIZE from it - HEATMAP_SMALL_SIZE = 4 - bboxes_in = 500 * np.random.rand(NUM_TEST_ROI, 4).astype(np.float32) - # only bbox with smaller first coordinates - for i in range(NUM_TEST_ROI): - if bboxes_in[i][0] > bboxes_in[i][2]: - tmp = bboxes_in[i][2] - bboxes_in[i][2] = bboxes_in[i][0] - bboxes_in[i][0] = tmp - if bboxes_in[i][1] > bboxes_in[i][3]: - tmp = bboxes_in[i][3] - bboxes_in[i][3] = bboxes_in[i][1] - bboxes_in[i][1] = tmp - - # initial randomized coordinates for heatmaps and expand it with interpolation - init = np.random.rand( - NUM_TEST_ROI, - NUM_KEYPOINTS, - HEATMAP_SMALL_SIZE, - HEATMAP_SMALL_SIZE).astype(np.float32) - heatmaps_in = np.zeros( - (NUM_TEST_ROI, NUM_KEYPOINTS, HEATMAP_SIZE, HEATMAP_SIZE) - ).astype(np.float32) - for roi in range(NUM_TEST_ROI): - for keyp in range(NUM_KEYPOINTS): - f = interpolate.interp2d( - np.arange(0, 1, 1.0 / HEATMAP_SMALL_SIZE), - np.arange(0, 1, 1.0 / HEATMAP_SMALL_SIZE), - init[roi][keyp], - kind='cubic') - heatmaps_in[roi][keyp] = f( - np.arange(0, 1, 1.0 / HEATMAP_SIZE), - np.arange(0, 1, 1.0 / HEATMAP_SIZE)) - - self.heatmaps_in = heatmaps_in - self.bboxes_in = bboxes_in - - self.op = core.CreateOperator( - 'HeatmapMaxKeypoint', - ['heatmaps_in', 'bboxes_in'], - ['keypoints_out'], - arg=[ - utils.MakeArgument("should_output_softmax", True), - ], - device_option=caffe2_pb2.DeviceOption()) - - @unittest.skipIf('cv2' not in sys.modules, 'python-opencv is 
not installed') - def test_close_to_FAIR(self): - # 10 pixel error in scale of 500px bbox - self.assertReferenceChecks( - device_option=caffe2_pb2.DeviceOption(), - op=self.op, - inputs=[self.heatmaps_in, self.bboxes_in], - reference=heatmap_FAIR_keypoint_ref, - threshold=10, - ) - - def test_approx_heatmap_keypoint(self): - # C++/Python implementation should be bit-wise equal - self.assertReferenceChecks( - device_option=caffe2_pb2.DeviceOption(), - op=self.op, - inputs=[self.heatmaps_in, self.bboxes_in], - reference=heatmap_approx_keypoint_ref, - ) - - def test_special_cases(self): - example_bboxes = np.array([[0, 0, 100, 100]]).astype(np.float32) - heatmap_tests = [] - # special case #1 - heatmap_tests.append(np.array([ - [0.14722, 0.807823, 0.447052], - [0.652919, 0.850923, -0.225462], - [0.805912, 0.75778, -0.563371], - ]).astype(np.float32).reshape((1, 1, 3, 3))) - # special case #2 - heatmap_tests.append(np.array([ - [3.19541, 3.69551, 3.87579], - [3.63094, 3.89978, 3.67606], - [3.78555, 3.87291, 3.28083], - ]).astype(np.float32).reshape((1, 1, 3, 3))) - - for heatmap_test in heatmap_tests: - self.assertReferenceChecks( - device_option=caffe2_pb2.DeviceOption(), - op=self.op, - inputs=[heatmap_test, example_bboxes], - reference=heatmap_approx_keypoint_ref, - ) - - def test_caffe2_pytorch_eq(self): - self.assertReferenceChecks( - device_option=caffe2_pb2.DeviceOption(), - op=self.op, - inputs=[self.heatmaps_in, self.bboxes_in], - reference=c10_op_ref, - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/histogram_test.py b/caffe2/python/operator_test/histogram_test.py deleted file mode 100644 index 874ff18171ee..000000000000 --- a/caffe2/python/operator_test/histogram_test.py +++ /dev/null @@ -1,85 +0,0 @@ -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given, settings - - -class TestHistogram(hu.HypothesisTestCase): - @given(rows=st.integers(1, 1000), cols=st.integers(1, 1000), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_histogram__device_consistency(self, rows, cols, gc, dc): - X = np.random.rand(rows, cols) - bin_edges = list(np.linspace(-2, 10, num=10000)) - op = core.CreateOperator("Histogram", ["X"], ["histogram"], bin_edges=bin_edges) - self.assertDeviceChecks(dc, op, [X], [0]) - - def test_histogram__valid_inputs_0(self): - workspace.FeedBlob( - "X", np.array([-2.0, -2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 9.0]) - ) - bin_edges = [-2.0, -1.0, 0.0, 2.0, 5.0, 9.0] - - net = core.Net("test_net") - net.Histogram(["X"], ["histogram"], bin_edges=bin_edges) - - workspace.RunNetOnce(net) - histogram_blob = workspace.FetchBlob("histogram") - - assert list(histogram_blob) == [2, 0, 4, 3, 1] - - @given(num_tensors=st.integers(1, 5), num_bin_edges=st.integers(2, 10000)) - @settings(deadline=10000) - def test_histogram__valid_inputs_1(self, num_tensors, num_bin_edges): - self._test_histogram( - [ - np.random.rand(np.random.randint(1, 1000), np.random.randint(1, 1000)) - for __ in range(num_tensors) - ], - list(np.logspace(-12, 5, num=num_bin_edges)), - ) - - def test_histogram__empty_input_tensor(self): - self._test_histogram([np.array([])], list(np.linspace(-2, 2, num=10))) - - def test_histogram__non_increasing_bin_edges(self): - with self.assertRaisesRegex( - RuntimeError, "bin_edges must be a strictly increasing sequence of values" - ): - self._test_histogram( - [np.random.rand(100), 
np.random.rand(98)], [0.0, 0.2, 0.1, 0.1] - ) - - def test_histogram__insufficient_bin_edges(self): - with self.assertRaisesRegex( - RuntimeError, "Number of bin edges must be greater than or equal to 2" - ): - self._test_histogram([np.random.rand(111)], [1.0]) - - def _test_histogram(self, tensors, bin_edges): - total_size = 0 - input_blob_names = [] - - for idx, tensor in enumerate(tensors): - total_size += np.size(tensor) - tensor_blob_name = f"X{idx}" - workspace.FeedBlob(tensor_blob_name, tensor) - input_blob_names.append(tensor_blob_name) - - output_name = "histogram" - net = core.Net("test_net") - net.Histogram(input_blob_names, [output_name], bin_edges=bin_edges) - - workspace.RunNetOnce(net) - histogram_blob = workspace.FetchBlob(output_name) - - assert np.size(histogram_blob) == len(bin_edges) - 1 - assert np.sum(histogram_blob) == total_size - - -if __name__ == "__main__": - global_options = ["caffe2"] - core.GlobalInit(global_options) - unittest.main() diff --git a/caffe2/python/operator_test/hsm_test.py b/caffe2/python/operator_test/hsm_test.py deleted file mode 100644 index 8a0754b32d25..000000000000 --- a/caffe2/python/operator_test/hsm_test.py +++ /dev/null @@ -1,252 +0,0 @@ - - - - -from hypothesis import given, settings -import numpy as np -import unittest - -from caffe2.proto import caffe2_pb2, hsm_pb2 -from caffe2.python import workspace, core, gradient_checker -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.hsm_util as hsmu - -# User inputs tree using protobuf file or, in this case, python utils -# The hierarchy in this test looks as shown below. Note that the final subtrees -# (with word_ids as leaves) have been collapsed for visualization -# * -# / \ -# * 5,6,7,8 -# / \ -# 0,1,2 3,4 -tree = hsm_pb2.TreeProto() -words = [[0, 1, 2], [3, 4], [5, 6, 7, 8]] -node1 = hsmu.create_node_with_words(words[0], "node1") -node2 = hsmu.create_node_with_words(words[1], "node2") -node3 = hsmu.create_node_with_words(words[2], "node3") -node4 = hsmu.create_node_with_nodes([node1, node2], "node4") -node = hsmu.create_node_with_nodes([node4, node3], "node5") -tree.root_node.MergeFrom(node) - -# structure: -# node5: [0, 2, ["node4", "node3"]] # offset, length, ["node4", "node3"] -# node4: [2, 2, ["node1", "node2"]] -# node1: [4, 3, [0, 1, 2]] -# node2: [7, 2, [3, 4]] -# node3: [9, 4, [5, 6, 7, 8]] -struct = [[0, 2, ["node4", "node3"], "node5"], - [2, 2, ["node1", "node2"], "node4"], - [4, 3, [0, 1, 2], "node1"], - [7, 2, [3, 4], "node2"], - [9, 4, [5, 6, 7, 8], "node3"]] - -# Internal util to translate the input tree to a list of (word_id, path) pairs. The 
serialized -# hierarchy is passed into the operator_def as a string argument, -hierarchy_proto = hsmu.create_hierarchy(tree) -arg = caffe2_pb2.Argument() -arg.name = "hierarchy" -arg.s = hierarchy_proto.SerializeToString() - -beam = 5 -args_search = [] -arg_search = caffe2_pb2.Argument() -arg_search.name = "tree" -arg_search.s = tree.SerializeToString() -args_search.append(arg_search) -arg_search = caffe2_pb2.Argument() -arg_search.name = "beam" -arg_search.f = beam -args_search.append(arg_search) - - -class TestHsm(hu.HypothesisTestCase): - def test_hsm_search(self): - samples = 10 - dim_in = 5 - X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5 - w = np.random.rand(hierarchy_proto.size, dim_in) \ - .astype(np.float32) - 0.5 - b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5 - labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \ - .astype(np.int32) - - workspace.GlobalInit(['caffe2']) - workspace.FeedBlob("data", X) - workspace.FeedBlob("weights", w) - workspace.FeedBlob("bias", b) - workspace.FeedBlob("labels", labels) - op = core.CreateOperator( - 'HSoftmaxSearch', - ['data', 'weights', 'bias'], - ['names', 'scores'], - 'HSoftmaxSearch', - arg=args_search) - workspace.RunOperatorOnce(op) - names = workspace.FetchBlob('names') - scores = workspace.FetchBlob('scores') - - def simulation_hsm_search(): - names = [] - scores = [] - for line in struct: - s, e = line[0], line[0] + line[1] - score = np.dot(X, w[s:e].transpose()) + b[s:e] - score = np.exp(score - np.max(score, axis=1, keepdims=True)) - score /= score.sum(axis=1, keepdims=True) - score = -np.log(score) - - score = score.transpose() - idx = -1 - for j, n in enumerate(names): - if n == line[3]: - idx = j - score += scores[j] - if idx == -1: - score[score > beam] = np.inf - else: - score[score - scores[idx] > beam] = np.inf - - for i, name in enumerate(line[2]): - scores.append(score[i]) - names.append(name) - scores = np.vstack(scores) - return names, scores.transpose() - - p_names, p_scores = simulation_hsm_search() - idx = np.argsort(p_scores, axis=1) - p_scores = np.sort(p_scores, axis=1) - p_names = np.array(p_names)[idx] - for i in range(names.shape[0]): - for j in range(names.shape[1]): - if names[i][j]: - self.assertEqual( - names[i][j], p_names[i][j].item().encode('utf-8')) - self.assertAlmostEqual( - scores[i][j], p_scores[i][j], delta=0.001) - - def test_hsm_run_once(self): - workspace.GlobalInit(['caffe2']) - workspace.FeedBlob("data", - np.random.randn(1000, 100).astype(np.float32)) - workspace.FeedBlob("weights", - np.random.randn(1000, 100).astype(np.float32)) - workspace.FeedBlob("bias", np.random.randn(1000).astype(np.float32)) - workspace.FeedBlob("labels", np.random.rand(1000).astype(np.int32) * 9) - op = core.CreateOperator( - 'HSoftmax', - ['data', 'weights', 'bias', 'labels'], - ['output', 'intermediate_output'], - 'HSoftmax', - arg=[arg]) - self.assertTrue(workspace.RunOperatorOnce(op)) - - # Test to check value of sum of squared losses in forward pass for given - # input - def test_hsm_forward(self): - cpu_device_option = caffe2_pb2.DeviceOption() - grad_checker = gradient_checker.GradientChecker( - 0.01, 0.05, cpu_device_option, "default") - samples = 9 - dim_in = 5 - X = np.zeros((samples, dim_in)).astype(np.float32) + 1 - w = np.zeros((hierarchy_proto.size, dim_in)).astype(np.float32) + 1 - b = np.array([i for i in range(hierarchy_proto.size)])\ - .astype(np.float32) - labels = np.array([i for i in range(samples)]).astype(np.int32) - - 
workspace.GlobalInit(['caffe2']) - workspace.FeedBlob("data", X) - workspace.FeedBlob("weights", w) - workspace.FeedBlob("bias", b) - workspace.FeedBlob("labels", labels) - - op = core.CreateOperator( - 'HSoftmax', - ['data', 'weights', 'bias', 'labels'], - ['output', 'intermediate_output'], - 'HSoftmax', - arg=[arg]) - grad_ops, g_input = core.GradientRegistry.GetGradientForOp( - op, [s + '_grad' for s in op.output]) - - loss, _ = grad_checker.GetLossAndGrad( - op, grad_ops, [X, w, b, labels], op.input, 0, g_input[0], [0] - ) - self.assertAlmostEqual(loss, 44.269, delta=0.001) - - # Test to compare gradient calculated using the gradient operator and the - # symmetric derivative calculated using Euler Method - # TODO : convert to both cpu and gpu test when ready. - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_hsm_gradient(self, gc, dc): - samples = 10 - dim_in = 5 - X = np.random.rand(samples, dim_in).astype(np.float32) - 0.5 - w = np.random.rand(hierarchy_proto.size, dim_in) \ - .astype(np.float32) - 0.5 - b = np.random.rand(hierarchy_proto.size).astype(np.float32) - 0.5 - labels = np.array([np.random.randint(0, 8) for i in range(samples)]) \ - .astype(np.int32) - - workspace.GlobalInit(['caffe2']) - workspace.FeedBlob("data", X) - workspace.FeedBlob("weights", w) - workspace.FeedBlob("bias", b) - workspace.FeedBlob("labels", labels) - - op = core.CreateOperator( - 'HSoftmax', - ['data', 'weights', 'bias', 'labels'], - ['output', 'intermediate_output'], - 'HSoftmax', - arg=[arg]) - - self.assertDeviceChecks(dc, op, [X, w, b, labels], [0]) - - for i in range(3): - self.assertGradientChecks(gc, op, [X, w, b, labels], i, [0]) - - def test_huffman_tree_hierarchy(self): - workspace.GlobalInit(['caffe2']) - labelSet = list(range(0, 6)) - counts = [1, 2, 3, 4, 5, 6] - labels = sum([[l] * c for (l, c) in zip(labelSet, counts)], []) - Y = np.array(labels).astype(np.int64) - workspace.FeedBlob("labels", Y) - arg = caffe2_pb2.Argument() - arg.name = 'num_classes' - arg.i = 6 - op = core.CreateOperator( - 'HuffmanTreeHierarchy', - ['labels'], - ['huffman_tree'], - 'HuffmanTreeHierarchy', - arg=[arg]) - workspace.RunOperatorOnce(op) - huffmanTreeOutput = workspace.FetchBlob('huffman_tree') - treeOutput = hsm_pb2.TreeProto() - treeOutput.ParseFromString(huffmanTreeOutput[0]) - treePathOutput = hsmu.create_hierarchy(treeOutput) - - label_to_path = {} - for path in treePathOutput.paths: - label_to_path[path.word_id] = path - - def checkPath(label, indices, code): - path = label_to_path[label] - self.assertEqual(len(path.path_nodes), len(code)) - self.assertEqual(len(path.path_nodes), len(code)) - for path_node, index, target in \ - zip(path.path_nodes, indices, code): - self.assertEqual(path_node.index, index) - self.assertEqual(path_node.target, target) - checkPath(0, [0, 4, 6, 8], [1, 0, 0, 0]) - checkPath(1, [0, 4, 6, 8], [1, 0, 0, 1]) - checkPath(2, [0, 4, 6], [1, 0, 1]) - checkPath(3, [0, 2], [0, 0]) - checkPath(4, [0, 2], [0, 1]) - checkPath(5, [0, 4], [1, 1]) - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/operator_test/hyperbolic_ops_test.py b/caffe2/python/operator_test/hyperbolic_ops_test.py deleted file mode 100644 index c0a1e8f49f5a..000000000000 --- a/caffe2/python/operator_test/hyperbolic_ops_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - 
- -class TestHyperbolicOps(serial.SerializedTestCase): - def _test_hyperbolic_op(self, op_name, np_ref, X, in_place, engine, gc, dc): - op = core.CreateOperator( - op_name, - ["X"], - ["X"] if in_place else ["Y"], - engine=engine,) - - def ref(X): - return [np_ref(X)] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=ref, - ensure_outputs_are_inferred=True, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], ensure_outputs_are_inferred=True) - - @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) - def test_sinh(self, X, gc, dc): - self._test_hyperbolic_op("Sinh", np.sinh, X, False, "", gc, dc) - - @serial.given(X=hu.tensor(dtype=np.float32), **hu.gcs) - def test_cosh(self, X, gc, dc): - self._test_hyperbolic_op("Cosh", np.cosh, X, False, "", gc, dc) - - @serial.given(X=hu.tensor(dtype=np.float32), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) - def test_tanh(self, X, in_place, engine, gc, dc): - self._test_hyperbolic_op("Tanh", np.tanh, X, in_place, engine, gc, dc) diff --git a/caffe2/python/operator_test/im2col_col2im_test.py b/caffe2/python/operator_test/im2col_col2im_test.py deleted file mode 100644 index 42cb1deaf8ae..000000000000 --- a/caffe2/python/operator_test/im2col_col2im_test.py +++ /dev/null @@ -1,138 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import assume, given, settings - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - - -class TestReduceFrontSum(hu.HypothesisTestCase): - @given(batch_size=st.integers(1, 3), - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - channels=st.integers(1, 8), - **hu.gcs) - def test_im2col_layout(self, batch_size, stride, pad, kernel, dilation, - size, channels, gc, dc): - - dkernel = (dilation * (kernel - 1) + 1) - assume(size >= dkernel) - - NCHW_TO_NHWC = (0, 2, 3, 1) - NHWC_TO_NCHW = (0, 3, 1, 2) - COL_NHWC_TO_NCHW = (4, 2, 3, 0, 1) - - N = batch_size - C = channels - H = size - W = size - - out_h = int((H + (2 * pad) - dkernel) / stride + 1) - out_w = int((W + (2 * pad) - dkernel) / stride + 1) - - im_nchw = np.random.rand(N, C, H, W).astype(np.float32) - 0.5 - im_nhwc = im_nchw.transpose(NCHW_TO_NHWC) - - op_im2col_nchw = core.CreateOperator( - "Im2Col", - ["im_nchw"], ["col_nchw"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order="NCHW", - device_option=gc) - - op_im2col_nhwc = core.CreateOperator( - "Im2Col", - ["im_nhwc"], ["col_nhwc"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order="NHWC", - device_option=gc) - - self.ws.create_blob("im_nchw").feed(im_nchw, device_option=gc) - self.ws.create_blob("im_nhwc").feed(im_nhwc, device_option=gc) - self.ws.run(op_im2col_nchw) - self.ws.run(op_im2col_nhwc) - - # there is probably a clever way to spell this in np - col_nchw = self.ws.blobs["col_nchw"].fetch() - col_nhwc = self.ws.blobs["col_nhwc"].fetch() - col_nchw_ = col_nchw.reshape(N, C, kernel, kernel, out_h, out_w) - col_nhwc_ = col_nhwc.reshape(N, out_h, out_w, kernel, kernel, C) - for i in range(0, N): - np.testing.assert_allclose( - col_nchw_[i], - col_nhwc_[i].transpose(COL_NHWC_TO_NCHW), - atol=1e-4, - rtol=1e-4) - - op_col2im_nchw = core.CreateOperator( - "Col2Im", - ["col_nchw", "im_nchw"], - ["out_nchw"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order="NCHW", - 
device_option=gc) - - op_col2im_nhwc = core.CreateOperator( - "Col2Im", - ["col_nhwc", "im_nhwc"], - ["out_nhwc"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order="NHWC", - device_option=gc) - - self.ws.run(op_col2im_nchw) - self.ws.run(op_col2im_nhwc) - - out_nchw = self.ws.blobs["out_nchw"].fetch() - out_nhwc = self.ws.blobs["out_nhwc"].fetch() - np.testing.assert_allclose( - out_nchw, - out_nhwc.transpose(NHWC_TO_NCHW), - atol=1e-4, - rtol=1e-4) - - @given(batch_size=st.integers(1, 3), - stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - dilation=st.integers(1, 3), - size=st.integers(7, 10), - channels=st.integers(1, 8), - order=st.sampled_from(["NCHW"]), - **hu.gcs) - @settings(deadline=10000) - def test_col2im_gradients(self, batch_size, stride, pad, kernel, - dilation, size, channels, order, gc, dc): - assume(size >= dilation * (kernel - 1) + 1) - op = core.CreateOperator( - "Im2Col", - ["X"], ["Y"], - stride=stride, - kernel=kernel, - dilation=dilation, - pad=pad, - order=order, - device_option=gc) - X = np.random.rand(batch_size, channels, size, size).astype(np.float32) - self.assertGradientChecks(gc, op, [X], 0, [0]) - return diff --git a/caffe2/python/operator_test/image_input_op_test.py b/caffe2/python/operator_test/image_input_op_test.py deleted file mode 100644 index 6bed69af9ae0..000000000000 --- a/caffe2/python/operator_test/image_input_op_test.py +++ /dev/null @@ -1,433 +0,0 @@ - - - - - -import unittest -try: - import cv2 - import lmdb -except ImportError: - pass # Handled below - -from PIL import Image -import numpy as np -import shutil -import io -import sys -import tempfile - -# TODO: This test does not test scaling because -# the algorithms used by OpenCV in the C and Python -# versions seem to differ slightly. It does test -# most other features. - -from hypothesis import given, settings, Verbosity -import hypothesis.strategies as st - -from caffe2.proto import caffe2_pb2 -import caffe2.python.hypothesis_test_util as hu - -from caffe2.python import workspace, core - - -# Verification routines (apply transformations to the image to -# verify that the operator produces the same result) -def verify_apply_bounding_box(img, box): - import skimage.util - if any(type(box[f]) is not int or np.isnan(box[f]) or box[f] < 0 - for f in range(0, 4)): - return img - # Box is ymin, xmin, bound_height, bound_width - y_bounds = (box[0], img.shape[0] - box[0] - box[2]) - x_bounds = (box[1], img.shape[1] - box[1] - box[3]) - c_bounds = (0, 0) - - if any(el < 0 for el in list(y_bounds) + list(x_bounds) + list(c_bounds)): - return img - - bboxed = skimage.util.crop(img, (y_bounds, x_bounds, c_bounds)) - return bboxed - - -# This function is called, but its resize path should never run.
It will trip on assert False if -# the arguments are wrong (improper example) -def verify_rescale(img, minsize): - # Here we use OpenCV transformation to match the C code - scale_amount = float(minsize) / min(img.shape[0], img.shape[1]) - if scale_amount <= 1.0: - return img - - print("Scale amount is %f -- should be < 1.0; got shape %s" % - (scale_amount, str(img.shape))) - assert False - img_cv = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - output_shape = (int(np.ceil(scale_amount * img_cv.shape[0])), - int(np.ceil(scale_amount * img_cv.shape[1]))) - resized = cv2.resize(img_cv, - dsize=output_shape, - interpolation=cv2.INTER_AREA) - - resized = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB) - assert resized.shape[0] >= minsize - assert resized.shape[1] >= minsize - return resized - - -def verify_crop(img, crop): - import skimage.util - assert img.shape[0] >= crop - assert img.shape[1] >= crop - y_offset = 0 - if img.shape[0] > crop: - y_offset = (img.shape[0] - crop) // 2 - - x_offset = 0 - if img.shape[1] > crop: - x_offset = (img.shape[1] - crop) // 2 - - y_bounds = (y_offset, img.shape[0] - crop - y_offset) - x_bounds = (x_offset, img.shape[1] - crop - x_offset) - c_bounds = (0, 0) - cropped = skimage.util.crop(img, (y_bounds, x_bounds, c_bounds)) - assert cropped.shape[0] == crop - assert cropped.shape[1] == crop - return cropped - - -def verify_color_normalize(img, means, stds): - # Note the RGB/BGR inversion - # Operate on integers like the C version - img = img * 255.0 - img[:, :, 0] = (img[:, :, 0] - means[2]) / stds[2] - img[:, :, 1] = (img[:, :, 1] - means[1]) / stds[1] - img[:, :, 2] = (img[:, :, 2] - means[0]) / stds[0] - return img * (1.0 / 255.0) - - -# Printing function (for debugging) -def caffe2_img(img): - # Convert RGB to BGR - img = img[:, :, (2, 1, 0)] - # Convert HWC to CHW - img = img.swapaxes(1, 2).swapaxes(0, 1) - img = img * 255.0 - return img.astype(np.int32) - - -# Bounding box is ymin, xmin, height, width -def create_test(output_dir, width, height, default_bound, minsize, crop, means, - stds, count, label_type, num_labels, output1=None, - output2_size=None): - print("Creating a temporary lmdb database of %d pictures..." 
% (count)) - - if default_bound is None: - default_bound = [-1] * 4 - - LMDB_MAP_SIZE = 1 << 40 - env = lmdb.open(output_dir, map_size=LMDB_MAP_SIZE, subdir=True) - index = 0 - # Create images and the expected results - expected_results = [] - with env.begin(write=True) as txn: - while index < count: - img_array = np.random.random_integers( - 0, 255, [height, width, 3]).astype(np.uint8) - img_obj = Image.fromarray(img_array) - img_str = io.BytesIO() - img_obj.save(img_str, 'PNG') - - # Create a random bounding box for every other image - # ymin, xmin, bound_height, bound_width - # TODO: To ensure that we never need to scale, we - # ensure that the bounding-box is larger than the - # minsize parameter - bounding_box = list(default_bound) - do_default_bound = True - if index % 2 == 0: - if height > minsize and width > minsize: - do_default_bound = False - bounding_box[0:2] = [np.random.randint(a) for a in - (height - minsize, width - minsize)] - bounding_box[2:4] = [np.random.randint(a) + minsize for a in - (height - bounding_box[0] - minsize + 1, - width - bounding_box[1] - minsize + 1)] - # print("Bounding box is %s" % (str(bounding_box))) - # Create expected result - img_expected = img_array.astype(np.float32) * (1.0 / 255.0) - # print("Orig image: %s" % (str(caffe2_img(img_expected)))) - img_expected = verify_apply_bounding_box( - img_expected, - bounding_box) - # print("Bounded image: %s" % (str(caffe2_img(img_expected)))) - - img_expected = verify_rescale(img_expected, minsize) - - img_expected = verify_crop(img_expected, crop) - # print("Crop image: %s" % (str(caffe2_img(img_expected)))) - - img_expected = verify_color_normalize(img_expected, means, stds) - # print("Color image: %s" % (str(caffe2_img(img_expected)))) - - tensor_protos = caffe2_pb2.TensorProtos() - image_tensor = tensor_protos.protos.add() - image_tensor.data_type = 4 # string data - image_tensor.string_data.append(img_str.getvalue()) - img_str.close() - - label_tensor = tensor_protos.protos.add() - label_tensor.data_type = 2 # int32 data - assert (label_type >= 0 and label_type <= 3) - if label_type == 0: - label_tensor.int32_data.append(index) - expected_label = index - elif label_type == 1: - binary_labels = np.random.randint(2, size=num_labels) - for idx, val in enumerate(binary_labels.tolist()): - if val == 1: - label_tensor.int32_data.append(idx) - expected_label = binary_labels - elif label_type == 2: - embedding_label = np.random.randint(100, size=num_labels) - for _idx, val in enumerate(embedding_label.tolist()): - label_tensor.int32_data.append(val) - expected_label = embedding_label - elif label_type == 3: - weight_tensor = tensor_protos.protos.add() - weight_tensor.data_type = 1 # float weights - binary_labels = np.random.randint(2, size=num_labels) - expected_label = np.zeros(num_labels).astype(np.float32) - for idx, val in enumerate(binary_labels.tolist()): - expected_label[idx] = val * idx - if val == 1: - label_tensor.int32_data.append(idx) - weight_tensor.float_data.append(idx) - - if output1: - output1_tensor = tensor_protos.protos.add() - output1_tensor.data_type = 1 # float data - output1_tensor.float_data.append(output1) - - output2 = [] - if output2_size: - output2_tensor = tensor_protos.protos.add() - output2_tensor.data_type = 2 # int32 data - values = np.random.randint(1024, size=output2_size) - for val in values.tolist(): - output2.append(val) - output2_tensor.int32_data.append(val) - - expected_results.append( - [caffe2_img(img_expected), expected_label, output1, output2]) - - if not 
do_default_bound: - bounding_tensor = tensor_protos.protos.add() - bounding_tensor.data_type = 2 # int32 data - bounding_tensor.int32_data.extend(bounding_box) - - txn.put( - '{}'.format(index).encode('ascii'), - tensor_protos.SerializeToString() - ) - index = index + 1 - # End while - # End with - return expected_results - - -def run_test( - size_tuple, means, stds, label_type, num_labels, is_test, scale_jitter_type, - color_jitter, color_lighting, dc, validator, output1=None, output2_size=None): - # TODO: Does not test on GPU and does not test use_gpu_transform - # WARNING: Using ModelHelper automatically does NHWC to NCHW - # transformation if needed. - width, height, minsize, crop = size_tuple - means = [float(m) for m in means] - stds = [float(s) for s in stds] - out_dir = tempfile.mkdtemp() - count_images = 2 # One with bounding box and one without - expected_images = create_test( - out_dir, - width=width, - height=height, - default_bound=(3, 5, height - 3, width - 5), - minsize=minsize, - crop=crop, - means=means, - stds=stds, - count=count_images, - label_type=label_type, - num_labels=num_labels, - output1=output1, - output2_size=output2_size - ) - for device_option in dc: - with hu.temp_workspace(): - reader_net = core.Net('reader') - reader_net.CreateDB( - [], - 'DB', - db=out_dir, - db_type="lmdb" - ) - workspace.RunNetOnce(reader_net) - outputs = ['data', 'label'] - output_sizes = [] - if output1: - outputs.append('output1') - output_sizes.append(1) - if output2_size: - outputs.append('output2') - output_sizes.append(output2_size) - imageop = core.CreateOperator( - 'ImageInput', - ['DB'], - outputs, - batch_size=count_images, - color=3, - minsize=minsize, - crop=crop, - is_test=is_test, - bounding_ymin=3, - bounding_xmin=5, - bounding_height=height - 3, - bounding_width=width - 5, - mean_per_channel=means, - std_per_channel=stds, - use_gpu_transform=(device_option.device_type == 1), - label_type=label_type, - num_labels=num_labels, - output_sizes=output_sizes, - scale_jitter_type=scale_jitter_type, - color_jitter=color_jitter, - color_lighting=color_lighting - ) - - imageop.device_option.CopyFrom(device_option) - main_net = core.Net('main') - main_net.Proto().op.extend([imageop]) - workspace.RunNetOnce(main_net) - validator(expected_images, device_option, count_images) - # End for - # End with - # End for - shutil.rmtree(out_dir) -# end run_test - - -@unittest.skipIf('cv2' not in sys.modules, 'python-opencv is not installed') -@unittest.skipIf('lmdb' not in sys.modules, 'python-lmdb is not installed') -class TestImport(hu.HypothesisTestCase): - def validate_image_and_label( - self, expected_images, device_option, count_images, label_type, - is_test, scale_jitter_type, color_jitter, color_lighting): - l = workspace.FetchBlob('label') - result = workspace.FetchBlob('data').astype(np.int32) - # If we don't use_gpu_transform, the output is in NHWC - # Our reference output is CHW so we swap - if device_option.device_type != 1: - expected = [img.swapaxes(0, 1).swapaxes(1, 2) for - (img, _, _, _) in expected_images] - else: - expected = [img for (img, _, _, _) in expected_images] - for i in range(count_images): - if label_type == 0: - self.assertEqual(l[i], expected_images[i][1]) - else: - self.assertEqual( - (l[i] - expected_images[i][1] > 0).sum(), 0) - if is_test == 0: - # when training data preparation is randomized (e.g. 
random cropping, - # Inception-style random sized cropping, color jittering, - # color lighting), we only compare blob shapes - for (s1, s2) in zip(expected[i].shape, result[i].shape): - self.assertEqual(s1, s2) - else: - self.assertEqual((expected[i] - result[i] > 1).sum(), 0) - # End for - # end validate_image_and_label - - @given(size_tuple=st.tuples( - st.integers(min_value=8, max_value=4096), - st.integers(min_value=8, max_value=4096)).flatmap(lambda t: st.tuples( - st.just(t[0]), st.just(t[1]), - st.just(min(t[0] - 6, t[1] - 4)), - st.integers(min_value=1, max_value=min(t[0] - 6, t[1] - 4)))), - means=st.tuples(st.integers(min_value=0, max_value=255), - st.integers(min_value=0, max_value=255), - st.integers(min_value=0, max_value=255)), - stds=st.tuples(st.floats(min_value=1, max_value=10), - st.floats(min_value=1, max_value=10), - st.floats(min_value=1, max_value=10)), - label_type=st.integers(0, 3), - num_labels=st.integers(min_value=8, max_value=4096), - is_test=st.integers(min_value=0, max_value=1), - scale_jitter_type=st.integers(min_value=0, max_value=1), - color_jitter=st.integers(min_value=0, max_value=1), - color_lighting=st.integers(min_value=0, max_value=1), - **hu.gcs) - @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) - def test_imageinput( - self, size_tuple, means, stds, label_type, - num_labels, is_test, scale_jitter_type, color_jitter, color_lighting, - gc, dc): - def validator(expected_images, device_option, count_images): - self.validate_image_and_label( - expected_images, device_option, count_images, label_type, - is_test, scale_jitter_type, color_jitter, color_lighting) - # End validator - run_test( - size_tuple, means, stds, label_type, num_labels, is_test, - scale_jitter_type, color_jitter, color_lighting, dc, validator) - # End test_imageinput - - @given(size_tuple=st.tuples( - st.integers(min_value=8, max_value=4096), - st.integers(min_value=8, max_value=4096)).flatmap(lambda t: st.tuples( - st.just(t[0]), st.just(t[1]), - st.just(min(t[0] - 6, t[1] - 4)), - st.integers(min_value=1, max_value=min(t[0] - 6, t[1] - 4)))), - means=st.tuples(st.integers(min_value=0, max_value=255), - st.integers(min_value=0, max_value=255), - st.integers(min_value=0, max_value=255)), - stds=st.tuples(st.floats(min_value=1, max_value=10), - st.floats(min_value=1, max_value=10), - st.floats(min_value=1, max_value=10)), - label_type=st.integers(0, 3), - num_labels=st.integers(min_value=8, max_value=4096), - is_test=st.integers(min_value=0, max_value=1), - scale_jitter_type=st.integers(min_value=0, max_value=1), - color_jitter=st.integers(min_value=0, max_value=1), - color_lighting=st.integers(min_value=0, max_value=1), - output1=st.floats(min_value=1, max_value=10), - output2_size=st.integers(min_value=2, max_value=10), - **hu.gcs) - @settings(verbosity=Verbosity.verbose, max_examples=10, deadline=None) - def test_imageinput_with_additional_outputs( - self, size_tuple, means, stds, label_type, - num_labels, is_test, scale_jitter_type, color_jitter, color_lighting, - output1, output2_size, gc, dc): - def validator(expected_images, device_option, count_images): - self.validate_image_and_label( - expected_images, device_option, count_images, label_type, - is_test, scale_jitter_type, color_jitter, color_lighting) - - output1_result = workspace.FetchBlob('output1') - output2_result = workspace.FetchBlob('output2') - - for i in range(count_images): - self.assertEqual(output1_result[i], expected_images[i][2]) - self.assertEqual( - (output2_result[i] - 
expected_images[i][3] > 0).sum(), 0) - # End for - # End validator - run_test( - size_tuple, means, stds, label_type, num_labels, is_test, - scale_jitter_type, color_jitter, color_lighting, dc, - validator, output1, output2_size) - # End test_imageinput - - -if __name__ == '__main__': - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/index_hash_ops_test.py b/caffe2/python/operator_test/index_hash_ops_test.py deleted file mode 100644 index 1eb7ffa20691..000000000000 --- a/caffe2/python/operator_test/index_hash_ops_test.py +++ /dev/null @@ -1,74 +0,0 @@ - - - - - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np - - -class TestIndexHashOps(serial.SerializedTestCase): - @given( - indices=st.sampled_from([ - np.int32, np.int64 - ]).flatmap(lambda dtype: hu.tensor(min_dim=1, max_dim=1, dtype=dtype)), - seed=st.integers(min_value=0, max_value=10), - modulo=st.integers(min_value=100000, max_value=200000), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_index_hash_ops(self, indices, seed, modulo, gc, dc): - def index_hash(indices): - dtype = np.array(indices).dtype - assert dtype == np.int32 or dtype == np.int64 - hashed_indices = [] - for index in indices: - hashed = dtype.type(0xDEADBEEF * seed) - indices_bytes = np.array([index], dtype).view(np.int8) - for b in indices_bytes: - hashed = dtype.type(hashed * 65537 + b) - hashed = (modulo + hashed % modulo) % modulo - hashed_indices.append(hashed) - return [hashed_indices] - - op = core.CreateOperator("IndexHash", - ["indices"], ["hashed_indices"], - seed=seed, modulo=modulo) - - self.assertDeviceChecks(dc, op, [indices], [0]) - self.assertReferenceChecks(gc, op, [indices], index_hash) - - # In-place update - op = core.CreateOperator("IndexHash", - ["indices"], ["indices"], - seed=seed, modulo=modulo) - - self.assertDeviceChecks(dc, op, [indices], [0]) - self.assertReferenceChecks(gc, op, [indices], index_hash) - - def test_shape_and_type_inference(self): - with hu.temp_workspace("shape_type_inf_int64"): - net = core.Net('test_net') - net.ConstantFill( - [], "values", shape=[64], dtype=core.DataType.INT64, - ) - net.IndexHash(['values'], ['values_output']) - (shapes, types) = workspace.InferShapesAndTypes([net], {}) - - self.assertEqual(shapes["values_output"], [64]) - self.assertEqual(types["values_output"], core.DataType.INT64) - - with hu.temp_workspace("shape_type_inf_int32"): - net = core.Net('test_net') - net.ConstantFill( - [], "values", shape=[2, 32], dtype=core.DataType.INT32, - ) - net.IndexHash(['values'], ['values_output']) - (shapes, types) = workspace.InferShapesAndTypes([net], {}) - - self.assertEqual(shapes["values_output"], [2, 32]) - self.assertEqual(types["values_output"], core.DataType.INT32) diff --git a/caffe2/python/operator_test/index_ops_test.py b/caffe2/python/operator_test/index_ops_test.py deleted file mode 100644 index cf99128b3151..000000000000 --- a/caffe2/python/operator_test/index_ops_test.py +++ /dev/null @@ -1,137 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import numpy as np -import tempfile - - -class TestIndexOps(TestCase): - def _test_index_ops(self, entries, dtype, index_create_op): - workspace.RunOperatorOnce(core.CreateOperator( - index_create_op, - [], - ['index'], - max_elements=10)) - my_entries = 
np.array( - [entries[0], entries[1], entries[2]], dtype=dtype) - - workspace.FeedBlob('entries', my_entries) - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexLoad', - ['index', 'entries'], - ['index'])) - query1 = np.array( - [entries[0], entries[3], entries[0], entries[4]], - dtype=dtype) - - workspace.FeedBlob('query1', query1) - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexGet', - ['index', 'query1'], - ['result1'])) - result1 = workspace.FetchBlob('result1') - np.testing.assert_array_equal([1, 4, 1, 5], result1) - - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexFreeze', - ['index'], - ['index'])) - - query2 = np.array( - [entries[5], entries[4], entries[0], entries[6], entries[7]], - dtype=dtype) - workspace.FeedBlob('query2', query2) - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexGet', - ['index', 'query2'], - ['result2'])) - result2 = workspace.FetchBlob('result2') - np.testing.assert_array_equal([0, 5, 1, 0, 0], result2) - - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexSize', - ['index'], - ['index_size'])) - size = workspace.FetchBlob('index_size') - self.assertEqual(size, 6) - - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexStore', - ['index'], - ['stored_entries'])) - stored_actual = workspace.FetchBlob('stored_entries') - new_entries = np.array([entries[3], entries[4]], dtype=dtype) - expected = np.concatenate((my_entries, new_entries)) - if dtype is str: - # we'll always get bytes back from Caffe2 - expected = np.array([ - x.item().encode('utf-8') if isinstance(x, np.str_) else x - for x in expected - ], dtype=object) - np.testing.assert_array_equal(expected, stored_actual) - - workspace.RunOperatorOnce(core.CreateOperator( - index_create_op, - [], - ['index2'])) - - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexLoad', - ['index2', 'stored_entries'], - ['index2'], - skip_first_entry=1)) - - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexSize', - ['index2'], - ['index2_size'])) - index2_size = workspace.FetchBlob('index2_size') - self.assertEqual(index2_size, 5) - - # test serde - with tempfile.NamedTemporaryFile() as tmp: - workspace.RunOperatorOnce(core.CreateOperator( - 'Save', - ['index'], - [], - absolute_path=1, - db_type='minidb', - db=tmp.name)) - # frees up the blob - workspace.FeedBlob('index', np.array([])) - # reloads the index - workspace.RunOperatorOnce(core.CreateOperator( - 'Load', - [], - ['index'], - absolute_path=1, - db_type='minidb', - db=tmp.name)) - query3 = np.array( - [entries[0], entries[3], entries[0], entries[4], entries[4]], - dtype=dtype) - - workspace.FeedBlob('query3', query3) - workspace.RunOperatorOnce(core.CreateOperator( - 'IndexGet', ['index', 'query3'], ['result3'])) - result3 = workspace.FetchBlob('result3') - np.testing.assert_array_equal([1, 4, 1, 5, 5], result3) - - def test_string_index_ops(self): - self._test_index_ops([ - 'entry1', 'entry2', 'entry3', 'new_entry1', - 'new_entry2', 'miss1', 'miss2', 'miss3', - ], str, 'StringIndexCreate') - - def test_int_index_ops(self): - self._test_index_ops(list(range(8)), np.int32, 'IntIndexCreate') - - def test_long_index_ops(self): - self._test_index_ops(list(range(8)), np.int64, 'LongIndexCreate') - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/instance_norm_test.py b/caffe2/python/operator_test/instance_norm_test.py deleted file mode 100644 index d97385cbe215..000000000000 --- a/caffe2/python/operator_test/instance_norm_test.py +++ /dev/null @@ -1,276 +0,0 @@ - - 
- - -import numpy as np -from hypothesis import given, assume, settings -import hypothesis.strategies as st - -from caffe2.python import core, model_helper, brew, utils -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -import unittest - - -class TestInstanceNorm(serial.SerializedTestCase): - - def _get_inputs(self, N, C, H, W, order): - input_data = np.random.rand(N, C, H, W).astype(np.float32) - if order == 'NHWC': - # Allocate in the same order as NCHW and transpose to make sure - # the inputs are identical on freshly-seeded calls. - input_data = utils.NCHW2NHWC(input_data) - elif order != "NCHW": - raise Exception('unknown order type ({})'.format(order)) - - scale_data = np.random.rand(C).astype(np.float32) - bias_data = np.random.rand(C).astype(np.float32) - return input_data, scale_data, bias_data - - def _get_op(self, device_option, store_mean, store_inv_stdev, epsilon, - order, inplace=False): - outputs = ['output' if not inplace else "input"] - if store_mean or store_inv_stdev: - outputs += ['mean'] - if store_inv_stdev: - outputs += ['inv_stdev'] - op = core.CreateOperator( - 'InstanceNorm', - ['input', 'scale', 'bias'], - outputs, - order=order, - epsilon=epsilon, - device_option=device_option) - return op - - def _feed_inputs(self, input_blobs, device_option): - names = ['input', 'scale', 'bias'] - for name, blob in zip(names, input_blobs): - self.ws.create_blob(name).feed(blob, device_option=device_option) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(1, 4), - C=st.integers(1, 4), - H=st.integers(2, 4), - W=st.integers(2, 4), - order=st.sampled_from(['NCHW', 'NHWC']), - epsilon=st.floats(1e-6, 1e-4), - store_mean=st.booleans(), - seed=st.integers(0, 1000), - store_inv_stdev=st.booleans()) - @settings(deadline=10000) - def test_instance_norm_gradients( - self, gc, dc, N, C, H, W, order, store_mean, store_inv_stdev, - epsilon, seed): - np.random.seed(seed) - - # force store_inv_stdev if store_mean to match existing forward pass - # implementation - store_inv_stdev |= store_mean - - op = self._get_op( - device_option=gc, - store_mean=store_mean, - store_inv_stdev=store_inv_stdev, - epsilon=epsilon, - order=order) - - input_data = np.arange(N * C * H * W).astype(np.float32) - np.random.shuffle(input_data) - if order == "NCHW": - input_data = input_data.reshape(N, C, H, W) - else: - input_data = input_data.reshape(N, H, W, C) - scale_data = np.random.randn(C).astype(np.float32) - bias_data = np.random.randn(C).astype(np.float32) - input_blobs = (input_data, scale_data, bias_data) - - output_indices = [0] - # if store_inv_stdev is turned on, store_mean must also be forced on - if store_mean or store_inv_stdev: - output_indices += [1] - if store_inv_stdev: - output_indices += [2] - self.assertDeviceChecks(dc, op, input_blobs, output_indices) - # The gradient only flows from output #0 since the other two only - # store the temporary mean and inv_stdev buffers. 
- # Check dl/dinput - self.assertGradientChecks(gc, op, input_blobs, 0, [0]) - # Check dl/dscale - self.assertGradientChecks(gc, op, input_blobs, 1, [0]) - # Check dl/dbias - self.assertGradientChecks(gc, op, input_blobs, 2, [0]) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - seed=st.integers(0, 1000), - epsilon=st.floats(1e-6, 1e-4), - store_mean=st.booleans(), - store_inv_stdev=st.booleans()) - def test_instance_norm_layout(self, gc, dc, N, C, H, W, store_mean, - store_inv_stdev, epsilon, seed): - # force store_inv_stdev if store_mean to match existing forward pass - # implementation - store_inv_stdev |= store_mean - - outputs = {} - for order in ('NCHW', 'NHWC'): - np.random.seed(seed) - input_blobs = self._get_inputs(N, C, H, W, order) - self._feed_inputs(input_blobs, device_option=gc) - op = self._get_op( - device_option=gc, - store_mean=store_mean, - store_inv_stdev=store_inv_stdev, - epsilon=epsilon, - order=order) - self.ws.run(op) - outputs[order] = self.ws.blobs['output'].fetch() - np.testing.assert_allclose( - outputs['NCHW'], - utils.NHWC2NCHW(outputs["NHWC"]), - atol=1e-4, - rtol=1e-4) - - @serial.given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - order=st.sampled_from(['NCHW', 'NHWC']), - epsilon=st.floats(1e-6, 1e-4), - store_mean=st.booleans(), - seed=st.integers(0, 1000), - store_inv_stdev=st.booleans(), - inplace=st.booleans()) - def test_instance_norm_reference_check( - self, gc, dc, N, C, H, W, order, store_mean, store_inv_stdev, - epsilon, seed, inplace): - np.random.seed(seed) - - # force store_inv_stdev if store_mean to match existing forward pass - # implementation - store_inv_stdev |= store_mean - if order != "NCHW": - assume(not inplace) - - inputs = self._get_inputs(N, C, H, W, order) - op = self._get_op( - device_option=gc, - store_mean=store_mean, - store_inv_stdev=store_inv_stdev, - epsilon=epsilon, - order=order, - inplace=inplace) - - def ref(input_blob, scale_blob, bias_blob): - if order == 'NHWC': - input_blob = utils.NHWC2NCHW(input_blob) - - mean_blob = input_blob.reshape((N, C, -1)).mean(axis=2) - inv_stdev_blob = 1.0 / \ - np.sqrt(input_blob.reshape((N, C, -1)).var(axis=2) + epsilon) - # _bc indicates blobs that are reshaped for broadcast - scale_bc = scale_blob[np.newaxis, :, np.newaxis, np.newaxis] - mean_bc = mean_blob[:, :, np.newaxis, np.newaxis] - inv_stdev_bc = inv_stdev_blob[:, :, np.newaxis, np.newaxis] - bias_bc = bias_blob[np.newaxis, :, np.newaxis, np.newaxis] - normalized_blob = scale_bc * (input_blob - mean_bc) * inv_stdev_bc \ - + bias_bc - - if order == 'NHWC': - normalized_blob = utils.NCHW2NHWC(normalized_blob) - - if not store_mean and not store_inv_stdev: - return normalized_blob, - elif not store_inv_stdev: - return normalized_blob, mean_blob - else: - return normalized_blob, mean_blob, inv_stdev_blob - - self.assertReferenceChecks(gc, op, inputs, ref) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - order=st.sampled_from(['NCHW', 'NHWC']), - epsilon=st.floats(1e-6, 1e-4), - store_mean=st.booleans(), - seed=st.integers(0, 1000), - store_inv_stdev=st.booleans()) - def test_instance_norm_device_check( - self, gc, dc, N, C, H, W, order, store_mean, store_inv_stdev, - epsilon, seed): - np.random.seed(seed) - - # force store_inv_stdev if store_mean to match 
existing forward pass - # implementation - store_inv_stdev |= store_mean - - inputs = self._get_inputs(N, C, H, W, order) - op = self._get_op( - device_option=gc, - store_mean=store_mean, - store_inv_stdev=store_inv_stdev, - epsilon=epsilon, - order=order) - - self.assertDeviceChecks(dc, op, inputs, [0]) - - @given(is_test=st.booleans(), - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - order=st.sampled_from(['NCHW', 'NHWC']), - epsilon=st.floats(1e-6, 1e-4), - seed=st.integers(0, 1000)) - def test_instance_norm_model_helper( - self, N, C, H, W, order, epsilon, seed, is_test): - np.random.seed(seed) - model = model_helper.ModelHelper(name="test_model") - brew.instance_norm( - model, - 'input', - 'output', - C, - epsilon=epsilon, - order=order, - is_test=is_test) - - input_blob = np.random.rand(N, C, H, W).astype(np.float32) - if order == 'NHWC': - input_blob = utils.NCHW2NHWC(input_blob) - - self.ws.create_blob('input').feed(input_blob) - - self.ws.create_net(model.param_init_net).run() - self.ws.create_net(model.net).run() - - if is_test: - scale = self.ws.blobs['output_s'].fetch() - assert scale is not None - assert scale.shape == (C, ) - bias = self.ws.blobs['output_b'].fetch() - assert bias is not None - assert bias.shape == (C, ) - - output_blob = self.ws.blobs['output'].fetch() - if order == 'NHWC': - output_blob = utils.NHWC2NCHW(output_blob) - - assert output_blob.shape == (N, C, H, W) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/operator_test/integral_image_ops_test.py b/caffe2/python/operator_test/integral_image_ops_test.py deleted file mode 100644 index 79d79ae6de21..000000000000 --- a/caffe2/python/operator_test/integral_image_ops_test.py +++ /dev/null @@ -1,94 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -from hypothesis import given, settings -import numpy as np - - -class TestIntegralImageOps(serial.SerializedTestCase): - @given(batch_size=st.integers(1, 3), - height=st.integers(7, 10), - width=st.integers(7, 10), - channels=st.integers(1, 8), - **hu.gcs) - @settings(deadline=10000) - def test_integral_image_ops(self, batch_size, height, width, channels, gc, dc): - N = batch_size - C = channels - H = height - W = width - - im = np.random.rand(N, C, H, W).astype(np.float32) - op = core.CreateOperator("IntegralImage", - ["im"], ["y"]) - - def integral_image(im): - y = np.random.rand(N, C, H + 1, W + 1).astype(np.float32) - for i1 in range(N): - for i2 in range(C): - for i3 in range(W + 1): - y[i1, i2, 0, i3] = 0 - for i3 in range(H + 1): - y[i1, i2, i3, 0] = 0 - for i3 in range(1, H + 1): - for i4 in range(1, W + 1): - y[i1, i2, i3, i4] = im[i1, i2, i3 - 1, i4 - 1] + \ - y[i1, i2, i3 - 1, i4] + \ - y[i1, i2, i3, i4 - 1] - \ - y[i1, i2, i3 - 1, i4 - 1] - - return [y] - - self.assertDeviceChecks(dc, op, [im], [0]) - self.assertReferenceChecks(gc, op, [im], integral_image) - - @given(batch_size=st.integers(1, 3), - height=st.integers(7, 10), - width=st.integers(7, 10), - channels=st.integers(1, 8), - **hu.gcs) - @settings(deadline=10000) - def test_integral_image_gradient_ops(self, batch_size, height, width, - channels, gc, dc): - N = batch_size - C = channels - H = height - W = width - - X = np.random.rand(N, C, H, W).astype(np.float32) - dY = np.random.rand(N, C, H + 1, W + 1).astype(np.float32) - op = core.CreateOperator( - 
"IntegralImageGradient", - ["X", "dY"], - ["dX"]) - - def integral_image_gradient(X, dY): - dX = np.random.rand(N, C, H, W).astype(np.float32) - dX1 = np.random.rand(N, C, H + 1, W).astype(np.float32) - #H+1,W+1=>H+1, W - for i1 in range(N): - for i2 in range(C): - for i3 in range(H + 1): - dX1[i1, i2, i3, 0] = dY[i1, i2, i3, 0] - for i4 in range(1, W): - dX1[i1, i2, i3, i4] = dY[i1, i2, i3, i4] + \ - dX1[i1, i2, i3, i4 - 1] - - #H+1,W=>H,W - for i1 in range(N): - for i2 in range(C): - for i3 in range(W): - dX[i1, i2, 0, i3] = dX1[i1, i2, 0, i3] - for i4 in range(1, H): - dX[i1, i2, i4, i3] = dX1[i1, i2, i4, i3] + \ - dX[i1, i2, i4 - 1, i3] - return [dX] - - self.assertDeviceChecks(dc, op, [X, dY], [0]) - self.assertReferenceChecks(gc, op, [X, dY], integral_image_gradient) diff --git a/caffe2/python/operator_test/jsd_ops_test.py b/caffe2/python/operator_test/jsd_ops_test.py deleted file mode 100644 index f205d8e650b2..000000000000 --- a/caffe2/python/operator_test/jsd_ops_test.py +++ /dev/null @@ -1,41 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -def entropy(p): - q = 1. - p - return -p * np.log(p) - q * np.log(q) - - -def jsd(p, q): - return [entropy(p / 2. + q / 2.) - entropy(p) / 2. - entropy(q) / 2.] - - -def jsd_grad(go, o, pq_list): - p, q = pq_list - m = (p + q) / 2. - return [np.log(p * (1 - m) / (1 - p) / m) / 2. * go, None] - - -class TestJSDOps(serial.SerializedTestCase): - @serial.given(n=st.integers(10, 100), **hu.gcs_cpu_only) - def test_bernoulli_jsd(self, n, gc, dc): - p = np.random.rand(n).astype(np.float32) - q = np.random.rand(n).astype(np.float32) - op = core.CreateOperator("BernoulliJSD", ["p", "q"], ["l"]) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[p, q], - reference=jsd, - output_to_grad='l', - grad_reference=jsd_grad, - ) diff --git a/caffe2/python/operator_test/key_split_ops_test.py b/caffe2/python/operator_test/key_split_ops_test.py deleted file mode 100644 index 18fddff58d17..000000000000 --- a/caffe2/python/operator_test/key_split_ops_test.py +++ /dev/null @@ -1,45 +0,0 @@ - - - - - -import hypothesis.strategies as st - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu - -import numpy as np - - -class TestKeySplitOps(hu.HypothesisTestCase): - @given( - X=hu.arrays( - dims=[1000], - dtype=np.int64, - elements=st.integers(min_value=0, max_value=100) - ), - **hu.gcs_cpu_only - ) - def test_key_split_op(self, X, gc, dc): - categorical_limit = max(X) + 1 - workspace.ResetWorkspace() - workspace.FeedBlob('X', X) - output_blobs = ['Y_%d' % i for i in range(categorical_limit)] - op = core.CreateOperator( - 'KeySplit', ['X'], - output_blobs, - categorical_limit=categorical_limit - ) - workspace.RunOperatorOnce(op) - output_vecs = [ - workspace.blobs[output_blobs[i]] for i in range(categorical_limit) - ] - expected_output_vecs = [[] for _ in range(categorical_limit)] - for i, x in enumerate(X): - expected_output_vecs[x].append(i) - for i in range(categorical_limit): - np.testing.assert_array_equal( - output_vecs[i], - np.array(expected_output_vecs[i], dtype=np.int32) - ) diff --git a/caffe2/python/operator_test/lars_test.py b/caffe2/python/operator_test/lars_test.py deleted file mode 100644 index 6f976520e06b..000000000000 --- a/caffe2/python/operator_test/lars_test.py +++ /dev/null @@ -1,45 
+0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - - -class TestLars(hu.HypothesisTestCase): - - @given(offset=st.floats(min_value=0, max_value=100), - lr_min=st.floats(min_value=1e-8, max_value=1e-6), - **hu.gcs) - def test_lars(self, offset, lr_min, dc, gc): - X = np.random.rand(6, 7, 8, 9).astype(np.float32) - dX = np.random.rand(6, 7, 8, 9).astype(np.float32) - wd = np.array([1e-4]).astype(np.float32) - trust = np.random.rand(1).astype(np.float32) - lr_max = np.random.rand(1).astype(np.float32) - - def ref_lars(X, dX, wd, trust, lr_max): - rescale_factor = \ - trust / (np.linalg.norm(dX) / np.linalg.norm(X) + wd + offset) - rescale_factor = np.minimum(rescale_factor, lr_max) - rescale_factor = np.maximum(rescale_factor, lr_min) - return [rescale_factor] - - op = core.CreateOperator( - "Lars", - ["X", "dX", "wd", "trust", "lr_max"], - ["rescale_factor"], - offset=offset, - lr_min=lr_min, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, dX, wd, trust, lr_max], - reference=ref_lars - ) diff --git a/caffe2/python/operator_test/layer_norm_op_test.py b/caffe2/python/operator_test/layer_norm_op_test.py deleted file mode 100644 index 31ba78be0c19..000000000000 --- a/caffe2/python/operator_test/layer_norm_op_test.py +++ /dev/null @@ -1,409 +0,0 @@ - - - - - -from caffe2.python import brew, core, workspace -from caffe2.python.model_helper import ModelHelper -from functools import partial -from hypothesis import given, settings -from typing import Optional, Tuple - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st - -import numpy as np -import torch - -import unittest - -from ._utils import assert_allclose - - -def _layer_norm_ref(axis, epsilon, X): - left = int(np.prod(X.shape[:axis])) - reshaped = np.reshape(X, [left, -1]) - mean = np.mean(reshaped, axis=1).reshape([left, 1]) - std = np.sqrt(np.mean(np.square(reshaped), axis=1).reshape( - [left, 1]) - np.square(mean) + epsilon) - Y = (reshaped - mean) / (std) - Y = np.reshape(Y, X.shape) - mean = np.reshape(mean, X.shape[:axis] + (1,)) - std = np.reshape(std, X.shape[:axis] + (1,)) - return (Y, mean, std) - - -def _layer_norm_with_affine_ref(axis, epsilon, X, gamma, beta): - Y, mean, std = _layer_norm_ref(axis, epsilon, X) - Y = Y * gamma + beta - return (Y, mean, std) - - -def _layer_norm_grad_ref(axis, gout_full, norm, mean_full, stdev_full, X_full): - left = int(np.prod(X_full.shape[:axis])) - right = int(np.prod(X_full.shape[axis:])) - X = np.reshape(X_full, [left, right]) - stdev = np.reshape(stdev_full, [left, 1]) - mean = np.reshape(mean_full, [left, 1]) - gout = np.reshape(gout_full, [left, right]) - dstdev_end = (-1.0) / np.power(stdev, 2.0) \ - * np.sum((X - mean) * gout, axis=1).reshape([left, 1]) - dmean_end = np.sum(-1.0 / stdev * gout, axis=1).reshape([left, 1]) - dx_end = 1.0 / stdev * gout - - # stdev block - dmean_stdev = -1.0 * mean / stdev * dstdev_end - dx_stdev = X / (right * stdev) * dstdev_end - - # mean block - dmean = dmean_end + dmean_stdev - dxmean = (1.0 / right) * dmean - - # final outputs - dx = dx_end + dx_stdev + dxmean - dx = dx.reshape(X_full.shape) - - return [dx] - - -class TestLayerNormOp(serial.SerializedTestCase): - @given(X=hu.tensor(min_dim=2), **hu.gcs) - @settings(deadline=10000) - def test_layer_norm_grad_op(self, X, gc, dc): - axis = 
np.random.randint(0, len(X.shape)) - epsilon = 1e-4 - op = core.CreateOperator( - "LayerNormGradient", - ["gout", "out", "mean", "stdev", "in"], - ["gin"], - axis=axis, - epsilon=epsilon, - ) - - norm, mean, stdev = _layer_norm_ref(axis, epsilon, X) - gout = norm - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[gout, norm, mean, stdev, X], - reference=partial(_layer_norm_grad_ref, axis) - ) - self.assertDeviceChecks( - device_options=dc, - op=op, - inputs=[gout, norm, mean, stdev, X], - outputs_to_check=[0], - ) - - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - def test_layer_norm_op(self, X, eps, elementwise_affine, gc, dc): - axis = np.random.randint(0, len(X.shape)) - - op = core.CreateOperator( - "LayerNorm", - ["X", "gamma", "beta"] if elementwise_affine else ["X"], - ["Y", "mean", "std"], - axis=axis, - epsilon=eps, - elementwise_affine=elementwise_affine, - ) - - if elementwise_affine: - ref = partial(_layer_norm_with_affine_ref, axis, eps) - else: - ref = partial(_layer_norm_ref, axis, eps) - - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - inputs = [X, gamma, beta] - else: - inputs = [X] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref, - ) - self.assertDeviceChecks( - device_options=dc, - op=op, - inputs=inputs, - outputs_to_check=[0, 1, 2], - ) - - @given(M=st.integers(1, 10), - N=st.integers(10, 20), - axis=st.integers(0, 1), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_layer_norm_grad( - self, M, N, axis, eps, elementwise_affine, gc, dc): - op = core.CreateOperator( - "LayerNorm", - ["X", "gamma", "beta"] if elementwise_affine else ["X"], - ["Y", "mean", "std"], - axis=axis, - epsilon=eps, - elementwise_affine=elementwise_affine, - ) - - X = np.arange(M * N).astype(np.float32) - np.random.shuffle(X) - X = X.reshape((M, N)) - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - inputs = [X, gamma, beta] - else: - inputs = [X] - - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - @unittest.skipIf(workspace.has_hip_support, - "Operator cross-calling doesn't work with hip yet") - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_layer_norm_op_c10(self, X, eps, elementwise_affine, gc, dc): - axis = np.random.randint(0, len(X.shape)) - - op = core.CreateOperator( - "C10LayerNorm_DontUseThisOpYet", - ["X", "gamma", "beta"] if elementwise_affine else ["X"], - ["Y", "mean", "std"], - axis=axis, - epsilon=eps, - elementwise_affine=elementwise_affine, - ) - - if elementwise_affine: - ref = partial(_layer_norm_with_affine_ref, axis, eps) - else: - ref = partial(_layer_norm_ref, axis, eps) - - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - inputs = [X, gamma, beta] - else: - inputs = [X] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref, - ) - self.assertDeviceChecks( - device_options=dc, - op=op, - inputs=inputs, - outputs_to_check=[0, 1, 2], - ) - - @unittest.skipIf(workspace.has_hip_support, - "Operator cross-calling doesn't 
work with hip yet") - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - def test_layer_norm_op_c10_preallocated_outputs( - self, X, eps, elementwise_affine, gc, dc): - # This test case ensures that it works correctly when output tensors are - # preallocated. - axis = np.random.randint(0, len(X.shape)) - - self.ws.create_blob("X").feed(X) - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - self.ws.create_blob("gamma").feed(gamma) - self.ws.create_blob("beta").feed(beta) - - m = ModelHelper(name="test") - m.net.C10LayerNorm_DontUseThisOpYet( - ["X", "gamma", "beta"] if elementwise_affine else ["X"], - ["Y", "mean", "std"], - axis=axis, - epsilon=eps, - elementwise_affine=elementwise_affine, - ) - self.ws.create_net(m.param_init_net).run() - net = self.ws.create_net(m.net) - # run two times to be extra sure that the outputs are preallocated - net.run() - net.run() - - if elementwise_affine: - expected_norm, expected_mean, expected_std = \ - _layer_norm_with_affine_ref(axis, eps, X, gamma, beta) - else: - expected_norm, expected_mean, expected_std = _layer_norm_ref( - axis, eps, X) - actual_norm = self.ws.fetch_blob('Y') - actual_mean = self.ws.fetch_blob('mean') - actual_std = self.ws.fetch_blob('std') - - assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4) - assert_allclose(expected_mean, actual_mean) - assert_allclose(expected_std, actual_std) - - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - def test_layer_norm_op_pytorch(self, X, eps, elementwise_affine, gc, dc): - axis = np.random.randint(0, len(X.shape)) - - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - expected_norm, expected_mean, expected_std = \ - _layer_norm_with_affine_ref(axis, eps, X, gamma, beta) - actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm( - torch.tensor(X), torch.tensor(gamma), torch.tensor(beta), - axis, eps, True) - else: - expected_norm, expected_mean, expected_std = _layer_norm_ref( - axis, eps, X) - actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm( - torch.tensor(X), None, None, axis, eps) - - assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4) - assert_allclose(expected_mean, actual_mean) - assert_allclose(expected_std, actual_std) - - # Test case is using workspace.has_cuda_support and not - # workspace.has_gpu_support to exclude it from HIP because tensor interop - # doesn't work for HIP tensors yet - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans()) - def test_layer_norm_op_pytorch_cuda(self, X, eps, elementwise_affine): - axis = np.random.randint(0, len(X.shape)) - - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - expected_norm, expected_mean, expected_std = \ - _layer_norm_with_affine_ref(axis, eps, X, gamma, beta) - actual_norm, actual_mean, actual_std = torch.ops._caffe2.LayerNorm( - torch.tensor(X).cuda(), - torch.tensor(gamma).cuda(), - torch.tensor(beta).cuda(), - axis, - eps, - True) - else: - expected_norm, expected_mean, expected_std = _layer_norm_ref( - axis, eps, X) - actual_norm, actual_mean, actual_std = 
torch.ops._caffe2.LayerNorm( - torch.tensor(X).cuda(), None, None, axis, eps) - - assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4) - assert_allclose(expected_mean, actual_mean) - assert_allclose(expected_std, actual_std) - - @given(X=hu.tensor(min_dim=2), - eps=st.floats(1e-5, 1e-3), - elementwise_affine=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_layer_norm_op_jit(self, X, eps, elementwise_affine, gc, dc): - @torch.jit.script - def jit_layer_norm( - X: torch.Tensor, - gamma: Optional[torch.Tensor] = None, - beta: Optional[torch.Tensor] = None, - axis: int = 1, - eps: float = 1e-5, - elementwise_affine: bool = False, - ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - return torch.ops._caffe2.LayerNorm( - X, gamma, beta, axis, eps, elementwise_affine) - - axis = np.random.randint(0, len(X.shape)) - - if elementwise_affine: - gamma = np.random.randn(*X.shape[axis:]).astype(np.float32) - beta = np.random.randn(*X.shape[axis:]).astype(np.float32) - expected_norm, expected_mean, expected_std = \ - _layer_norm_with_affine_ref(axis, eps, X, gamma, beta) - actual_norm, actual_mean, actual_std = jit_layer_norm( - torch.tensor(X), torch.tensor(gamma), torch.tensor(beta), - axis, eps, elementwise_affine) - else: - expected_norm, expected_mean, expected_std = _layer_norm_ref( - axis, eps, X) - actual_norm, actual_mean, actual_std = jit_layer_norm( - torch.tensor(X), None, None, axis, eps, elementwise_affine) - - assert_allclose(expected_norm, actual_norm, rtol=1e-4, atol=1e-4) - assert_allclose(expected_mean, actual_mean) - assert_allclose(expected_std, actual_std) - - @given(X=hu.tensor(min_dim=2), **hu.gcs) - def test_layer_norm_brew_wrapper(self, X, gc, dc): - axis = np.random.randint(0, len(X.shape)) - scale_dim = [1] * np.ndim(X) - scale_dim[axis] = X.shape[axis] - - self.ws.create_blob('input').feed(X) - - model = ModelHelper(name='test_layer_norm_brew_wrapper') - brew.layer_norm( - model, - 'input', - 'output', - dim_in=X.shape[axis:], - axis=axis, - epsilon=1e-4, - ) - - self.ws.create_net(model.param_init_net).run() - self.ws.create_net(model.net).run() - - @given(N=st.integers(1, 10), elementwise_affine=st.booleans(), **hu.gcs) - @settings(deadline=None) - def test_layer_norm_with_empty_batch(self, N, elementwise_affine, gc, dc): - X = np.random.randn(0, N).astype(np.float32) - gamma = np.random.rand(N).astype(np.float32) - beta = np.random.rand(N).astype(np.float32) - - op = core.CreateOperator( - "LayerNorm", - ["X", "gamma", "beta"] if elementwise_affine else ["X"], - ["Y", "mean", "sigma"], - elementwise_affine=elementwise_affine, - ) - - def ref(X, gamma=None, beta=None): - Y = np.zeros_like(X) - axis = 1 - mean = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) - sigma = np.zeros(X.shape[:axis] + (1,), dtype=X.dtype) - return Y, mean, sigma - - - inputs = [X, gamma, beta] if elementwise_affine else [X] - self.assertReferenceChecks(gc, op, inputs, ref) - self.assertDeviceChecks(dc, op, inputs, [0, 1]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/leaky_relu_test.py b/caffe2/python/operator_test/leaky_relu_test.py deleted file mode 100644 index 9a888cac7901..000000000000 --- a/caffe2/python/operator_test/leaky_relu_test.py +++ /dev/null @@ -1,177 +0,0 @@ - - - - -import numpy as np -from hypothesis import given, assume -import hypothesis.strategies as st - -from caffe2.python import core, model_helper, utils -import 
caffe2.python.hypothesis_test_util as hu - - -class TestLeakyRelu(hu.HypothesisTestCase): - - def _get_inputs(self, N, C, H, W, order): - input_data = np.random.rand(N, C, H, W).astype(np.float32) - 0.5 - - # default step size is 0.05 - input_data[np.logical_and( - input_data >= 0, input_data <= 0.051)] = 0.051 - input_data[np.logical_and( - input_data <= 0, input_data >= -0.051)] = -0.051 - - if order == 'NHWC': - input_data = utils.NCHW2NHWC(input_data) - - return input_data, - - def _get_op(self, device_option, alpha, order, inplace=False): - outputs = ['output' if not inplace else "input"] - op = core.CreateOperator( - 'LeakyRelu', - ['input'], - outputs, - alpha=alpha, - device_option=device_option) - return op - - def _feed_inputs(self, input_blobs, device_option): - names = ['input', 'scale', 'bias'] - for name, blob in zip(names, input_blobs): - self.ws.create_blob(name).feed(blob, device_option=device_option) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 3), - C=st.integers(2, 3), - H=st.integers(2, 3), - W=st.integers(2, 3), - alpha=st.floats(0, 1), - order=st.sampled_from(['NCHW', 'NHWC']), - seed=st.integers(0, 1000)) - def test_leaky_relu_gradients(self, gc, dc, N, C, H, W, order, alpha, seed): - np.random.seed(seed) - - op = self._get_op( - device_option=gc, - alpha=alpha, - order=order) - input_blobs = self._get_inputs(N, C, H, W, order) - - self.assertDeviceChecks(dc, op, input_blobs, [0]) - self.assertGradientChecks(gc, op, input_blobs, 0, [0]) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000)) - def test_leaky_relu_layout(self, gc, dc, N, C, H, W, alpha, seed): - outputs = {} - for order in ('NCHW', 'NHWC'): - np.random.seed(seed) - input_blobs = self._get_inputs(N, C, H, W, order) - self._feed_inputs(input_blobs, device_option=gc) - op = self._get_op( - device_option=gc, - alpha=alpha, - order=order) - self.ws.run(op) - outputs[order] = self.ws.blobs['output'].fetch() - np.testing.assert_allclose( - outputs['NCHW'], - utils.NHWC2NCHW(outputs["NHWC"]), - atol=1e-4, - rtol=1e-4) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - order=st.sampled_from(['NCHW', 'NHWC']), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000), - inplace=st.booleans()) - def test_leaky_relu_reference_check(self, gc, dc, N, C, H, W, order, alpha, - seed, inplace): - np.random.seed(seed) - - if order != "NCHW": - assume(not inplace) - - inputs = self._get_inputs(N, C, H, W, order) - op = self._get_op( - device_option=gc, - alpha=alpha, - order=order, - inplace=inplace) - - def ref(input_blob): - result = input_blob.copy() - result[result < 0] *= alpha - return result, - - self.assertReferenceChecks(gc, op, inputs, ref) - - @given(gc=hu.gcs['gc'], - dc=hu.gcs['dc'], - N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - order=st.sampled_from(['NCHW', 'NHWC']), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000)) - def test_leaky_relu_device_check(self, gc, dc, N, C, H, W, order, alpha, - seed): - np.random.seed(seed) - - inputs = self._get_inputs(N, C, H, W, order) - op = self._get_op( - device_option=gc, - alpha=alpha, - order=order) - - self.assertDeviceChecks(dc, op, inputs, [0]) - - @given(N=st.integers(2, 10), - C=st.integers(3, 10), - H=st.integers(5, 10), - W=st.integers(7, 10), - 
order=st.sampled_from(['NCHW', 'NHWC']), - alpha=st.floats(0, 1), - seed=st.integers(0, 1000)) - def test_leaky_relu_model_helper_helper(self, N, C, H, W, order, alpha, seed): - np.random.seed(seed) - arg_scope = {'order': order} - model = model_helper.ModelHelper(name="test_model", arg_scope=arg_scope) - model.LeakyRelu( - 'input', - 'output', - alpha=alpha) - - input_blob = np.random.rand(N, C, H, W).astype(np.float32) - if order == 'NHWC': - input_blob = utils.NCHW2NHWC(input_blob) - - self.ws.create_blob('input').feed(input_blob) - - self.ws.create_net(model.param_init_net).run() - self.ws.create_net(model.net).run() - - output_blob = self.ws.blobs['output'].fetch() - if order == 'NHWC': - output_blob = utils.NHWC2NCHW(output_blob) - - assert output_blob.shape == (N, C, H, W) - - -if __name__ == '__main__': - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/learning_rate_adaption_op_test.py b/caffe2/python/operator_test/learning_rate_adaption_op_test.py deleted file mode 100644 index 1891171b80d8..000000000000 --- a/caffe2/python/operator_test/learning_rate_adaption_op_test.py +++ /dev/null @@ -1,81 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -class TestLearningRateAdaption(serial.SerializedTestCase): - @given(inputs=hu.tensors(n=2), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr_alpha=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - @settings(deadline=None, max_examples=50) - def test_learning_rate_adaption_op_normalization(self, inputs, lr, lr_alpha, - gc, dc): - grad, effgrad = inputs - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - 'LearningRateAdaption', - ['lr', 'grad', 'effgrad'], - ['output_lr'], - lr_alpha=lr_alpha) - - def ref(lr, grad, effgrad): - flattened_grad = grad.flatten() - flattened_effgrad = effgrad.flatten() - x = np.dot(flattened_grad, flattened_effgrad) - kEps = 1e-12 - y = np.linalg.norm(flattened_grad, ord=2) - y = np.maximum(y, kEps) - z = np.linalg.norm(flattened_effgrad, ord=2) - z = np.maximum(z, kEps) - output_lr = lr - output_lr[0] -= lr[0] * lr_alpha * float(x / (y * z)) - return output_lr, - - self.assertReferenceChecks( - gc, op, - [lr, grad, effgrad], - ref) - - @given(inputs=hu.tensors(n=2), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr_alpha=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - def test_learning_rate_adaption_op_without_normalization(self, inputs, lr, - lr_alpha, gc, dc): - grad, effgrad = inputs - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - 'LearningRateAdaption', - ['lr', 'grad', 'effgrad'], - ['output_lr'], - lr_alpha=lr_alpha, - normalized_lr_adaption=False) - - def ref(lr, grad, effgrad): - flattened_grad = grad.flatten() - flattened_effgrad = effgrad.flatten() - x = np.dot(flattened_grad, flattened_effgrad) - output_lr = lr - output_lr[0] -= lr_alpha * x - return output_lr, - - self.assertReferenceChecks( - gc, op, - [lr, grad, effgrad], - ref) diff --git a/caffe2/python/operator_test/learning_rate_op_test.py b/caffe2/python/operator_test/learning_rate_op_test.py deleted file mode 100644 index 8d17c0c7ef08..000000000000 --- 
a/caffe2/python/operator_test/learning_rate_op_test.py +++ /dev/null @@ -1,261 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st - -import copy -from functools import partial -import math -import numpy as np - - -class TestLearningRate(serial.SerializedTestCase): - @given(**hu.gcs_cpu_only) - @settings(deadline=None, max_examples=50) - def test_alter_learning_rate_op(self, gc, dc): - iter = np.random.randint(low=1, high=1e5, size=1) - active_period = int(np.random.randint(low=1, high=1e3, size=1)) - inactive_period = int(np.random.randint(low=1, high=1e3, size=1)) - base_lr = float(np.random.random(1)) - - def ref(iter): - iter = float(iter) - reminder = iter % (active_period + inactive_period) - if reminder < active_period: - return (np.array(base_lr), ) - else: - return (np.array(0.), ) - - op = core.CreateOperator( - 'LearningRate', - 'iter', - 'lr', - policy="alter", - active_first=True, - base_lr=base_lr, - active_period=active_period, - inactive_period=inactive_period - ) - - self.assertReferenceChecks(gc, op, [iter], ref) - - @given(**hu.gcs_cpu_only) - def test_hill_learning_rate_op(self, gc, dc): - iter = np.random.randint(low=1, high=1e5, size=1) - - num_iter = int(np.random.randint(low=1e2, high=1e8, size=1)) - start_multiplier = 1e-4 - gamma = 1.0 - power = 0.5 - end_multiplier = 1e-2 - base_lr = float(np.random.random(1)) - - def ref(iter): - iter = float(iter) - if iter < num_iter: - lr = start_multiplier + ( - 1.0 - start_multiplier - ) * iter / num_iter - else: - iter -= num_iter - lr = math.pow(1.0 + gamma * iter, -power) - lr = max(lr, end_multiplier) - return (np.array(base_lr * lr), ) - - op = core.CreateOperator( - 'LearningRate', - 'data', - 'out', - policy="hill", - base_lr=base_lr, - num_iter=num_iter, - start_multiplier=start_multiplier, - gamma=gamma, - power=power, - end_multiplier=end_multiplier, - ) - self.assertReferenceChecks(gc, op, [iter], ref) - - @given(**hu.gcs_cpu_only) - def test_slope_learning_rate_op(self, gc, dc): - iter = np.random.randint(low=1, high=1e5, size=1) - - num_iter_1 = int(np.random.randint(low=1e2, high=1e3, size=1)) - multiplier_1 = 1.0 - num_iter_2 = num_iter_1 + int(np.random.randint(low=1e2, high=1e3, size=1)) - multiplier_2 = 0.5 - base_lr = float(np.random.random(1)) - - def ref(iter): - iter = float(iter) - if iter < num_iter_1: - lr = multiplier_1 - else: - lr = max( - multiplier_1 + (iter - num_iter_1) * (multiplier_2 - multiplier_1) / (num_iter_2 - num_iter_1), - multiplier_2 - ) - return (np.array(base_lr * lr), ) - - op = core.CreateOperator( - 'LearningRate', - 'data', - 'out', - policy="slope", - base_lr=base_lr, - num_iter_1=num_iter_1, - multiplier_1=multiplier_1, - num_iter_2=num_iter_2, - multiplier_2=multiplier_2, - ) - self.assertReferenceChecks(gc, op, [iter], ref) - - @given( - **hu.gcs_cpu_only - ) - @settings(max_examples=10) - def test_gate_learningrate(self, gc, dc): - iter = np.random.randint(low=1, high=1e5, size=1) - num_iter = int(np.random.randint(low=1e2, high=1e3, size=1)) - base_lr = float(np.random.uniform(-1, 1)) - multiplier_1 = float(np.random.uniform(-1, 1)) - multiplier_2 = float(np.random.uniform(-1, 1)) - - def ref(iter): - iter = float(iter) - if iter < num_iter: - return (np.array(multiplier_1 * base_lr), ) - else: - return (np.array(multiplier_2 * base_lr), ) - - op = core.CreateOperator( - 
'LearningRate', - 'data', - 'out', - policy="gate", - num_iter=num_iter, - multiplier_1=multiplier_1, - multiplier_2=multiplier_2, - base_lr=base_lr, - ) - - self.assertReferenceChecks(gc, op, [iter], ref) - - @given( - gc=hu.gcs['gc'], - min_num_iter=st.integers(min_value=10, max_value=20), - max_num_iter=st.integers(min_value=50, max_value=100), - ) - @settings(max_examples=2, deadline=None) - def test_composite_learning_rate_op(self, gc, min_num_iter, max_num_iter): - np.random.seed(65535) - # Generate the iteration numbers for sub policy - # The four sub policies are as follows: - # 1. exp; 2. step; 3. fix; 4. exp - num_lr_policy = 4 - iter_nums = np.random.randint( - low=min_num_iter, high=max_num_iter, size=num_lr_policy) - accu_iter_num = copy.deepcopy(iter_nums) - for i in range(1, num_lr_policy): - accu_iter_num[i] += accu_iter_num[i - 1] - total_iter_nums = accu_iter_num[-1] - - policy_lr_scale = np.random.uniform(low=0.1, high=2.0, size=num_lr_policy) - - # args for StepLRPolicy - step_size = np.random.randint(low=2, high=min_num_iter // 2) - step_gamma = np.random.random() - # args for ExpLRPolicy - exp_gamma = np.random.random() - # common args - base_lr = 0.1 - - # StepLRPolicy - def step_lr(iter, lr_scale): - return math.pow(step_gamma, iter // step_size) * lr_scale - - # ExpLRPolicy - def exp_lr(iter, lr_scale): - return math.pow(exp_gamma, iter) * lr_scale - - # FixedLRPolicy - def fixed_lr(iter, lr_scale): - return lr_scale - - # test one sub policy case - def one_policy_check_ref(iter, lr_scale): - iter = int(iter) - exp_lr_val = exp_lr(iter, lr_scale=lr_scale) - return (np.array(base_lr * exp_lr_val), ) - - op = core.CreateOperator( - 'LearningRate', - 'data', - 'out', - policy='composite', - sub_policy_num_iters=iter_nums[:1], - sub_policy_0_lr_scale=policy_lr_scale[0], - sub_policy_0_policy='exp', - sub_policy_0_gamma=exp_gamma, - base_lr=base_lr, - ) - for iter_idx in range(1, total_iter_nums + 1): - self.assertReferenceChecks( - gc, op, [np.asarray([iter_idx])], - partial(one_policy_check_ref, lr_scale=policy_lr_scale[0])) - - # all the case with all four sub policies - def all_sub_policy_check_ref(iter, lr_scale): - assert iter <= accu_iter_num[3] - if iter <= accu_iter_num[0]: - lr = exp_lr(iter, lr_scale=lr_scale) - elif iter <= accu_iter_num[1]: - lr = step_lr(iter, lr_scale=lr_scale) - elif iter <= accu_iter_num[2]: - lr = fixed_lr(iter, lr_scale=lr_scale) - else: - lr = exp_lr(iter, lr_scale=lr_scale) - return (np.array(base_lr * lr), ) - - op = core.CreateOperator( - 'LearningRate', - 'data', - 'out', - policy='composite', - sub_policy_num_iters=iter_nums, - sub_policy_0_policy='exp', - sub_policy_0_lr_scale=policy_lr_scale[0], - sub_policy_0_gamma=exp_gamma, - sub_policy_1_policy='step', - sub_policy_1_lr_scale=policy_lr_scale[1], - sub_policy_1_stepsize=step_size, - sub_policy_1_gamma=step_gamma, - sub_policy_2_policy='fixed', - sub_policy_2_lr_scale=policy_lr_scale[2], - sub_policy_3_policy='exp', - sub_policy_3_gamma=exp_gamma, - sub_policy_3_lr_scale=policy_lr_scale[3], - base_lr=base_lr, - ) - - iter_policy = 0 - for iter_idx in range(1, total_iter_nums + 1): - if iter_idx > accu_iter_num[iter_policy]: - iter_policy += 1 - self.assertReferenceChecks( - gc, op, [np.asarray([iter_idx])], - partial(all_sub_policy_check_ref, - lr_scale=policy_lr_scale[iter_policy]) - ) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/length_split_op_test.py b/caffe2/python/operator_test/length_split_op_test.py 
deleted file mode 100644 index 3f20ff1f4585..000000000000 --- a/caffe2/python/operator_test/length_split_op_test.py +++ /dev/null @@ -1,157 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestLengthSplitOperator(serial.SerializedTestCase): - - def _length_split_op_ref(self, input_lengths, n_split_array): - output = [] - n_split = n_split_array[0] - for x in input_lengths: - mod = x % n_split - val = x // n_split + 1 - for _ in range(n_split): - if mod > 0: - output.append(val) - mod -= 1 - else: - output.append(val - 1) - return [np.array(output).astype(np.int32)] - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_length_split_edge(self, gc, dc): - input_lengths = np.array([3, 4, 5]).astype(np.int32) - n_split_ = np.array([5]).astype(np.int32) - # Expected output: - # [1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1] - op = core.CreateOperator( - 'LengthsSplit', - ['input_lengths', - 'n_split'], - ['Y'], - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_lengths, - n_split_], - reference=self._length_split_op_ref, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_length_split_arg(self, gc, dc): - input_lengths = np.array([9, 4, 5]).astype(np.int32) - n_split = 3 - # Expected output: - # [3, 3, 3, 2, 1, 1, 2, 2, 1] - op = core.CreateOperator( - 'LengthsSplit', - ['input_lengths'], - ['Y'], n_split=n_split - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_lengths], - reference=lambda x : self._length_split_op_ref(x, [n_split]), - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [input_lengths], [0]) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_length_split_override_arg(self, gc, dc): - input_lengths = np.array([9, 4, 5]).astype(np.int32) - n_split_ignored = 2 - n_split_used = np.array([3]).astype(np.int32) - - op = core.CreateOperator( - 'LengthsSplit', - ['input_lengths', - 'n_split'], - ['Y'], n_split=n_split_ignored - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_lengths, - n_split_used], - reference=self._length_split_op_ref, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [input_lengths, n_split_used], [0]) - - @given(m=st.integers(1, 100), n_split=st.integers(1, 20), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_length_split_even_divide(self, m, n_split, gc, dc): - # multiples of n_split - input_lengths = np.random.randint(100, size=m).astype(np.int32) * n_split - n_split_ = np.array([n_split]).astype(np.int32) - - op = core.CreateOperator( - 'LengthsSplit', - ['input_lengths', - 'n_split'], - ['Y'], - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_lengths, - n_split_], - reference=self._length_split_op_ref, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) - - @given(m=st.integers(1, 100), n_split=st.integers(1, 20), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_length_split_random(self, m, n_split, gc, dc): - input_lengths = 
np.random.randint(100, size=m).astype(np.int32) - n_split_ = np.array([n_split]).astype(np.int32) - - op = core.CreateOperator( - 'LengthsSplit', - ['input_lengths', - 'n_split'], - ['Y'], - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[input_lengths, - n_split_], - reference=self._length_split_op_ref, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [input_lengths, n_split_], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/lengths_pad_op_test.py b/caffe2/python/operator_test/lengths_pad_op_test.py deleted file mode 100644 index cda2f7da323e..000000000000 --- a/caffe2/python/operator_test/lengths_pad_op_test.py +++ /dev/null @@ -1,57 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestLengthsPadOp(serial.SerializedTestCase): - - @serial.given( - inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - ), - delta_length=st.integers(0, 10), - padding_value=st.floats(-10.0, 10.0), - **hu.gcs - ) - def test_lengths_pad(self, inputs, delta_length, padding_value, gc, dc): - data, lengths = inputs - max_length = np.max(lengths) if len(lengths) > 0 else 0 - target_length = max(max_length + delta_length, 1) - - def lengths_pad_op(data, lengths): - N = len(lengths) - output = np.ndarray( - shape=(target_length * N, ) + data.shape[1:], dtype=np.float32) - output.fill(padding_value) - ptr1, ptr2 = 0, 0 - for i in range(N): - output[ptr1:ptr1 + lengths[i]] = data[ptr2:ptr2 + lengths[i]] - ptr1 += target_length - ptr2 += lengths[i] - - return [output] - - op = core.CreateOperator( - "LengthsPad", - ["data", "lengths"], - ["data_padded"], - target_length=target_length, - padding_value=padding_value, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data, lengths], - reference=lengths_pad_op, - ) diff --git a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py b/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py deleted file mode 100644 index 49b0ba7ec22c..000000000000 --- a/caffe2/python/operator_test/lengths_reducer_fused_nbit_rowwise_ops_test.py +++ /dev/null @@ -1,405 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given - - -class TestLengthsReducerOpsFusedNBitRowwise(hu.HypothesisTestCase): - @given( - num_rows=st.integers(1, 20), - blocksize=st.sampled_from([8, 12, 16, 32, 64, 96, 128]), - weighted=st.booleans(), - seed=st.integers(0, 2 ** 32 - 1), - empty_indices=st.booleans(), - engine=st.sampled_from(["", "GREEDY"]), - bit_rate=st.sampled_from([2, 4]), - ) - def test_sparse_lengths_sum( - self, num_rows, blocksize, weighted, seed, empty_indices, engine, bit_rate - ): - net = core.Net("bench") - - np.random.seed(seed) - - input_data = np.random.rand(num_rows, blocksize).astype(np.float32) - if empty_indices: - lengths = np.zeros(num_rows, dtype=np.int32) - num_indices = 0 - else: - num_indices = np.random.randint(len(input_data)) - # the number of indices per sample - lengths_split = np.clip(num_indices // 2, 1, 10) - lengths = ( - np.ones([num_indices // lengths_split], dtype=np.int32) * lengths_split - ) - # readjust 
num_indices when lengths_split doesn't divide num_indices - num_indices = num_indices // lengths_split * lengths_split - indices = np.random.randint( - low=0, high=len(input_data), size=[num_indices], dtype=np.int64 - ) - weights = np.random.uniform(size=[len(indices)]).astype(np.float32) - - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - engine=engine, - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - "quantized_data", - "dequantized_data", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitFakeRowwiseQuantized", - "input_data", - "fake_quantized_data", - engine=engine, - ) - net.Proto().op.extend([op]) - - if weighted: - net.SparseLengthsWeightedSum( - ["dequantized_data", "weights", "indices", "lengths"], "sum_reference" - ) - net.SparseLengthsWeightedSumFused8BitRowwise( - ["fake_quantized_data", "weights", "indices", "lengths"], - "sum_fake_quantized", - ) - op = core.CreateOperator( - "SparseLengthsWeightedSumFused" + str(bit_rate) + "BitRowwise", - ["quantized_data", "weights", "indices", "lengths"], - "sum_quantized", - ) - net.Proto().op.extend([op]) - else: - net.SparseLengthsSum( - ["dequantized_data", "indices", "lengths"], "sum_reference" - ) - net.SparseLengthsSumFused8BitRowwise( - ["fake_quantized_data", "indices", "lengths"], "sum_fake_quantized" - ) - op = core.CreateOperator( - "SparseLengthsSumFused" + str(bit_rate) + "BitRowwise", - ["quantized_data", "indices", "lengths"], - "sum_quantized", - ) - net.Proto().op.extend([op]) - net.Proto().external_input.extend(["input_data"]) - - workspace.FeedBlob("input_data", input_data) - workspace.FeedBlob("weights", weights) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.RunNetOnce(net) - - sum_reference = workspace.FetchBlob("sum_reference") - sum_fake_quantized = workspace.FetchBlob("sum_fake_quantized") - sum_quantized = workspace.FetchBlob("sum_quantized") - - np.testing.assert_array_almost_equal(sum_reference, sum_quantized) - np.testing.assert_array_equal(sum_fake_quantized, sum_quantized) - - @given( - num_rows=st.integers(1, 20), - blocksize=st.sampled_from([8, 12, 16, 32, 64, 96, 128]), - seed=st.integers(0, 2 ** 32 - 1), - empty_indices=st.booleans(), - engine=st.sampled_from(["", "GREEDY"]), - bit_rate=st.sampled_from([2, 4]), - ) - def test_sparse_lengths_mean( - self, num_rows, blocksize, seed, empty_indices, engine, bit_rate - ): - net = core.Net("bench") - - np.random.seed(seed) - - input_data = np.random.rand(num_rows, blocksize).astype(np.float32) - if empty_indices: - lengths = np.zeros(num_rows, dtype=np.int32) - num_indices = 0 - else: - num_indices = np.random.randint(len(input_data)) - # the number of indices per sample - lengths_split = np.clip(num_indices // 2, 1, 10) - lengths = ( - np.ones([num_indices // lengths_split], dtype=np.int32) * lengths_split - ) - # readjust num_indices when lengths_split doesn't divide num_indices - num_indices = num_indices // lengths_split * lengths_split - # Use int32 here because int64 is covered by test_sparse_lengths_sum - indices = np.random.randint( - low=0, high=len(input_data), size=[num_indices], dtype=np.int32 - ) - - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - engine=engine, - ) - 
net.Proto().op.extend([op]) - op = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - "quantized_data", - "dequantized_data", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitFakeRowwiseQuantized", - "input_data", - "fake_quantized_data", - engine=engine, - ) - net.Proto().op.extend([op]) - - net.SparseLengthsMean( - ["dequantized_data", "indices", "lengths"], "mean_reference" - ) - net.SparseLengthsMeanFused8BitRowwise( - ["fake_quantized_data", "indices", "lengths"], "mean_fake_quantized" - ) - op = core.CreateOperator( - "SparseLengthsMeanFused" + str(bit_rate) + "BitRowwise", - ["quantized_data", "indices", "lengths"], - "mean_quantized", - ) - net.Proto().op.extend([op]) - net.Proto().external_input.extend(["input_data"]) - - workspace.FeedBlob("input_data", input_data) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.RunNetOnce(net) - - mean_reference = workspace.FetchBlob("mean_reference") - mean_fake_quantized = workspace.FetchBlob("mean_fake_quantized") - mean_quantized = workspace.FetchBlob("mean_quantized") - - np.testing.assert_array_almost_equal(mean_reference, mean_quantized) - np.testing.assert_array_equal(mean_fake_quantized, mean_quantized) - - @given( - num_rows=st.integers(1, 20), - blocksize=st.sampled_from([8, 12, 16, 32, 64, 96, 128]), - weighted=st.booleans(), - empty_indices=st.booleans(), - bit_rate=st.sampled_from([2, 4, 8]), - indices_64bit=st.booleans(), - ) - def test_sparse_lengths_sum_rowwise_sparse( - self, num_rows, blocksize, weighted, empty_indices, bit_rate, indices_64bit - ): - net = core.Net("bench") - - input_data = np.random.rand(num_rows, blocksize).astype(np.float32) - if empty_indices: - lengths = np.zeros(num_rows, dtype=np.int32) - num_indices = 0 - else: - num_indices = np.random.randint(len(input_data)) - # the number of indices per sample - lengths_split = np.clip(num_indices // 2, 1, 10) - lengths = ( - np.ones([num_indices // lengths_split], dtype=np.int32) * lengths_split - ) - # readjust num_indices when lengths_split doesn't divide num_indices - num_indices = num_indices // lengths_split * lengths_split - # Use int32 here because int64 is covered by test_sparse_lengths_sum - index_type = np.int64 if indices_64bit else np.int32 - indices = np.random.randint( - low=0, high=len(input_data), size=[num_indices], dtype=index_type - ) - weights = np.random.uniform(size=[len(indices)]).astype(np.float32) - - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - ) - workspace.FeedBlob("input_data", input_data) - workspace.RunOperatorOnce(op) - quantized_data = workspace.FetchBlob("quantized_data") - - # Prune and generate mapping table - sparsity = 0.7 - mapping_table = np.zeros(num_rows, dtype=np.int32) - num_compressed_rows = 0 - unpruned_ids = [] - for i in range(num_rows): - if np.random.uniform() < sparsity: - mapping_table[i] = -1 - quantized_data[i, :] = 0 - else: - mapping_table[i] = num_compressed_rows - num_compressed_rows += 1 - unpruned_ids.append(i) - - pruned_quantized_data = quantized_data[unpruned_ids] - - inputs = ( - ["quantized_data"] - + (["weights"] if weighted else []) - + ["indices", "lengths"] - ) - op = core.CreateOperator( - "SparseLengths" - + ("Weighted" if weighted else "") - + "SumFused" - + str(bit_rate) - + "BitRowwise", - inputs, - "sum_reference", - ) - 
net.Proto().op.extend([op]) - - inputs[0] = "pruned_quantized_data" - op = core.CreateOperator( - "SparseLengths" - + ("Weighted" if weighted else "") - + "Sum" - + str(bit_rate) - + "BitRowwiseSparse", - inputs + ["mapping_table"], - "sum_pruned", - ) - net.Proto().op.extend([op]) - - op = core.CreateOperator( - "SparseLengthsSumSparseLookup", - ["indices", "lengths", "mapping_table"] + (["weights"] if weighted else []), - ["new_indices", "new_lengths"] + (["new_weights"] if weighted else []), - ) - net.Proto().op.extend([op]) - inputs = ( - ["pruned_quantized_data"] - + (["new_weights"] if weighted else []) - + ["new_indices", "new_lengths"] - ) - op = core.CreateOperator( - "SparseLengths" - + ("Weighted" if weighted else "") - + "SumFused" - + str(bit_rate) - + "BitRowwise", - inputs, - "sum_split", - ) - net.Proto().op.extend([op]) - - workspace.FeedBlob("quantized_data", quantized_data) - workspace.FeedBlob("pruned_quantized_data", pruned_quantized_data) - workspace.FeedBlob("weights", weights) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("mapping_table", mapping_table) - - workspace.RunNetOnce(net) - - sum_reference = workspace.FetchBlob("sum_reference") - sum_pruned = workspace.FetchBlob("sum_pruned") - sum_split = workspace.FetchBlob("sum_split") - - np.testing.assert_array_equal(sum_reference, sum_pruned) - np.testing.assert_array_equal(sum_reference, sum_split) - - @given( - num_rows=st.integers(1, 20), - blocksize=st.sampled_from([8, 12, 16, 32, 64, 96, 128]), - seed=st.integers(0, 2 ** 32 - 1), - empty_indices=st.booleans(), - engine=st.sampled_from(["", "GREEDY"]), - bit_rate=st.sampled_from([2, 4]), - ) - def test_sparse_lengths_mean_rowwise_sparse_with_skipped_pruning( - self, num_rows, blocksize, seed, empty_indices, engine, bit_rate - ): - net = core.Net("bench") - - np.random.seed(seed) - - input_data = np.random.rand(num_rows, blocksize).astype(np.float32) - if empty_indices: - lengths = np.zeros(num_rows, dtype=np.int32) - num_indices = 0 - else: - num_indices = np.random.randint(len(input_data)) - # the number of indices per sample - lengths_split = np.clip(num_indices // 2, 1, 10) - lengths = ( - np.ones([num_indices // lengths_split], dtype=np.int32) * lengths_split - ) - # readjust num_indices when lengths_split doesn't divide num_indices - num_indices = num_indices // lengths_split * lengths_split - # Use int32 here because int64 is covered by test_sparse_lengths_sum - indices = np.random.randint( - low=0, high=len(input_data), size=[num_indices], dtype=np.int32 - ) - - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitRowwiseQuantized", - "input_data", - "quantized_data", - engine=engine, - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "Fused" + str(bit_rate) + "BitRowwiseQuantizedToFloat", - "quantized_data", - "dequantized_data", - ) - net.Proto().op.extend([op]) - op = core.CreateOperator( - "FloatToFused" + str(bit_rate) + "BitFakeRowwiseQuantized", - "input_data", - "fake_quantized_data", - engine=engine, - ) - net.Proto().op.extend([op]) - - net.SparseLengthsMean( - ["dequantized_data", "indices", "lengths"], "mean_reference" - ) - net.SparseLengthsMeanFused8BitRowwise( - ["fake_quantized_data", "indices", "lengths"], "mean_fake_quantized" - ) - op1 = core.CreateOperator( - "SparseLengthsMeanFused" + str(bit_rate) + "BitRowwise", - ["quantized_data", "indices", "lengths"], - "mean_quantized", - ) - op2 = core.CreateOperator( - "SparseLengthsMean" + str(bit_rate) + 
"BitRowwiseSparse", - ["quantized_data", "indices", "lengths"] + ["mapping_table"], - "mean_quantized_pruned", - ) - net.Proto().op.extend([op1, op2]) - net.Proto().external_input.extend(["input_data", "mapping_table"]) - - workspace.FeedBlob("input_data", input_data) - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("lengths", lengths) - mapping_table = np.array([0]).astype(dtype=np.int32) - workspace.FeedBlob("mapping_table", mapping_table) - - workspace.GlobalInit(["caffe2", "--caffe2_log_level=0"]) - workspace.RunNetOnce(net) - - mean_reference = workspace.FetchBlob("mean_reference") - mean_fake_quantized = workspace.FetchBlob("mean_fake_quantized") - mean_quantized = workspace.FetchBlob("mean_quantized") - mean_quantized_pruned = workspace.FetchBlob("mean_quantized_pruned") - - np.testing.assert_array_almost_equal(mean_reference, mean_quantized) - np.testing.assert_array_equal(mean_fake_quantized, mean_quantized) - np.testing.assert_array_equal(mean_quantized_pruned, mean_quantized) diff --git a/caffe2/python/operator_test/lengths_tile_op_test.py b/caffe2/python/operator_test/lengths_tile_op_test.py deleted file mode 100644 index 441fcc747835..000000000000 --- a/caffe2/python/operator_test/lengths_tile_op_test.py +++ /dev/null @@ -1,51 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestLengthsTileOp(serial.SerializedTestCase): - - @serial.given( - inputs=st.integers(min_value=1, max_value=20).flatmap( - lambda size: st.tuples( - hu.arrays([size], dtype=np.float32), - hu.arrays([size], dtype=np.int32, - elements=st.integers(min_value=0, max_value=20)), - ) - ), - **hu.gcs) - def test_lengths_tile(self, inputs, gc, dc): - data, lengths = inputs - - def lengths_tile_op(data, lengths): - return [np.concatenate([ - [d] * l for d, l in zip(data, lengths) - ])] - - op = core.CreateOperator( - "LengthsTile", - ["data", "lengths"], - ["output"], - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data, lengths], - reference=lengths_tile_op, - ) - - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=[data, lengths], - outputs_to_check=0, - outputs_with_grads=[0] - ) diff --git a/caffe2/python/operator_test/lengths_top_k_ops_test.py b/caffe2/python/operator_test/lengths_top_k_ops_test.py deleted file mode 100644 index d2d55c531ec0..000000000000 --- a/caffe2/python/operator_test/lengths_top_k_ops_test.py +++ /dev/null @@ -1,60 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestLengthsTopKOps(serial.SerializedTestCase): - @serial.given(N=st.integers(min_value=0, max_value=10), - K=st.integers(min_value=1, max_value=10), - **hu.gcs_cpu_only) - def test_lengths_top_k_op(self, N, K, gc, dc): - lens = np.random.randint(low=1, high=2 * K + 1, size=N).astype(np.int32) - X = [] - for i in lens: - X.extend(x / 100.0 for x in range(0, 6 * i, 6)) - X = np.array(X, dtype=np.float32) - op = core.CreateOperator("LengthsTopK", ["X", "Y"], ["values", "indices"], k=K) - - def lengths_top_k(X, lens): - N, si = lens.shape[0], 0 - values, indices = [], [] - for i in range(N): - cur_indices = X[si:si + lens[i]].argsort()[-K:][::-1] - cur_values = X[si:si + 
lens[i]][cur_indices] - values.extend(cur_values) - indices.extend(cur_indices) - si += lens[i] - if lens[i] < K: - values.extend([0] * (K - lens[i])) - indices.extend([-1] * (K - lens[i])) - - return (np.array(values, dtype=np.float32).reshape(-1, K), - np.array(indices, dtype=np.int32).reshape(-1, K)) - - self.assertDeviceChecks(dc, op, [X, lens], [0, 1]) - self.assertReferenceChecks(gc, op, [X, lens], lengths_top_k) - self.assertGradientChecks(gc, op, [X, lens], 0, [0]) - - @given(N=st.integers(min_value=0, max_value=10), - K=st.integers(min_value=1, max_value=10), - **hu.gcs_cpu_only) - def test_lengths_top_k_empty_op(self, N, K, gc, dc): - lens = np.zeros((N, ), dtype=np.int32) - X = np.array([], dtype=np.float32) - op = core.CreateOperator("LengthsTopK", ["X", "Y"], ["values", "indices"], k=K) - - def lengths_top_k(X, lens): - return (np.zeros((N, K), dtype=np.float32), - -1 * np.ones((N, K), dtype=np.int32)) - - self.assertDeviceChecks(dc, op, [X, lens], [0, 1]) - self.assertReferenceChecks(gc, op, [X, lens], lengths_top_k) - self.assertGradientChecks(gc, op, [X, lens], 0, [0]) diff --git a/caffe2/python/operator_test/listwise_l2r_operator_test.py b/caffe2/python/operator_test/listwise_l2r_operator_test.py deleted file mode 100644 index c08f1180a920..000000000000 --- a/caffe2/python/operator_test/listwise_l2r_operator_test.py +++ /dev/null @@ -1,242 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given - - -class TestListwiseL2rOps(hu.HypothesisTestCase): - def ref_lambda_rank_loss( - self, y, r, use_ndcg_as_loss, use_idcg_normalization, use_exp_gain - ): - n = len(y) - - def get_discounts(v): - x = np.argsort(v) - d = [0 for _ in range(n)] - for i in range(n): - d[x[i]] = 1.0 / np.log2(n - i + 1.0) - return d - - def sigm(x): - return 1 / (1 + np.exp(-x)) - - def log_sigm(x): - return -np.log(1 + np.exp(-x)) - - dy = np.zeros(n) - loss = 0 - if np.sum(np.abs(r)) < 1e-6: - return loss, dy - if use_ndcg_as_loss and (not use_exp_gain): - g = [r[i] for i in range(n)] - else: - g = [2 ** r[i] for i in range(n)] - d = get_discounts(r) - idcg = sum([g[i] * d[i] for i in range(n)]) - - if use_idcg_normalization: - session_weight = max(idcg, 1e-5) - else: - session_weight = 1 - - d = get_discounts(y) - - if use_ndcg_as_loss: - dcg = sum(g[i] * d[i] for i in range(n)) - loss = (idcg - dcg) / session_weight - for i in range(n): - for j in range(n): - if i == j: - continue - lambda_weight = np.abs((g[i] - g[j]) * (d[i] - d[j])) - rank_loss = -log_sigm(y[i] - y[j] if r[i] > r[j] else y[j] - y[i]) - rank_dy = (0.0 if r[i] > r[j] else 1.0) - sigm(-y[i] + y[j]) - if not use_ndcg_as_loss: - loss += lambda_weight * rank_loss / session_weight - dy[i] += lambda_weight * rank_dy / session_weight - - return loss, dy - - @given(n=st.integers(1, 20), k=st.integers(2, 5), m=st.integers(3, 5)) - def test_lambda_rank_loss(self, n, k, m): - y = np.random.rand(n * m).astype(np.float32) - r = np.random.randint(k, size=n * m).astype(np.float32) - # m sessions of length n - session_lengths = np.repeat(n, m).astype(np.int32) - ref_loss = np.empty(0) - ref_ndcg_loss = np.empty(0) - ref_ndcg_loss_no_exp = np.empty(0) - ref_dcg_loss = np.empty(0) - ref_dcg_loss_no_exp = np.empty(0) - ref_dy = np.empty(0) - ref_dy_no_exp = np.empty(0) - ref_dcg_dy = np.empty(0) - ref_dcg_dy_no_exp = np.empty(0) - for i in range(m): - r_loss, r_dy = self.ref_lambda_rank_loss( - y[(i) * n : (i + 1) * 
n], r[(i) * n : (i + 1) * n], False, True, False - ) - r_ndcg_loss, _ = self.ref_lambda_rank_loss( - y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], True, True, True - ) - r_ndcg_loss_no_exp, r_dy_no_exp = self.ref_lambda_rank_loss( - y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], True, True, False - ) - r_dcg_loss, r_dcg_dy = self.ref_lambda_rank_loss( - y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], True, False, True - ) - r_dcg_loss_no_exp, r_dcg_dy_no_exp = self.ref_lambda_rank_loss( - y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], True, False, False - ) - ref_loss = np.append(ref_loss, r_loss) - ref_dy = np.append(ref_dy, r_dy) - ref_ndcg_loss = np.append(ref_ndcg_loss, r_ndcg_loss) - - ref_ndcg_loss_no_exp = np.append(ref_ndcg_loss_no_exp, r_ndcg_loss_no_exp) - ref_dy_no_exp = np.append(ref_dy_no_exp, r_dy_no_exp) - - ref_dcg_loss = np.append(ref_dcg_loss, r_dcg_loss) - ref_dcg_dy = np.append(ref_dcg_dy, r_dcg_dy) - - ref_dcg_loss_no_exp = np.append(ref_dcg_loss_no_exp, r_dcg_loss_no_exp) - ref_dcg_dy_no_exp = np.append(ref_dcg_dy_no_exp, r_dcg_dy_no_exp) - - dloss = np.random.random(m).astype(np.float32) - - workspace.blobs["y"] = y - workspace.blobs["r"] = r - workspace.blobs["session_lengths"] = session_lengths - workspace.blobs["dloss"] = dloss - - op = core.CreateOperator( - "LambdaRankNdcg", - ["y", "r", "session_lengths"], - ["loss", "dy"], - use_ndcg_as_loss=False, - use_idcg_normalization=True, - use_exp_gain=False, - ) - workspace.RunOperatorOnce(op) - loss = workspace.blobs["loss"] - dy = workspace.blobs["dy"] - np.testing.assert_allclose(loss, ref_loss, rtol=1e-5, atol=1e-6) - np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6) - - op = core.CreateOperator( - "LambdaRankNdcg", - ["y", "r", "session_lengths"], - ["loss", "dy"], - use_ndcg_as_loss=True, - use_idcg_normalization=True, - use_exp_gain=True, - ) - workspace.RunOperatorOnce(op) - loss = workspace.blobs["loss"] - dy = workspace.blobs["dy"] - np.testing.assert_allclose(loss, ref_ndcg_loss, rtol=1e-5, atol=1e-6) - np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6) - - op = core.CreateOperator( - "LambdaRankNdcgGradient", - ["y", "session_lengths", "dy", "dloss"], - ["dy_back"], - ) - workspace.RunOperatorOnce(op) - dy_back = workspace.blobs["dy_back"] - for i in range(m): - np.testing.assert_allclose( - dy_back[i * n : (i + 1) * n], - dloss[i] * ref_dy[i * n : (i + 1) * n], - rtol=1e-5, - atol=1e-6, - ) - - op = core.CreateOperator( - "LambdaRankNdcg", - ["y", "r", "session_lengths"], - ["loss", "dy"], - use_ndcg_as_loss=True, - use_idcg_normalization=True, - use_exp_gain=False, - ) - workspace.RunOperatorOnce(op) - loss = workspace.blobs["loss"] - dy = workspace.blobs["dy"] - np.testing.assert_allclose(loss, ref_ndcg_loss_no_exp, rtol=1e-5, atol=1e-6) - np.testing.assert_allclose(dy, ref_dy_no_exp, rtol=1e-5, atol=1e-6) - - op = core.CreateOperator( - "LambdaRankNdcgGradient", - ["y", "session_lengths", "dy", "dloss"], - ["dy_back"], - ) - workspace.RunOperatorOnce(op) - dy_back = workspace.blobs["dy_back"] - for i in range(m): - np.testing.assert_allclose( - dy_back[i * n : (i + 1) * n], - dloss[i] * ref_dy_no_exp[i * n : (i + 1) * n], - rtol=1e-5, - atol=1e-6, - ) - - op = core.CreateOperator( - "LambdaRankNdcg", - ["y", "r", "session_lengths"], - ["loss", "dy"], - use_ndcg_as_loss=True, - use_idcg_normalization=False, - use_exp_gain=True, - ) - workspace.RunOperatorOnce(op) - loss = workspace.blobs["loss"] - dy = workspace.blobs["dy"] - np.testing.assert_allclose(loss, 
ref_dcg_loss, rtol=1e-5, atol=1e-6) - np.testing.assert_allclose(dy, ref_dcg_dy, rtol=1e-5, atol=1e-6) - - op = core.CreateOperator( - "LambdaRankNdcgGradient", - ["y", "session_lengths", "dy", "dloss"], - ["dy_back"], - ) - workspace.RunOperatorOnce(op) - dy_back = workspace.blobs["dy_back"] - for i in range(m): - np.testing.assert_allclose( - dy_back[i * n : (i + 1) * n], - dloss[i] * ref_dcg_dy[i * n : (i + 1) * n], - rtol=1e-5, - atol=1e-6, - ) - - op = core.CreateOperator( - "LambdaRankNdcg", - ["y", "r", "session_lengths"], - ["loss", "dy"], - use_ndcg_as_loss=True, - use_idcg_normalization=False, - use_exp_gain=False, - ) - workspace.RunOperatorOnce(op) - loss = workspace.blobs["loss"] - dy = workspace.blobs["dy"] - np.testing.assert_allclose(loss, ref_dcg_loss_no_exp, rtol=1e-5, atol=1e-6) - np.testing.assert_allclose(dy, ref_dcg_dy_no_exp, rtol=1e-5, atol=1e-6) - - op = core.CreateOperator( - "LambdaRankNdcgGradient", - ["y", "session_lengths", "dy", "dloss"], - ["dy_back"], - ) - workspace.RunOperatorOnce(op) - dy_back = workspace.blobs["dy_back"] - for i in range(m): - np.testing.assert_allclose( - dy_back[i * n : (i + 1) * n], - dloss[i] * ref_dcg_dy_no_exp[i * n : (i + 1) * n], - rtol=1e-5, - atol=1e-6, - ) diff --git a/caffe2/python/operator_test/load_save_test.py b/caffe2/python/operator_test/load_save_test.py deleted file mode 100644 index 05319956f60f..000000000000 --- a/caffe2/python/operator_test/load_save_test.py +++ /dev/null @@ -1,868 +0,0 @@ -import hypothesis.strategies as st -from hypothesis import given, assume, settings -import io -import math -import numpy as np -import os -import struct -import unittest -from pathlib import Path -from typing import Dict, Generator, List, NamedTuple, Optional, Tuple, Type -from caffe2.proto import caffe2_pb2 -from caffe2.proto.caffe2_pb2 import BlobSerializationOptions -from caffe2.python import core, test_util, workspace - -if workspace.has_gpu_support: - DEVICES = [caffe2_pb2.CPU, workspace.GpuDeviceType] - max_gpuid = workspace.NumGpuDevices() - 1 -else: - DEVICES = [caffe2_pb2.CPU] - max_gpuid = 0 - - -class MiniDBEntry(NamedTuple): - key: str - value_size: int - - -# Utility class for other loading tests, don't add test functions here -# Inherit from this test instead. If you add a test here, -# each derived class will inherit it as well and cause test duplication -class TestLoadSaveBase(test_util.TestCase): - - def __init__(self, methodName, db_type='minidb'): - super().__init__(methodName) - self._db_type = db_type - - @settings(deadline=None) - @given(src_device_type=st.sampled_from(DEVICES), - src_gpu_id=st.integers(min_value=0, max_value=max_gpuid), - dst_device_type=st.sampled_from(DEVICES), - dst_gpu_id=st.integers(min_value=0, max_value=max_gpuid)) - def load_save(self, src_device_type, src_gpu_id, - dst_device_type, dst_gpu_id): - workspace.ResetWorkspace() - dtypes = [np.float16, np.float32, np.float64, bool, np.int8, - np.int16, np.int32, np.int64, np.uint8, np.uint16] - arrays = [np.random.permutation(6).reshape(2, 3).astype(T) - for T in dtypes] - assume(core.IsGPUDeviceType(src_device_type) or src_gpu_id == 0) - assume(core.IsGPUDeviceType(dst_device_type) or dst_gpu_id == 0) - src_device_option = core.DeviceOption( - src_device_type, src_gpu_id) - dst_device_option = core.DeviceOption( - dst_device_type, dst_gpu_id) - - for i, arr in enumerate(arrays): - self.assertTrue(workspace.FeedBlob(str(i), arr, src_device_option)) - self.assertTrue(workspace.HasBlob(str(i))) - - # Saves the blobs to a local db. 
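The cycle that load_save drives at this point — feed blobs, Save them to a local minidb, reset the workspace, Load them back, compare dtype and contents — reduces to the following one-blob sketch. The blob name and temp path are illustrative, not taken from the deleted test; the operator arguments mirror the test code.

import os
import tempfile
import numpy as np
from caffe2.python import core, workspace

db_path = os.path.join(tempfile.mkdtemp(), "db")
workspace.ResetWorkspace()
arr = np.random.permutation(6).reshape(2, 3).astype(np.float16)
workspace.FeedBlob("x", arr)
workspace.RunOperatorOnce(core.CreateOperator(
    "Save", ["x"], [], absolute_path=1, db=db_path, db_type="minidb"))

workspace.ResetWorkspace()  # ensure the Load below really reads from the db
workspace.RunOperatorOnce(core.CreateOperator(
    "Load", [], ["x"], absolute_path=1, db=db_path, db_type="minidb"))
fetched = workspace.FetchBlob("x")
assert fetched.dtype == arr.dtype  # serialization preserves the dtype
np.testing.assert_array_equal(fetched, arr)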
- tmp_folder = self.make_tempdir() - op = core.CreateOperator( - "Save", - [str(i) for i in range(len(arrays))], [], - absolute_path=1, - db=str(tmp_folder / "db"), db_type=self._db_type) - self.assertTrue(workspace.RunOperatorOnce(op)) - - # Reset the workspace so that anything we load is surely loaded - # from the serialized proto. - workspace.ResetWorkspace() - self.assertEqual(len(workspace.Blobs()), 0) - - def _LoadTest(keep_device, device_type, gpu_id, blobs, loadAll): - """A helper subfunction to test keep and not keep.""" - op = core.CreateOperator( - "Load", - [], blobs, - absolute_path=1, - db=str(tmp_folder / "db"), db_type=self._db_type, - device_option=dst_device_option, - keep_device=keep_device, - load_all=loadAll) - self.assertTrue(workspace.RunOperatorOnce(op)) - for i, arr in enumerate(arrays): - self.assertTrue(workspace.HasBlob(str(i))) - fetched = workspace.FetchBlob(str(i)) - self.assertEqual(fetched.dtype, arr.dtype) - np.testing.assert_array_equal( - workspace.FetchBlob(str(i)), arr) - proto = caffe2_pb2.BlobProto() - proto.ParseFromString(workspace.SerializeBlob(str(i))) - self.assertTrue(proto.HasField('tensor')) - self.assertEqual(proto.tensor.device_detail.device_type, - device_type) - if core.IsGPUDeviceType(device_type): - self.assertEqual(proto.tensor.device_detail.device_id, - gpu_id) - - blobs = [str(i) for i in range(len(arrays))] - # Load using device option stored in the proto, i.e. - # src_device_option - _LoadTest(1, src_device_type, src_gpu_id, blobs, 0) - # Load again, but this time load into dst_device_option. - _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0) - # Load back to the src_device_option to see if both paths are able - # to reallocate memory. - _LoadTest(1, src_device_type, src_gpu_id, blobs, 0) - # Reset the workspace, and load directly into the dst_device_option. - workspace.ResetWorkspace() - _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 0) - - # Test load all which loads all blobs in the db into the workspace. - workspace.ResetWorkspace() - _LoadTest(1, src_device_type, src_gpu_id, [], 1) - # Load again making sure that overwrite functionality works. - _LoadTest(1, src_device_type, src_gpu_id, [], 1) - # Load again with different device. - _LoadTest(0, dst_device_type, dst_gpu_id, [], 1) - workspace.ResetWorkspace() - _LoadTest(0, dst_device_type, dst_gpu_id, [], 1) - workspace.ResetWorkspace() - _LoadTest(1, src_device_type, src_gpu_id, blobs, 1) - workspace.ResetWorkspace() - _LoadTest(0, dst_device_type, dst_gpu_id, blobs, 1) - - def saveFile( - self, tmp_folder: Path, db_name: str, db_type: str, start_blob_id: int - ) -> Tuple[str, List[np.ndarray]]: - dtypes = [np.float16, np.float32, np.float64, bool, np.int8, - np.int16, np.int32, np.int64, np.uint8, np.uint16] - arrays = [np.random.permutation(6).reshape(2, 3).astype(T) - for T in dtypes] - - for i, arr in enumerate(arrays): - self.assertTrue(workspace.FeedBlob(str(i + start_blob_id), arr)) - self.assertTrue(workspace.HasBlob(str(i + start_blob_id))) - - # Saves the blobs to a local db. 
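The _LoadTest helper above toggles two Load knobs: keep_device (1 restores each tensor to the device recorded in its serialized proto, 0 places it on the operator's own device_option) and load_all (1 restores every key in the db without naming outputs). A minimal sketch of the load_all/keep_device path, with an illustrative path and blob name:

import os
import tempfile
import numpy as np
from caffe2.python import core, workspace

db = os.path.join(tempfile.mkdtemp(), "db")
workspace.ResetWorkspace()
workspace.FeedBlob("w", np.zeros((2, 3), dtype=np.float32))
workspace.RunOperatorOnce(core.CreateOperator(
    "Save", ["w"], [], absolute_path=1, db=db, db_type="minidb"))

workspace.ResetWorkspace()
workspace.RunOperatorOnce(core.CreateOperator(
    "Load", [], [], absolute_path=1, db=db, db_type="minidb",
    keep_device=1,  # take each tensor's device from the stored proto
    load_all=1))    # restore every key in the db; no output list needed
assert workspace.Blobs() == ["w"]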
- tmp_file = str(tmp_folder / db_name) - op = core.CreateOperator( - "Save", - [str(i + start_blob_id) for i in range(len(arrays))], [], - absolute_path=1, - db=tmp_file, db_type=db_type) - workspace.RunOperatorOnce(op) - return tmp_file, arrays - - -class TestLoadSave(TestLoadSaveBase): - - def testLoadSave(self): - self.load_save() - - def testRepeatedArgs(self): - dtypes = [np.float16, np.float32, np.float64, bool, np.int8, - np.int16, np.int32, np.int64, np.uint8, np.uint16] - arrays = [np.random.permutation(6).reshape(2, 3).astype(T) - for T in dtypes] - - for i, arr in enumerate(arrays): - self.assertTrue(workspace.FeedBlob(str(i), arr)) - self.assertTrue(workspace.HasBlob(str(i))) - - # Saves the blobs to a local db. - tmp_folder = self.make_tempdir() - op = core.CreateOperator( - "Save", - [str(i) for i in range(len(arrays))] * 2, [], - absolute_path=1, - db=str(tmp_folder / "db"), db_type=self._db_type) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - def testLoadExcessblobs(self): - tmp_folder = self.make_tempdir() - tmp_file, arrays = self.saveFile(tmp_folder, "db", self._db_type, 0) - - op = core.CreateOperator( - "Load", - [], [str(i) for i in range(len(arrays))] * 2, - absolute_path=1, - db=tmp_file, db_type=self._db_type, - load_all=False) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - op = core.CreateOperator( - "Load", - [], [str(len(arrays) + i) for i in [-1, 0]], - absolute_path=1, - db=tmp_file, db_type=self._db_type, - load_all=True) - with self.assertRaises(RuntimeError): - workspace.ResetWorkspace() - workspace.RunOperatorOnce(op) - - op = core.CreateOperator( - "Load", - [], [str(len(arrays) + i) for i in range(2)], - absolute_path=1, - db=tmp_file, db_type=self._db_type, - load_all=True) - with self.assertRaises(RuntimeError): - workspace.ResetWorkspace() - workspace.RunOperatorOnce(op) - - def testTruncatedFile(self): - tmp_folder = self.make_tempdir() - tmp_file, arrays = self.saveFile(tmp_folder, "db", self._db_type, 0) - - with open(tmp_file, 'wb+') as fdest: - fdest.seek(20, os.SEEK_END) - fdest.truncate() - - op = core.CreateOperator( - "Load", - [], [str(i) for i in range(len(arrays))], - absolute_path=1, - db=tmp_file, db_type=self._db_type, - load_all=False) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - op = core.CreateOperator( - "Load", - [], [], - absolute_path=1, - db=tmp_file, db_type=self._db_type, - load_all=True) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - def testBlobNameOverrides(self): - original_names = ['blob_a', 'blob_b', 'blob_c'] - new_names = ['x', 'y', 'z'] - blobs = [np.random.permutation(6) for i in range(3)] - for i, blob in enumerate(blobs): - self.assertTrue(workspace.FeedBlob(original_names[i], blob)) - self.assertTrue(workspace.HasBlob(original_names[i])) - self.assertEqual(len(workspace.Blobs()), 3) - - # Saves the blobs to a local db. 
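testBlobNameOverrides below exercises rename-on-save (blob_name_overrides stores blobs under different db keys) and rename-on-load (source_blob_names maps stored keys onto new workspace names). A compressed sketch of the two renames, with illustrative names and path:

import os
import tempfile
import numpy as np
from caffe2.python import core, workspace

db = os.path.join(tempfile.mkdtemp(), "db")
workspace.ResetWorkspace()
workspace.FeedBlob("blob_a", np.arange(6))
workspace.RunOperatorOnce(core.CreateOperator(
    "Save", ["blob_a"], [], absolute_path=1,
    blob_name_overrides=["x"],  # store under the key 'x', not 'blob_a'
    db=db, db_type="minidb"))

workspace.ResetWorkspace()
workspace.RunOperatorOnce(core.CreateOperator(
    "Load", [], ["blob_x"], absolute_path=1, db=db, db_type="minidb",
    source_blob_names=["x"]))   # read the key 'x' into the blob 'blob_x'
np.testing.assert_array_equal(workspace.FetchBlob("blob_x"), np.arange(6))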
- tmp_folder = self.make_tempdir() - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce( - core.CreateOperator( - "Save", original_names, [], - absolute_path=1, - strip_prefix='.temp', - blob_name_overrides=new_names, - db=str(tmp_folder / "db"), - db_type=self._db_type - ) - ) - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "Save", original_names, [], - absolute_path=1, - blob_name_overrides=new_names, - db=str(tmp_folder / "db"), - db_type=self._db_type - ) - ) - ) - self.assertTrue(workspace.ResetWorkspace()) - self.assertEqual(len(workspace.Blobs()), 0) - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "Load", [], [], - absolute_path=1, - db=str(tmp_folder / "db"), - db_type=self._db_type, - load_all=1 - ) - ) - ) - self.assertEqual(len(workspace.Blobs()), 3) - for i, name in enumerate(new_names): - self.assertTrue(workspace.HasBlob(name)) - self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all()) - # moved here per @cxj's suggestion - load_new_names = ['blob_x', 'blob_y', 'blob_z'] - # load 'x' into 'blob_x' - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "Load", [], load_new_names[0:1], - absolute_path=1, - db=str(tmp_folder / "db"), - db_type=self._db_type, - source_blob_names=new_names[0:1] - ) - ) - ) - # we should have 'blob_a/b/c/' and 'blob_x' now - self.assertEqual(len(workspace.Blobs()), 4) - for i, name in enumerate(load_new_names[0:1]): - self.assertTrue(workspace.HasBlob(name)) - self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all()) - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "Load", [], load_new_names[0:3], - absolute_path=1, - db=str(tmp_folder / "db"), - db_type=self._db_type, - source_blob_names=new_names[0:3] - ) - ) - ) - # we should have 'blob_a/b/c/' and 'blob_x/y/z' now - self.assertEqual(len(workspace.Blobs()), 6) - for i, name in enumerate(load_new_names[0:3]): - self.assertTrue(workspace.HasBlob(name)) - self.assertTrue((workspace.FetchBlob(name) == blobs[i]).all()) - - def testMissingFile(self): - tmp_folder = self.make_tempdir() - tmp_file = tmp_folder / "missing_db" - - op = core.CreateOperator( - "Load", - [], [], - absolute_path=1, - db=str(tmp_file), db_type=self._db_type, - load_all=True) - with self.assertRaises(RuntimeError): - try: - workspace.RunOperatorOnce(op) - except RuntimeError as e: - print(e) - raise - - def testLoadMultipleFilesGivenSourceBlobNames(self): - tmp_folder = self.make_tempdir() - db_file_1, arrays_1 = self.saveFile(tmp_folder, "db1", self._db_type, 0) - db_file_2, arrays_2 = self.saveFile( - tmp_folder, "db2", self._db_type, len(arrays_1) - ) - db_files = [db_file_1, db_file_2] - blobs_names = [str(i) for i in range(len(arrays_1) + len(arrays_2))] - - workspace.ResetWorkspace() - self.assertEqual(len(workspace.Blobs()), 0) - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "Load", - [], blobs_names, - absolute_path=1, - dbs=db_files, db_type=self._db_type, - source_blob_names=blobs_names - ) - ) - ) - self.assertEqual(len(workspace.Blobs()), len(blobs_names)) - for i in range(len(arrays_1)): - np.testing.assert_array_equal( - workspace.FetchBlob(str(i)), arrays_1[i] - ) - for i in range(len(arrays_2)): - np.testing.assert_array_equal( - workspace.FetchBlob(str(i + len(arrays_1))), arrays_2[i] - ) - - def testLoadAllMultipleFiles(self): - tmp_folder = self.make_tempdir() - db_file_1, arrays_1 = self.saveFile(tmp_folder, "db1", self._db_type, 0) - db_file_2, arrays_2 = 
-            tmp_folder, "db2", self._db_type, len(arrays_1)
-        )
-        db_files = [db_file_1, db_file_2]
-
-        workspace.ResetWorkspace()
-        self.assertEqual(len(workspace.Blobs()), 0)
-        self.assertTrue(
-            workspace.RunOperatorOnce(
-                core.CreateOperator(
-                    "Load",
-                    [], [],
-                    absolute_path=1,
-                    dbs=db_files, db_type=self._db_type,
-                    load_all=True
-                )
-            )
-        )
-        self.assertEqual(len(workspace.Blobs()), len(arrays_1) + len(arrays_2))
-        for i in range(len(arrays_1)):
-            np.testing.assert_array_equal(
-                workspace.FetchBlob(str(i)), arrays_1[i]
-            )
-        for i in range(len(arrays_2)):
-            np.testing.assert_array_equal(
-                workspace.FetchBlob(str(i + len(arrays_1))), arrays_2[i]
-            )
-
-    def testLoadAllMultipleFilesWithSameKey(self):
-        tmp_folder = self.make_tempdir()
-        db_file_1, arrays_1 = self.saveFile(tmp_folder, "db1", self._db_type, 0)
-        db_file_2, arrays_2 = self.saveFile(tmp_folder, "db2", self._db_type, 0)
-
-        db_files = [db_file_1, db_file_2]
-        workspace.ResetWorkspace()
-        self.assertEqual(len(workspace.Blobs()), 0)
-        op = core.CreateOperator(
-            "Load",
-            [], [],
-            absolute_path=1,
-            dbs=db_files, db_type=self._db_type,
-            load_all=True)
-        with self.assertRaises(RuntimeError):
-            workspace.RunOperatorOnce(op)
-
-    def testLoadRepeatedFiles(self):
-        tmp_folder = self.make_tempdir()
-        tmp_file, arrays = self.saveFile(tmp_folder, "db", self._db_type, 0)
-
-        db_files = [tmp_file, tmp_file]
-        workspace.ResetWorkspace()
-        self.assertEqual(len(workspace.Blobs()), 0)
-        op = core.CreateOperator(
-            "Load",
-            [], [str(i) for i in range(len(arrays))],
-            absolute_path=1,
-            dbs=db_files, db_type=self._db_type,
-            load_all=False)
-        with self.assertRaises(RuntimeError):
-            workspace.RunOperatorOnce(op)
-
-    def testLoadWithDBOptions(self) -> None:
-        tmp_folder = self.make_tempdir()
-        tmp_file, arrays = self.saveFile(tmp_folder, "db", self._db_type, 0)
-
-        db_files = [tmp_file, tmp_file]
-        workspace.ResetWorkspace()
-        self.assertEqual(len(workspace.Blobs()), 0)
-
-        db_options = b"test_db_options"
-        op = core.CreateOperator(
-            "Load",
-            [], [str(i) for i in range(len(arrays))],
-            absolute_path=1,
-            dbs=db_files, db_type=self._db_type,
-            load_all=False,
-            db_options=db_options,
-        )
-        with self.assertRaises(RuntimeError):
-            workspace.RunOperatorOnce(op)
-
-    def create_test_blobs(
-        self, size: int = 1234, feed: bool = True
-    ) -> List[Tuple[str, np.ndarray]]:
-        def int_array(dtype: Type[np.integer], size: int) -> np.ndarray:
-            info = np.iinfo(dtype)
-            return np.random.randint(info.min, info.max, size, dtype=dtype)
-
-        def float_array(dtype: Type[np.floating], size: int) -> np.ndarray:
-            return np.random.random_sample(size).astype(dtype)
-
-        blobs = [
-            ("int8_data", int_array(np.int8, size)),
-            ("int16_data", int_array(np.int16, size)),
-            ("int32_data", int_array(np.int32, size)),
-            ("int64_data", int_array(np.int64, size)),
-            ("uint8_data", int_array(np.uint8, size)),
-            ("uint16_data", int_array(np.uint16, size)),
-            ("float16_data", float_array(np.float16, size)),
-            ("float32_data", float_array(np.float32, size)),
-            ("float64_data", float_array(np.float64, size)),
-        ]
-
-        if feed:
-            for name, data in blobs:
-                workspace.FeedBlob(name, data)
-
-        return blobs
-
-    def load_blobs(
-        self,
-        blob_names: List[str],
-        dbs: List[str],
-        db_type: Optional[str] = None
-    ) -> None:
-        workspace.ResetWorkspace()
-        self.assertEqual(len(workspace.Blobs()), 0)
-        load_op = core.CreateOperator(
-            "Load",
-            [],
-            blob_names,
-            absolute_path=1,
-            dbs=dbs,
-            db_type=db_type or self._db_type,
-        )
-        self.assertTrue(workspace.RunOperatorOnce(load_op))
-        self.assertEqual(len(workspace.Blobs()), len(blob_names))
-
-    def load_and_check_blobs(
-        self,
-        blobs: List[Tuple[str, np.ndarray]],
-        dbs: List[str],
-        db_type: Optional[str] = None
-    ) -> None:
-        self.load_blobs([name for name, data in blobs], dbs, db_type)
-        for name, data in blobs:
-            np.testing.assert_array_equal(workspace.FetchBlob(name), data)
-
-    def _read_minidb_entries(
-        self, path: Path
-    ) -> Generator[MiniDBEntry, None, None]:
-        """Read the entry information out of a minidb file.
-        """
-        header = struct.Struct("=ii")
-        with path.open("rb") as f:
-            while True:
-                buf = f.read(header.size)
-                if not buf:
-                    break
-                if len(buf) < header.size:
-                    raise Exception("early EOF in minidb header")
-                (key_len, value_len) = header.unpack(buf)
-                if key_len < 0 or value_len < 0:
-                    raise Exception(
-                        f"invalid minidb header: ({key_len}, {value_len})"
-                    )
-                key = f.read(key_len)
-                if len(key) < key_len:
-                    raise Exception("early EOF in minidb key")
-                f.seek(value_len, io.SEEK_CUR)
-                yield MiniDBEntry(key=key.decode("utf-8"), value_size=value_len)
-
-    def _read_chunk_info(self, path: Path) -> Dict[str, List[MiniDBEntry]]:
-        """Read a minidb file and return the names of each blob and how many
-        chunks are stored for that blob.
-        """
-        chunk_id_separator = "#%"
-        results: Dict[str, List[MiniDBEntry]] = {}
-        for entry in self._read_minidb_entries(path):
-            parts = entry.key.rsplit(chunk_id_separator, 1)
-            if len(parts) == 1:
-                # No chunk ID suffix: this key is a single un-chunked blob.
-                assert entry.key not in results
-                results[entry.key] = [entry]
-            else:
-                blob_name = parts[0]
-                results.setdefault(blob_name, [])
-                results[blob_name].append(entry)
-
-        return results
-
-    def _test_save_with_chunk_size(
-        self, num_elems: int, chunk_size: int, expected_num_chunks: int,
-    ) -> None:
-        tmp_folder = self.make_tempdir()
-        tmp_file = str(tmp_folder / "save.output")
-
-        blobs = self.create_test_blobs(num_elems)
-
-        # Saves the blobs to a local db.
-        save_op = core.CreateOperator(
-            "Save",
-            [name for name, data in blobs],
-            [],
-            absolute_path=1,
-            db=tmp_file,
-            db_type=self._db_type,
-            chunk_size=chunk_size,
-        )
-        self.assertTrue(workspace.RunOperatorOnce(save_op))
-
-        self.load_and_check_blobs(blobs, [tmp_file])
-
-        blob_chunks = self._read_chunk_info(Path(tmp_file))
-        for blob_name, chunks in blob_chunks.items():
-            self.assertEqual(len(chunks), expected_num_chunks)
-
-    def testSaveWithChunkSize(self) -> None:
-        num_elems = 1234
-        chunk_size = 32
-        expected_num_chunks = math.ceil(num_elems / chunk_size)
-        self._test_save_with_chunk_size(
-            num_elems=num_elems,
-            chunk_size=chunk_size,
-            expected_num_chunks=expected_num_chunks,
-        )
-
-    def testSaveWithDefaultChunkSize(self) -> None:
-        # This is the default value of the --caffe2_tensor_chunk_size flag from
-        # core/blob_serialization.cc
-        #
-        # Test with just slightly more than this to ensure that 2 chunks are
-        # used.
-        default_chunk_size = 1000000
-        self._test_save_with_chunk_size(
-            num_elems=default_chunk_size + 10,
-            chunk_size=-1,
-            expected_num_chunks=2,
-        )
-
-    def testSaveWithNoChunking(self) -> None:
-        default_chunk_size = 1000000
-        self._test_save_with_chunk_size(
-            num_elems=default_chunk_size + 10,
-            chunk_size=0,
-            expected_num_chunks=1,
-        )
-
-    def testSaveWithOptions(self) -> None:
-        tmp_folder = self.make_tempdir()
-        tmp_file = str(tmp_folder / "save.output")
-
-        num_elems = 1234
-        blobs = self.create_test_blobs(num_elems)
-
-        # Saves the blobs to a local db.
-        save_op = core.CreateOperator(
-            "Save",
-            [name for name, data in blobs],
-            [],
-            absolute_path=1,
-            db=tmp_file,
-            db_type=self._db_type,
-            chunk_size=40,
-            options=caffe2_pb2.SerializationOptions(
-                options=[
-                    BlobSerializationOptions(
-                        blob_name_regex="int16_data", chunk_size=10
-                    ),
-                    BlobSerializationOptions(
-                        blob_name_regex=".*16_data", chunk_size=20
-                    ),
-                    BlobSerializationOptions(
-                        blob_name_regex="float16_data", chunk_size=30
-                    ),
-                ],
-            ),
-        )
-        self.assertTrue(workspace.RunOperatorOnce(save_op))
-
-        self.load_and_check_blobs(blobs, [tmp_file])
-
-        blob_chunks = self._read_chunk_info(Path(tmp_file))
-        # We explicitly set a chunk_size of 10 for int16_data
-        self.assertEqual(
-            len(blob_chunks["int16_data"]), math.ceil(num_elems / 10)
-        )
-        # uint16_data should match the .*16_data pattern, and get a size of 20
-        self.assertEqual(
-            len(blob_chunks["uint16_data"]), math.ceil(num_elems / 20)
-        )
-        # float16_data also matches the .*16_data pattern, and gets a size of
-        # 20. The explicit float16_data rule comes after the .*16_data
-        # pattern, so it has lower precedence and is ignored.
-        self.assertEqual(
-            len(blob_chunks["float16_data"]), math.ceil(num_elems / 20)
-        )
-        # int64_data will get the default chunk_size of 40
-        self.assertEqual(
-            len(blob_chunks["int64_data"]), math.ceil(num_elems / 40)
-        )
-
-    def testSaveWithDBOptions(self) -> None:
-        num_elems = 1234
-        chunk_size = 32
-        expected_num_chunks = math.ceil(num_elems / chunk_size)
-
-        tmp_folder = self.make_tempdir()
-        tmp_file = str(tmp_folder / "save.output")
-
-        blobs = self.create_test_blobs(num_elems)
-
-        db_options = b"test_db_options"
-        # Saves the blobs to a local db.
-        save_op = core.CreateOperator(
-            "Save",
-            [name for name, data in blobs],
-            [],
-            absolute_path=1,
-            db=tmp_file,
-            db_type=self._db_type,
-            chunk_size=chunk_size,
-            db_options=db_options,
-        )
-        self.assertTrue(workspace.RunOperatorOnce(save_op))
-
-        self.load_and_check_blobs(blobs, [tmp_file])
-
-        blob_chunks = self._read_chunk_info(Path(tmp_file))
-        for blob_name, chunks in blob_chunks.items():
-            self.assertEqual(len(chunks), expected_num_chunks)
-
-    def testSaveFloatToBfloat16(self) -> None:
-        tmp_folder = self.make_tempdir()
-        tmp_file = str(tmp_folder / "save.output")
-
-        # Create 2 blobs with the same float data
-        float_data = np.random.random_sample(4000).astype(np.float32)
-        workspace.FeedBlob("float1", float_data)
-        workspace.FeedBlob("float2", float_data)
-        blob_names = ["float1", "float2"]
-
-        # Serialize the data, using bfloat16 serialization for one of the blobs
-        save_op = core.CreateOperator(
-            "Save",
-            blob_names,
-            [],
-            absolute_path=1,
-            db=tmp_file,
-            db_type=self._db_type,
-            options=caffe2_pb2.SerializationOptions(
-                options=[
-                    BlobSerializationOptions(
-                        blob_name_regex="float1",
-                        float_format=BlobSerializationOptions.FLOAT_BFLOAT16,
-                    ),
-                ],
-            ),
-        )
-        self.assertTrue(workspace.RunOperatorOnce(save_op))
-
-        # As long as fbgemm was available for us to perform bfloat16
-        # conversion, the serialized data for float1 should be almost half
-        # the size of float2
-        if workspace.has_fbgemm:
-            blob_chunks = self._read_chunk_info(Path(tmp_file))
-            self.assertEqual(len(blob_chunks["float1"]), 1, blob_chunks["float1"])
-            self.assertEqual(len(blob_chunks["float2"]), 1, blob_chunks["float2"])
-            self.assertLess(
-                blob_chunks["float1"][0].value_size,
-                0.6 * blob_chunks["float2"][0].value_size
-            )
-
-        self.load_blobs(blob_names, [tmp_file])
-
-        # float2 should be exactly the same as the input data
-        np.testing.assert_array_equal(workspace.FetchBlob("float2"), float_data)
-        # float1 should be close-ish to the input data
-        np.testing.assert_array_almost_equal(
-            workspace.FetchBlob("float1"), float_data, decimal=2
-        )
-
-    def testEstimateBlobSizes(self) -> None:
-        # Create some blobs to test with
-        float_data = np.random.random_sample(4000).astype(np.float32)
-        workspace.FeedBlob("float1", float_data)
-        workspace.FeedBlob("float2", float_data)
-        workspace.FeedBlob(
-            "float3", np.random.random_sample(2).astype(np.float32)
-        )
-        workspace.FeedBlob(
-            "ui16", np.random.randint(0, 0xffff, size=1024, dtype=np.uint16)
-        )
-
-        # Estimate the serialized size of the data.
-        # Request bfloat16 serialization for one of the float blobs, just to
-        # exercise size estimation when using this option.
-        options = caffe2_pb2.SerializationOptions(
-            options=[
-                BlobSerializationOptions(
-                    blob_name_regex="float1",
-                    float_format=BlobSerializationOptions.FLOAT_BFLOAT16,
-                    chunk_size=500,
-                ),
-            ],
-        )
-        get_blobs_op = core.CreateOperator(
-            "EstimateAllBlobSizes",
-            [],
-            ["blob_names", "blob_sizes"],
-            options=options,
-        )
-        self.assertTrue(workspace.RunOperatorOnce(get_blobs_op))
-        blob_names = workspace.FetchBlob("blob_names")
-        blob_sizes = workspace.FetchBlob("blob_sizes")
-
-        sizes_by_name: Dict[str, int] = {}
-        for idx, name in enumerate(blob_names):
-            sizes_by_name[name.decode("utf-8")] = blob_sizes[idx]
-
-        # Note that the output blob list will include our output blob names.
-        expected_blobs = [
-            "float1", "float2", "float3", "ui16",
-            "blob_names", "blob_sizes"
-        ]
-        self.assertEqual(set(sizes_by_name.keys()), set(expected_blobs))
-
-        def check_expected_blob_size(
-            name: str, num_elems: int, elem_size: int, num_chunks: int = 1
-        ) -> None:
-            # The estimation code applies a fixed 50 byte per-chunk overhead to
-            # account for the extra space required for other fixed TensorProto
-            # message fields.
-            per_chunk_overhead = 50
-            expected_size = (
-                (num_chunks * (len(name) + per_chunk_overhead))
-                + (num_elems * elem_size)
-            )
-            self.assertEqual(
-                sizes_by_name[name],
-                expected_size,
-                f"expected size mismatch for {name}"
-            )
-
-        check_expected_blob_size("ui16", 1024, 3)
-        check_expected_blob_size("float2", 4000, 4)
-        check_expected_blob_size("float3", 2, 4)
-
-        # Our serialization options request to split float1 into 500-element
-        # chunks when saving it. If fbgemm is available then the float1 blob
-        # will be serialized using 2 bytes per element instead of 4 bytes.
-        float1_num_chunks = 4000 // 500
-        if workspace.has_fbgemm:
-            check_expected_blob_size("float1", 4000, 2, float1_num_chunks)
-        else:
-            check_expected_blob_size("float1", 4000, 4, float1_num_chunks)
-
-        check_expected_blob_size("blob_names", len(expected_blobs), 50)
-        check_expected_blob_size("blob_sizes", len(expected_blobs), 8)
-
-        # Now actually save the blobs so we can compare our estimates
-        # to how big the serialized data actually is.
-        tmp_folder = self.make_tempdir()
-        tmp_file = str(tmp_folder / "save.output")
-        save_op = core.CreateOperator(
-            "Save",
-            list(sizes_by_name.keys()),
-            [],
-            absolute_path=1,
-            db=tmp_file,
-            db_type=self._db_type,
-            options=options,
-        )
-        self.assertTrue(workspace.RunOperatorOnce(save_op))
-
-        blob_chunks = self._read_chunk_info(Path(tmp_file))
-        saved_sizes: Dict[str, int] = {}
-        for blob_name, chunks in blob_chunks.items():
-            total_size = sum(chunk.value_size for chunk in chunks)
-            saved_sizes[blob_name] = total_size
-
-        # For sanity checking, ensure that our estimates aren't
-        # extremely far off
-        for name in expected_blobs:
-            estimated_size = sizes_by_name[name]
-            saved_size = saved_sizes[name]
-            difference = abs(estimated_size - saved_size)
-            error_pct = 100.0 * (difference / saved_size)
-            print(
-                f"{name}: estimated={estimated_size} actual={saved_size} "
-                f"error={error_pct:.2f}%"
-            )
-            # Don't check the blob_names blob. It is a string tensor, and we
-            # can't estimate string tensor sizes very well without knowing the
-            # individual string lengths. (Currently it requires 102 bytes to
-            # save, but we estimate 360).
-            if name == "blob_names":
-                continue
-            # Check that we are within 100 bytes, or within 25%.
-            # We are generally quite close for tensors with fixed-width fields
-            # (like float), but a little farther off for tensors that use
-            # varint encoding.
-            if difference > 100:
-                self.assertLess(error_pct, 25.0)
-
-
-if __name__ == '__main__':
-    unittest.main()
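Note on the minidb framing that `_read_minidb_entries` above depends on: each record is a native-endian `(key_len, value_len)` int32 header followed by the raw key and value bytes. A minimal standalone sketch of that framing, assuming nothing beyond what the deleted test itself parses (the writer helper here is hypothetical, for illustration only):

    import struct

    HEADER = struct.Struct("=ii")  # (key_len, value_len), native byte order

    def write_minidb_entries(path, entries):
        # Hypothetical writer: emit (key, value) byte pairs with the framing
        # the test's _read_minidb_entries expects.
        with open(path, "wb") as f:
            for key, value in entries:
                f.write(HEADER.pack(len(key), len(value)))
                f.write(key)
                f.write(value)

    def read_minidb_entries(path):
        # Mirror of the deleted test's parser: yield (key, value_size),
        # skipping over the value payload itself.
        with open(path, "rb") as f:
            while True:
                buf = f.read(HEADER.size)
                if not buf:
                    return
                key_len, value_len = HEADER.unpack(buf)
                key = f.read(key_len)
                f.seek(value_len, 1)  # 1 == io.SEEK_CUR
                yield key.decode("utf-8"), value_len

Chunked blobs show up in this stream as several records whose keys share a blob name plus a "#%<chunk-id>" suffix, which is what `_read_chunk_info` groups on.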
diff --git a/caffe2/python/operator_test/locally_connected_op_test.py b/caffe2/python/operator_test/locally_connected_op_test.py
deleted file mode 100644
index 445c3641573f..000000000000
--- a/caffe2/python/operator_test/locally_connected_op_test.py
+++ /dev/null
@@ -1,228 +0,0 @@
-
-
-
-
-import numpy as np
-from hypothesis import given, settings, assume
-import hypothesis.strategies as st
-
-from caffe2.python import core, utils, workspace
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-
-
-class TestLocallyConnectedOp(serial.SerializedTestCase):
-    @given(N=st.integers(1, 3),
-           C=st.integers(1, 3),
-           H=st.integers(1, 5),
-           W=st.integers(1, 5),
-           M=st.integers(1, 3),
-           kernel=st.integers(1, 3),
-           op_name=st.sampled_from(["LC", "LC2D"]),
-           order=st.sampled_from(["NCHW", "NHWC"]),
-           use_bias=st.booleans(),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_lc_2d(
-            self, N, C, H, W, M, kernel, op_name, order, use_bias, gc, dc):
-        if H < kernel:
-            kernel = H
-        if W < kernel:
-            kernel = W
-
-        assume(C == kernel * N)
-
-        op = core.CreateOperator(
-            op_name,
-            ["X", "W", "b"] if use_bias else ["X", "W"],
-            ["Y"],
-            kernels=[kernel, kernel],
-            order=order,
-            engine="",
-        )
-
-        Y_H = H - kernel + 1
-        Y_W = W - kernel + 1
-        if order == "NCHW":
-            X = np.random.rand(N, C, H, W).astype(np.float32) - 0.5
-            W = np.random.rand(Y_H, Y_W, M, C, kernel,
-                               kernel).astype(np.float32) - 0.5
-        else:
-            X = np.random.rand(N, H, W, C).astype(np.float32) - 0.5
-            W = np.random.rand(Y_H, Y_W, M, kernel, kernel,
-                               C).astype(np.float32) - 0.5
-        b = np.random.rand(Y_H, Y_W, M).astype(np.float32) - 0.5
-        inputs = [X, W, b] if use_bias else [X, W]
-
-        def lc_2d_nchw(X, W, b=None):
-            N, C, XH, XW = X.shape
-            YH, YW, M, _, KH, KW = W.shape
-
-            def conv(n, m, yh, yw):
-                sum = b[yh, yw, m] if b is not None else 0
-                for c in range(C):
-                    for kh in range(KH):
-                        for kw in range(KW):
-                            hh = yh + kh
-                            ww = yw + kw
-                            sum += X[n, c, hh, ww] * W[yh, yw, m, c, kh, kw]
-                return sum
-
-            output = np.zeros((N, M, YH, YW), dtype=np.float32)
-            for n in range(N):
-                for m in range(M):
-                    for yh in range(YH):
-                        for yw in range(YW):
-                            output[n, m, yh, yw] = conv(n, m, yh, yw)
-            return [output]
-
-        def lc_2d_nhwc(X, W, b=None):
-            XT = utils.NHWC2NCHW(X)
-            WT = np.transpose(W, [0, 1, 2, 5, 3, 4])
-            output = lc_2d_nchw(XT, WT, b)
-            return [utils.NCHW2NHWC(output[0])]
-
-        ref_op = lc_2d_nchw if order == "NCHW" else lc_2d_nhwc
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=inputs,
-            reference=ref_op,
-        )
-        self.assertDeviceChecks(dc, op, inputs, [0])
-        for i in range(len(inputs)):
-            self.assertGradientChecks(gc, op, inputs, i, [0])
-
-    @given(N=st.integers(1, 3),
-           C=st.integers(1, 3),
-           size=st.integers(1, 5),
-           M=st.integers(1, 3),
-           kernel=st.integers(1, 3),
-           op_name=st.sampled_from(["LC", "LC1D"]),
-           use_bias=st.booleans(),
-           **hu.gcs)
-    @settings(deadline=None)
-    # Deadline checking is disabled; this test had previously needed its
-    # timeout raised from 1 second to 5 for ROCm.
-    def test_lc_1d(self, N, C, size, M, kernel, op_name, use_bias, gc, dc):
-        if size < kernel:
-            kernel = size
-
-        op = core.CreateOperator(
-            op_name,
-            ["X", "W", "b"] if use_bias else ["X", "W"],
-            ["Y"],
-            kernels=[kernel],
-            order="NCHW",
-            engine="",
-        )
-
-        L = size - kernel + 1
-        X = np.random.rand(N, C, size).astype(np.float32) - 0.5
-        W = np.random.rand(L, M, C, kernel).astype(np.float32) - 0.5
-        b = np.random.rand(L, M).astype(np.float32) - 0.5
-        inputs = [X, W, b] if use_bias else [X, W]
-
-        def lc_1d_nchw(X, W, b=None):
-            N, C, XL = X.shape
-            YL, M, _, KL = W.shape
-
-            def conv(n, m, yl):
-                sum = b[yl, m] if b is not None else 0
-                for c in range(C):
-                    for kl in range(KL):
-                        ll = yl + kl
-                        sum += X[n, c, ll] * W[yl, m, c, kl]
-                return sum
-
-            output = np.zeros((N, M, YL), dtype=np.float32)
-            for n in range(N):
-                for m in range(M):
-                    for yl in range(YL):
-                        output[n, m, yl] = conv(n, m, yl)
-            return [output]
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=inputs,
-            reference=lc_1d_nchw,
-        )
-        self.assertDeviceChecks(dc, op, inputs, [0])
-        for i in range(len(inputs)):
-            self.assertGradientChecks(gc, op, inputs, i, [0])
-
-    @given(N=st.integers(1, 1),
-           C=st.integers(1, 1),
-           T=st.integers(2, 2),
-           H=st.integers(2, 2),
-           W=st.integers(2, 2),
-           M=st.integers(1, 1),
-           kernel=st.integers(2, 2),
-           op_name=st.sampled_from(["LC", "LC3D"]),
-           use_bias=st.booleans(),
-           **hu.gcs)
-    @settings(deadline=None)
-    def test_lc_3d(self, N, C, T, H, W, M, kernel, op_name, use_bias, gc, dc):
-        if T < kernel:
-            kernel = T
-        if H < kernel:
-            kernel = H
-        if W < kernel:
-            kernel = W
-
-        op = core.CreateOperator(
-            op_name,
-            ["X", "W", "b"] if use_bias else ["X", "W"],
-            ["Y"],
-            kernels=[kernel, kernel, kernel],
-            order="NCHW",
-            engine="",
-        )
-
-        Y_T = T - kernel + 1
-        Y_H = H - kernel + 1
-        Y_W = W - kernel + 1
-        X = np.random.rand(N, C, T, H, W).astype(np.float32) - 0.5
-        W = np.random.rand(Y_T, Y_H, Y_W, M, C, kernel,
-                           kernel, kernel).astype(np.float32) - 0.5
-        b = np.random.rand(Y_T, Y_H, Y_W, M).astype(np.float32) - 0.5
-        inputs = [X, W, b] if use_bias else [X, W]
-
-        def lc_3d_nchw(X, W, b=None):
-            N, C, XT, XH, XW = X.shape
-            YT, YH, YW, M, _, KT, KH, KW = W.shape
-
-            def conv(n, m, yt, yh, yw):
-                sum = b[yt, yh, yw, m] if b is not None else 0
-                for c in range(C):
-                    for kt in range(KT):
-                        for kh in range(KH):
-                            for kw in range(KW):
-                                tt = yt + kt
-                                hh = yh + kh
-                                ww = yw + kw
-                                sum += X[n, c, tt, hh, ww] * \
-                                    W[yt, yh, yw, m, c, kt, kh, kw]
-                return sum
-
-            output = np.zeros((N, M, YT, YH, YW), dtype=np.float32)
-            for n in range(N):
-                for m in range(M):
-                    for yt in range(YT):
-                        for yh in range(YH):
-                            for yw in range(YW):
-                                output[n, m, yt, yh, yw] = conv(
-                                    n, m, yt, yh, yw)
-            return [output]
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=inputs,
-            reference=lc_3d_nchw,
-        )
-        self.assertDeviceChecks(dc, op, inputs, [0])
-        for i in range(len(inputs)):
-            self.assertGradientChecks(gc, op, inputs, i, [0])
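As a cross-check on the loop-based references in the deleted file above, the 2D locally connected computation (per-output-position weights, unlike a convolution's shared weights) can be written as a single einsum over sliding windows. A sketch, not part of the test, assuming NumPy >= 1.20 for sliding_window_view:

    import numpy as np
    from numpy.lib.stride_tricks import sliding_window_view

    def lc_2d_nchw_vectorized(X, W, b=None):
        # X: (N, C, H, W); W: (YH, YW, M, C, KH, KW); b: (YH, YW, M) or None
        KH, KW = W.shape[-2:]
        # windows[n, c, yh, yw, kh, kw] == X[n, c, yh + kh, yw + kw]
        windows = sliding_window_view(X, (KH, KW), axis=(2, 3))
        # sum over the channel and kernel axes for every output position
        out = np.einsum("ncijpq,ijmcpq->nmij", windows, W)
        if b is not None:
            out += b.transpose(2, 0, 1)[None]  # (M, YH, YW), broadcast over N
        return out

This produces the same (N, M, YH, YW) output as the nested-loop `lc_2d_nchw` reference, just without the per-element Python overhead.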
diff --git a/caffe2/python/operator_test/loss_ops_test.py b/caffe2/python/operator_test/loss_ops_test.py
deleted file mode 100644
index f6a07ead3cf9..000000000000
--- a/caffe2/python/operator_test/loss_ops_test.py
+++ /dev/null
@@ -1,41 +0,0 @@
-
-
-
-
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import hypothesis.strategies as st
-import numpy as np
-
-
-class TestLossOps(serial.SerializedTestCase):
-
-    @serial.given(n=st.integers(1, 8), **hu.gcs)
-    def test_averaged_loss(self, n, gc, dc):
-        X = np.random.rand(n).astype(np.float32)
-
-        def avg_op(X):
-            return [np.mean(X)]
-
-        op = core.CreateOperator(
-            "AveragedLoss",
-            ["X"],
-            ["y"],
-        )
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=[X],
-            reference=avg_op,
-        )
-
-        self.assertGradientChecks(
-            device_option=gc,
-            op=op,
-            inputs=[X],
-            outputs_to_check=0,
-            outputs_with_grads=[0],
-        )
diff --git a/caffe2/python/operator_test/lpnorm_op_test.py b/caffe2/python/operator_test/lpnorm_op_test.py
deleted file mode 100644
index 2899ba929470..000000000000
--- a/caffe2/python/operator_test/lpnorm_op_test.py
+++ /dev/null
@@ -1,101 +0,0 @@
-
-
-
-
-import numpy as np
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-from hypothesis import given, settings
-import hypothesis.strategies as st
-
-
-class LpnormTest(hu.HypothesisTestCase):
-    def _test_Lp_Norm(self, inputs, gc, dc):
-        X = inputs[0]
-        # avoid kinks by moving away from 0
-        X += 0.02 * np.sign(X)
-        X[X == 0.0] += 0.02
-        self.ws.create_blob("X").feed(X)
-        op = core.CreateOperator(
-            'LpNorm',
-            ['X'],
-            ['l1_norm'],
-            p=1,
-        )
-        self.ws.run(op)
-
-        np.testing.assert_allclose(self.ws.blobs[("l1_norm")].fetch(),
-                                   np.linalg.norm((X).flatten(), ord=1),
-                                   rtol=1e-4, atol=1e-4)
-
-        self.assertDeviceChecks(dc, op, [X], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, threshold=1e-2)
-
-        op = core.CreateOperator(
-            'LpNorm',
-            ['X'],
-            ['l2_norm'],
-            p=2,
-        )
-        self.ws.run(op)
-
-        np.testing.assert_allclose(
-            self.ws.blobs[("l2_norm")].fetch(),
-            np.linalg.norm((X).flatten(), ord=2)**2,
-            rtol=1e-4,
-            atol=1e-4
-        )
-
-        self.assertDeviceChecks(dc, op, [X], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, threshold=1e-2)
-
-        op = core.CreateOperator(
-            'LpNorm',
-            ['X'],
-            ['l2_averaged_norm'],
-            p=2,
-            average=True
-        )
-        self.ws.run(op)
-
-        np.testing.assert_allclose(
-            self.ws.blobs[("l2_averaged_norm")].fetch(),
-            np.linalg.norm((X).flatten(), ord=2)**2 / X.size,
-            rtol=1e-4,
-            atol=1e-4
-        )
-
-    @given(inputs=hu.tensors(n=1,
-                             min_dim=1,
-                             max_dim=3,
-                             dtype=np.float32),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_Lp_Norm(self, inputs, gc, dc):
-        self._test_Lp_Norm(inputs, gc, dc)
-
-    def test_Lp_Norm_empty(self):
-        self._test_Lp_Norm([np.array([], dtype=np.float32)], hu.cpu_do, [hu.cpu_do])
-        self.assertEqual(self.ws.blobs["l1_norm"].fetch()[0], 0.0)
-        self.assertEqual(self.ws.blobs["l2_norm"].fetch()[0], 0.0)
-        self.assertTrue(np.isnan(self.ws.blobs["l2_averaged_norm"].fetch()[0]))
-
-    @given(x=hu.tensor(
-        min_dim=1, max_dim=10, dtype=np.float32,
-        elements=st.integers(min_value=-100, max_value=100)),
-        p=st.integers(1, 2),
-        average=st.integers(0, 1)
-    )
-    def test_lpnorm_shape_inference(self, x, p, average):
-        workspace.FeedBlob('x', x)
-
-        net = core.Net("lpnorm_test")
-        result = net.LpNorm(['x'], p=p, average=bool(average))
-        (shapes, types) = workspace.InferShapesAndTypes([net])
-        workspace.RunNetOnce(net)
-
-        self.assertEqual(shapes[result], list(workspace.blobs[result].shape))
-        self.assertEqual(types[result], core.DataType.FLOAT)
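One detail worth noting from the assertions above: with p=2 the LpNorm operator returns the sum of squares (the squared L2 norm), which is why the test compares against `np.linalg.norm(...) ** 2`. In plain NumPy terms:

    import numpy as np

    x = np.array([3.0, 4.0], dtype=np.float32)
    l1 = np.abs(x).sum()           # LpNorm with p=1            -> 7.0
    l2_sq = (x * x).sum()          # LpNorm with p=2            -> 25.0 (not 5.0)
    l2_avg = l2_sq / x.size        # LpNorm with p=2, average=1 -> 12.5

The average=True variant divides by the element count, which also explains the NaN assertion for the empty-tensor case (0.0 / 0).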
diff --git a/caffe2/python/operator_test/map_ops_test.py b/caffe2/python/operator_test/map_ops_test.py
deleted file mode 100644
index dcc8b295f7c3..000000000000
--- a/caffe2/python/operator_test/map_ops_test.py
+++ /dev/null
@@ -1,70 +0,0 @@
-
-
-
-
-import itertools
-import numpy as np
-import tempfile
-import unittest
-import os
-
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-
-
-class TestMap(hu.HypothesisTestCase):
-
-    def test_create_map(self):
-        dtypes = [core.DataType.INT32, core.DataType.INT64]
-        for key_dtype, value_dtype in itertools.product(dtypes, dtypes):
-            op = core.CreateOperator(
-                'CreateMap',
-                [],
-                ['map'],
-                key_dtype=key_dtype,
-                value_dtype=value_dtype,
-            )
-            workspace.RunOperatorOnce(op)
-            self.assertTrue(workspace.HasBlob('map'))
-
-    def test_map(self):
-
-        def test_map_func(KEY_T, VALUE_T):
-            model_file = os.path.join(tempfile.mkdtemp(), 'db')
-            key_data = np.asarray([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=KEY_T)
-            value_data = np.asarray([2, 3, 3, 3, 3, 2, 3, 3, 3, 3], dtype=VALUE_T)
-            workspace.FeedBlob("key_data", key_data)
-            workspace.FeedBlob("value_data", value_data)
-            save_net = core.Net("save_net")
-            save_net.KeyValueToMap(["key_data", "value_data"], "map_data")
-            save_net.Save(
-                ["map_data"], [],
-                db=model_file,
-                db_type="minidb",
-                absolute_path=True
-            )
-            workspace.RunNetOnce(save_net)
-            workspace.ResetWorkspace()
-            load_net = core.Net("load_net")
-            load_net.Load(
-                [], ["map_data"],
-                db=model_file,
-                db_type="minidb",
-                load_all=True,
-                absolute_path=True
-            )
-            load_net.MapToKeyValue("map_data", ["key_data", "value_data"])
-            workspace.RunNetOnce(load_net)
-            key_data2 = workspace.FetchBlob("key_data")
-            value_data2 = workspace.FetchBlob("value_data")
-            assert(set(zip(key_data, value_data)) == set(zip(key_data2, value_data2)))
-
-        test_map_func(np.int64, np.int64)
-        test_map_func(np.int64, np.int32)
-        test_map_func(np.int32, np.int32)
-        test_map_func(np.int32, np.int64)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/caffe2/python/operator_test/margin_loss_l2r_operator_test.py b/caffe2/python/operator_test/margin_loss_l2r_operator_test.py
deleted file mode 100644
index 1001335c181f..000000000000
--- a/caffe2/python/operator_test/margin_loss_l2r_operator_test.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import caffe2.python.hypothesis_test_util as hu
-import hypothesis.strategies as st
-import numpy as np
-from caffe2.python import core, workspace
-from hypothesis import given
-
-
-class TestMarginLossL2rOps(hu.HypothesisTestCase):
-    def ref_margin_loss(self, y, r, margin):
-        n = len(y)
-        dy = np.zeros(n)
-        loss = 0
-        if np.sum(np.abs(r)) < 1e-6:
-            return loss, dy
-
-        for i in range(n):
-            for j in range(i + 1, n):
-                weight = 1.0 / n
-                diff = 1 if r[i] - r[j] > 0 else -1
-                if (margin > (y[i] - y[j]) * diff) and (r[i] != r[j]):
-                    loss += weight * (margin - (y[i] - y[j]) * diff)
-                    dy[i] += -diff * weight
-                    dy[j] += diff * weight
-        return loss, dy
-
-    @given(
-        n=st.integers(10, 10),
-        k=st.integers(2, 5),
-        m=st.integers(1, 5),
-        **hu.gcs_cpu_only
-    )
-    def test_session_margin_loss(self, n, k, m, gc, dc):
-        y = np.random.rand(n * m).astype(np.float32)
-        r = np.random.randint(k, size=n * m).astype(np.float32)
-        # m sessions of length n
-        session_lengths = np.repeat(n, m).astype(np.int32)
-        ref_loss = np.empty(0)
-        ref_scale_loss = np.empty(0)
-        ref_dy = np.empty(0)
-        ref_scale_dy = np.empty(0)
-        for i in range(m):
-            r_loss, r_dy = self.ref_margin_loss(
-                y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], 0.06
-            )
-            r_scale_loss, r_scale_dy = self.ref_margin_loss(
-                y[(i) * n : (i + 1) * n], r[(i) * n : (i + 1) * n], 0.04
-            )
-            ref_loss = np.append(ref_loss, r_loss)
-            ref_dy = np.append(ref_dy, r_dy)
-            ref_scale_loss = np.append(ref_scale_loss, r_scale_loss)
-            ref_scale_dy = np.append(ref_scale_dy, r_scale_dy)
-
-        dloss = np.random.random(m).astype(np.float32)
-
-        workspace.blobs["pred"] = y
-        workspace.blobs["label"] = r
-        workspace.blobs["session_lengths"] = session_lengths
-        workspace.blobs["dloss"] = dloss
-
-        # Test with margin = 0.06
-        op = core.CreateOperator(
-            "SessionMarginLoss",
-            ["pred", "label", "session_lengths"],
-            ["loss", "dpred"],
-            margin=0.06,
-        )
-        workspace.RunOperatorOnce(op)
-        loss = workspace.blobs["loss"]
-        dy = workspace.blobs["dpred"]
-        np.testing.assert_allclose(loss, ref_loss, rtol=1e-5, atol=1e-6)
-        np.testing.assert_allclose(dy, ref_dy, rtol=1e-5, atol=1e-6)
-        name = op.output[0]
-        arr = workspace.FetchBlob(name)
-        self.assertGradientChecks(
-            gc, op, [y, r, session_lengths], 0, [0], stepsize=1e-3, threshold=2e-1
-        )
-
-        # Test with a different margin (0.04)
-        op = core.CreateOperator(
-            "SessionMarginLoss",
-            ["pred", "label", "session_lengths"],
-            ["loss", "dpred"],
-            margin=0.04,
-        )
-        workspace.RunOperatorOnce(op)
-        loss = workspace.blobs["loss"]
-        dy = workspace.blobs["dpred"]
-        np.testing.assert_allclose(loss, ref_scale_loss, rtol=1e-5, atol=1e-6)
-        np.testing.assert_allclose(dy, ref_scale_dy, rtol=1e-5, atol=1e-6)
-        self.assertGradientChecks(
-            gc, op, [y, r, session_lengths], 0, [0], stepsize=1e-3, threshold=2e-1
-        )
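The O(n^2) Python loops in `ref_margin_loss` above can be collapsed into array operations. A sketch of an equivalent vectorized reference under the same conventions (pairs i < j, equal labels skipped, per-pair weight 1/n, strict hinge); this is illustrative and was not part of the deleted test:

    import numpy as np

    def margin_loss_vectorized(y, r, margin):
        n = len(y)
        if np.sum(np.abs(r)) < 1e-6:
            return 0.0, np.zeros(n)
        diff_y = y[:, None] - y[None, :]                    # y[i] - y[j]
        s = np.where(r[:, None] - r[None, :] > 0, 1.0, -1.0)
        # only pairs with i < j and distinct labels contribute
        active = np.triu(np.ones((n, n), dtype=bool), k=1)
        active &= r[:, None] != r[None, :]
        hinge = margin - diff_y * s
        violating = active & (hinge > 0)
        w = 1.0 / n
        loss = w * hinge[violating].sum()
        # per-pair gradient: dy[i] += -s*w, dy[j] += +s*w
        contrib = np.where(violating, -s * w, 0.0)
        dy = contrib.sum(axis=1) - contrib.sum(axis=0)
        return loss, dy

This matches the loop reference pair for pair, so it could have served as a drop-in replacement for `ref_margin_loss` in the per-session loop.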
diff --git a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py b/caffe2/python/operator_test/margin_ranking_criterion_op_test.py
deleted file mode 100644
index a91de60a8c19..000000000000
--- a/caffe2/python/operator_test/margin_ranking_criterion_op_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
-
-
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-from hypothesis import given, settings
-import hypothesis.strategies as st
-import numpy as np
-
-
-class TestMarginRankingCriterion(serial.SerializedTestCase):
-    @given(N=st.integers(min_value=10, max_value=20),
-           seed=st.integers(min_value=0, max_value=65535),
-           margin=st.floats(min_value=-0.5, max_value=0.5),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_margin_ranking_criterion(self, N, seed, margin, gc, dc):
-        np.random.seed(seed)
-        X1 = np.random.randn(N).astype(np.float32)
-        X2 = np.random.randn(N).astype(np.float32)
-        Y = np.random.choice([-1, 1], size=N).astype(np.int32)
-        op = core.CreateOperator(
-            "MarginRankingCriterion", ["X1", "X2", "Y"], ["loss"],
-            margin=margin)
-
-        def ref_cec(X1, X2, Y):
-            result = np.maximum(-Y * (X1 - X2) + margin, 0)
-            return (result, )
-
-        inputs = [X1, X2, Y]
-        # This checks the op implementation against a reference function
-        # written in Python.
-        self.assertReferenceChecks(gc, op, inputs, ref_cec)
-        # This checks the op implementation over multiple device options
-        # (e.g. CPU and CUDA). [0] means that the 0-th output is checked.
-        self.assertDeviceChecks(dc, op, inputs, [0])
-
-        # Make singular points less sensitive
-        X1[np.abs(margin - Y * (X1 - X2)) < 0.1] += 0.1
-        X2[np.abs(margin - Y * (X1 - X2)) < 0.1] -= 0.1
-
-        # Check dX1
-        self.assertGradientChecks(gc, op, inputs, 0, [0])
-        # Check dX2
-        self.assertGradientChecks(gc, op, inputs, 1, [0])
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/math_ops_test.py b/caffe2/python/operator_test/math_ops_test.py
deleted file mode 100644
index 4849b83648f8..000000000000
--- a/caffe2/python/operator_test/math_ops_test.py
+++ /dev/null
@@ -1,55 +0,0 @@
-
-
-
-
-from caffe2.python import core
-from hypothesis import given, settings
-from hypothesis import strategies as st
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-import numpy as np
-import unittest
-
-
-class TestMathOps(serial.SerializedTestCase):
-
-    @given(X=hu.tensor(),
-           exponent=st.floats(min_value=2.0, max_value=3.0),
-           **hu.gcs)
-    def test_elementwise_power(self, X, exponent, gc, dc):
-        # a negative base raised to a non-integer exponent is a domain error
-        X = np.abs(X)
-
-        def powf(X):
-            return (X ** exponent,)
-
-        def powf_grad(g_out, outputs, fwd_inputs):
-            return (exponent * (fwd_inputs[0] ** (exponent - 1)) * g_out,)
-
-        op = core.CreateOperator(
-            "Pow", ["X"], ["Y"], exponent=exponent)
-
-        self.assertReferenceChecks(gc, op, [X], powf,
-                                   output_to_grad="Y",
-                                   grad_reference=powf_grad,
-                                   ensure_outputs_are_inferred=True)
-
-    @given(X=hu.tensor(),
-           exponent=st.floats(min_value=-3.0, max_value=3.0),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_sign(self, X, exponent, gc, dc):
-        def signf(X):
-            return [np.sign(X)]
-
-        op = core.CreateOperator(
-            "Sign", ["X"], ["Y"])
-
-        self.assertReferenceChecks(
-            gc, op, [X], signf, ensure_outputs_are_inferred=True)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/caffe2/python/operator_test/matmul_op_test.py b/caffe2/python/operator_test/matmul_op_test.py
deleted file mode 100644
index 067eeabbe2d9..000000000000
--- a/caffe2/python/operator_test/matmul_op_test.py
+++ /dev/null
@@ -1,289 +0,0 @@
-
-
-
-
-import inspect
-
-import numpy as np
-
-from hypothesis import assume, given, settings
-import hypothesis.strategies as st
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-
-class TestMatMul(serial.SerializedTestCase):
-    @serial.given(
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        trans_a=st.booleans(),
-        trans_b=st.booleans(),
-        **hu.gcs
-    )
-    def test_matmul(self, M, K, N, trans_a, trans_b, gc, dc):
-        X = np.random.rand(M, K).astype(np.float32) - 0.5
-        if trans_a:
-            X = X.transpose()
-
-        Y = np.random.rand(K, N).astype(np.float32) - 0.5
-        if trans_b:
-            Y = Y.transpose()
-
-        op = core.CreateOperator(
-            'MatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
-        )
-
-        def matmul_ref(X, Y, trans_a, trans_b):
-            XX = X.transpose() if trans_a else X
-            YY = Y.transpose() if trans_b else Y
-            return (XX.dot(YY), )
-
-        # Check against numpy reference
-        self.assertReferenceChecks(gc, op, [X, Y, trans_a, trans_b], matmul_ref)
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X, Y], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
-        # Gradient check wrt Y
-        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
-
-    @given(
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        axis_a=st.sampled_from([-3, -2, -1, 1, 2, 3]),
-        axis_b=st.sampled_from([-3, -2, -1, 1, 2, 3]),
-        trans_a=st.booleans(),
-        trans_b=st.booleans(),
-        **hu.gcs
-    )
-    @settings(deadline=10000)
-    def test_matmul_axis(
-        self, M, K, N, axis_a, axis_b, trans_a, trans_b, gc, dc
-    ):
-        X = np.random.rand(M, K).astype(np.float32) - 0.5
-        if trans_a:
-            X = X.transpose()
-        shape_x = [X.shape[0], 1, 1, 1]
-        shape_x[axis_a] = X.shape[1]
-        X = X.reshape(*shape_x)
-
-        Y = np.random.rand(K, N).astype(np.float32) - 0.5
-        if trans_b:
-            Y = Y.transpose()
-        shape_y = [Y.shape[0], 1, 1, 1]
-        shape_y[axis_b] = Y.shape[1]
-        Y = Y.reshape(*shape_y)
-        op = core.CreateOperator(
-            'MatMul', ['X', 'Y'],
-            'out',
-            axis_a=axis_a,
-            axis_b=axis_b,
-            trans_a=trans_a,
-            trans_b=trans_b
-        )
-
-        def size_to_dim(X, axis):
-            dim = 1
-            for i in range(axis):
-                dim *= X.shape[i]
-            return dim
-
-        def size_from_dim(X, axis):
-            dim = 1
-            for i in range(axis, X.ndim):
-                dim *= X.shape[i]
-            return dim
-
-        def reshape(X, axis):
-            dim_0, dim_1 = size_to_dim(X, axis), size_from_dim(X, axis)
-            return X.reshape(dim_0, dim_1)
-
-        def canonical_axis(axis, ndim):
-            return ndim + axis if axis < 0 else axis
-
-        def matmul_ref(X, Y, axis_a, axis_b, trans_a, trans_b):
-            can_axis_a = canonical_axis(axis_a, X.ndim)
-            can_axis_b = canonical_axis(axis_b, Y.ndim)
-            X, Y = reshape(X, can_axis_a), reshape(Y, can_axis_b)
-            XX = X.transpose() if trans_a else X
-            YY = Y.transpose() if trans_b else Y
-            return (XX.dot(YY), )
-
-        # Check against numpy reference
-        self.assertReferenceChecks(
-            gc, op, [X, Y, axis_a, axis_b, trans_a, trans_b], matmul_ref
-        )
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X, Y], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X, Y], 0, [0])
-        # Gradient check wrt Y
-        self.assertGradientChecks(gc, op, [X, Y], 1, [0])
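To make the axis semantics in `matmul_ref` above concrete: MatMul with axis_a/axis_b first flattens each input into a 2-D matrix around the canonical axis, exactly as the size_to_dim/size_from_dim helpers compute. A small worked example of that flattening (illustrative only):

    import numpy as np

    X = np.zeros((2, 3, 4, 5), dtype=np.float32)
    axis = 2                                # canonical_axis(2, X.ndim) == 2
    dim_0 = int(np.prod(X.shape[:axis]))    # 2 * 3 = 6
    dim_1 = int(np.prod(X.shape[axis:]))    # 4 * 5 = 20
    X2d = X.reshape(dim_0, dim_1)           # (6, 20): the matrix MatMul sees
    # a negative axis counts from the end: canonical_axis(-1, 4) == 3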
-
-class TestBatchMatMul(serial.SerializedTestCase):
-    @settings(max_examples=30, deadline=None)
-    @given(
-        C=st.integers(min_value=0, max_value=3),  # number of batch dims
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        trans_a=st.booleans(),
-        trans_b=st.booleans(),
-        dtype=st.sampled_from([np.float32, np.float16]),
-        **hu.gcs
-    )
-    def test_batch_matmul(self, C, M, K, N, trans_a, trans_b, dtype, gc, dc):
-        if dtype == np.float16:
-            # fp16 is only supported with CUDA/HIP
-            assume(core.IsGPUDeviceType(gc.device_type))
-            dc = [d for d in dc if core.IsGPUDeviceType(d.device_type)]
-
-        batch_dims = np.random.randint(
-            low=1,
-            high=3,
-            size=C,
-            dtype=np.int64).tolist()
-        X = np.random.rand(*(batch_dims + [M, K])).astype(dtype) - 0.5
-        if trans_a:
-            X = X.swapaxes(-1, -2)
-        Y = np.random.rand(*(batch_dims + [K, N])).astype(dtype) - 0.5
-        if trans_b:
-            Y = Y.swapaxes(-1, -2)
-
-        op = core.CreateOperator(
-            'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
-        )
-
-        def matmul_ref(X, Y, trans_a, trans_b, dtype):
-            XX = (X.swapaxes(-1, -2) if trans_a else X).astype(np.float32)
-            YY = (Y.swapaxes(-1, -2) if trans_b else Y).astype(np.float32)
-            return (np.matmul(XX, YY).astype(dtype),)
-
-        # relaxing the "threshold" for fp16 to 150x of the default
-        def relax_fp16_check(check_func, *args, **kwargs):
-            # inspect the default "threshold" value in check_func
-            argspec = inspect.getargspec(check_func)
-            threshold = argspec.defaults[
-                argspec.args.index('threshold') -
-                (len(argspec.args) - len(argspec.defaults))]
-
-            if dtype == np.float16:
-                threshold = 150 * threshold
-            check_func(*args, threshold=threshold, **kwargs)
-
-        # Check against numpy reference
-        relax_fp16_check(self.assertReferenceChecks, gc, op, [X, Y, trans_a, trans_b, dtype], matmul_ref)
-        # Check over multiple devices
-        relax_fp16_check(self.assertDeviceChecks, dc, op, [X, Y], [0])
-        # Gradient check wrt X
-        relax_fp16_check(self.assertGradientChecks, gc, op, [X, Y], 0, [0])
-        # Gradient check wrt Y
-        relax_fp16_check(self.assertGradientChecks, gc, op, [X, Y], 1, [0])
-
-    def _test_batch_matmul_with_broadcast_common(
-        self,
-        X,
-        Y,
-        dtype,
-        gc,
-        dc,
-        trans_a=None,
-        trans_b=None,
-    ):
-        if trans_a is not None and trans_b is not None:
-            op = core.CreateOperator(
-                'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b, broadcast=1
-            )
-        else:
-            op = core.CreateOperator(
-                'BatchMatMul', ['X', 'Y'], 'out', broadcast=1
-            )
-
-        def matmul_ref(X, Y, trans_a, trans_b, dtype):
-            XX = (X.swapaxes(-1, -2) if trans_a else X).astype(np.float32)
-            YY = (Y.swapaxes(-1, -2) if trans_b else Y).astype(np.float32)
-            return (np.matmul(XX, YY).astype(dtype),)
-
-        # Check against numpy reference
-        self.assertReferenceChecks(gc, op, [X, Y, trans_a, trans_b, dtype], matmul_ref)
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X, Y], [0])
-
-    @given(
-        C_1=st.integers(min_value=0, max_value=3),  # number of batch dims
-        C_2=st.integers(min_value=0, max_value=3),
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        trans_a=st.booleans(),
-        trans_b=st.booleans(),
-        **hu.gcs
-    )
-    @settings(deadline=10000)
-    def test_numpy_batch_matmul(self, C_1, C_2, M, K, N, trans_a, trans_b, gc, dc):
-        dtype = np.float32
-        batch_dims = np.random.randint(
-            low=0,
-            high=3,
-            size=max(C_1, C_2),
-            dtype=np.int64).tolist()
-        lbd = len(batch_dims)
-        X = np.random.rand(*(batch_dims[lbd - C_1:] + [M, K])).astype(dtype) - 0.5
-        if trans_a:
-            X = X.swapaxes(-1, -2)
-        Y = np.random.rand(*(batch_dims[lbd - C_2:] + [K, N])).astype(dtype) - 0.5
-        if trans_b:
-            Y = Y.swapaxes(-1, -2)
-
-        self._test_batch_matmul_with_broadcast_common(X, Y, dtype, gc, dc, trans_a, trans_b)
-
-    @settings(max_examples=30, deadline=None)
-    @given(
-        K=st.integers(min_value=1, max_value=10),
-        **hu.gcs
-    )
-    def test_numpy_batch_matmul_1d(self, K, gc, dc):
-        dtype = np.float32
-        X = np.random.rand(K).astype(dtype) - 0.5
-        # TODO: test trans_a and trans_b
-        Y = np.random.rand(K).astype(dtype) - 0.5
-
-        self._test_batch_matmul_with_broadcast_common(X, Y, dtype, gc, dc)
-
-    @settings(max_examples=30, deadline=None)
-    @given(
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        **hu.gcs
-    )
-    def test_numpy_batch_matmul_1d_2d(self, K, N, gc, dc):
-        dtype = np.float32
-        X = np.random.rand(K).astype(dtype) - 0.5
-        # TODO: test trans_a and trans_b
-        Y = np.random.rand(*[K, N]).astype(dtype) - 0.5
-
-        self._test_batch_matmul_with_broadcast_common(X, Y, dtype, gc, dc)
-
-    @settings(max_examples=30, deadline=None)
-    @given(
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        **hu.gcs
-    )
-    def test_numpy_batch_matmul_2d_1d(self, M, K, gc, dc):
-        dtype = np.float32
-        X = np.random.rand(*[M, K]).astype(dtype) - 0.5
-        # TODO: test trans_a and trans_b
-        Y = np.random.rand(K).astype(dtype) - 0.5
-
-        self._test_batch_matmul_with_broadcast_common(X, Y, dtype, gc, dc)
-
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/mean_op_test.py b/caffe2/python/operator_test/mean_op_test.py
deleted file mode 100644
index a549426170ce..000000000000
--- a/caffe2/python/operator_test/mean_op_test.py
+++ /dev/null
@@ -1,62 +0,0 @@
-
-
-
-
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import hypothesis.strategies as st
-import numpy as np
-import unittest
-
-
-class TestMean(serial.SerializedTestCase):
-    @serial.given(
-        k=st.integers(1, 5),
-        n=st.integers(1, 10),
-        m=st.integers(1, 10),
-        in_place=st.booleans(),
-        seed=st.integers(0, 2**32 - 1),
-        **hu.gcs
-    )
-    def test_mean(self, k, n, m, in_place, seed, gc, dc):
-        np.random.seed(seed)
-        input_names = []
-        input_vars = []
-
-        for i in range(k):
-            X_name = 'X' + str(i)
-            input_names.append(X_name)
-            var = np.random.randn(n, m).astype(np.float32)
-            input_vars.append(var)
-
-        def mean_ref(*args):
-            return [np.mean(args, axis=0)]
-
-        op = core.CreateOperator(
-            "Mean",
-            input_names,
-            ['Y' if not in_place else 'X0'],
-        )
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=input_vars,
-            reference=mean_ref,
-        )
-
-        self.assertGradientChecks(
-            device_option=gc,
-            op=op,
-            inputs=input_vars,
-            outputs_to_check=0,
-            outputs_with_grads=[0],
-        )
-
-        self.assertDeviceChecks(dc, op, input_vars, [0])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/caffe2/python/operator_test/merge_id_lists_op_test.py b/caffe2/python/operator_test/merge_id_lists_op_test.py
deleted file mode 100644
index 36b765557505..000000000000
--- a/caffe2/python/operator_test/merge_id_lists_op_test.py
+++ /dev/null
@@ -1,81 +0,0 @@
-
-
-
-
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-import hypothesis.extra.numpy as hnp
-import hypothesis.strategies as st
-import numpy as np
-
-
-@st.composite
-def id_list_batch(draw):
-    num_inputs = draw(st.integers(1, 3))
-    batch_size = draw(st.integers(5, 10))
-    values_dtype = draw(st.sampled_from([np.int32, np.int64]))
-    inputs = []
-    for _ in range(num_inputs):
-        size = draw(st.integers(5, 10))
-        values = draw(hnp.arrays(values_dtype, size, st.integers(1, 10)))
-        lengths = draw(hu.lengths(len(values),
-                                  min_segments=batch_size,
-                                  max_segments=batch_size))
-        inputs.append(lengths)
-        inputs.append(values)
-    return inputs
-
-
-def merge_id_lists_ref(*args):
-    n = len(args)
-    assert n > 0
-    assert n % 2 == 0
-    batch_size = len(args[0])
-    num_inputs = int(n / 2)
-    lengths = np.array([np.insert(args[2 * i], 0, 0)
-                        for i in range(num_inputs)])
-    values = [args[2 * i + 1] for i in range(num_inputs)]
-    offsets = [np.cumsum(lengths[j]) for j in range(num_inputs)]
-
-    def merge_arrays(vs, offs, j):
-        concat = np.concatenate([vs[i][offs[i][j]:offs[i][j + 1]]
-                                 for i in range(num_inputs)])
-        return np.sort(np.unique(concat))
-
-    merged = [merge_arrays(values, offsets, j) for j in range(batch_size)]
-    merged_lengths = np.array([len(x) for x in merged])
-    merged_values = np.concatenate(merged)
-    return merged_lengths, merged_values
-
-
-class TestMergeIdListsOp(serial.SerializedTestCase):
-    def test_merge_id_lists_ref(self):
-        # Verify that the reference implementation is correct!
-        lengths_0 = np.array([3, 0, 4], dtype=np.int32)
-        values_0 = np.array([1, 5, 6, 2, 4, 5, 6], dtype=np.int64)
-        lengths_1 = np.array([3, 2, 1], dtype=np.int32)
-        values_1 = np.array([5, 8, 9, 14, 9, 5], dtype=np.int64)
-
-        merged_lengths, merged_values = merge_id_lists_ref(
-            lengths_0, values_0, lengths_1, values_1)
-        expected_lengths = np.array([5, 2, 4], dtype=np.int32)
-        expected_values = np.array([1, 5, 6, 8, 9, 9, 14, 2, 4, 5, 6], dtype=np.int64)
-
-        np.testing.assert_array_equal(merged_lengths, expected_lengths)
-        np.testing.assert_array_equal(merged_values, expected_values)
-
-    @serial.given(inputs=id_list_batch(), **hu.gcs_cpu_only)
-    def test_merge_id_lists_op(self, inputs, gc, dc):
-        num_inputs = int(len(inputs) / 2)
-        op = core.CreateOperator(
-            "MergeIdLists",
-            ["{prefix}_{i}".format(prefix=p, i=i)
-             for i in range(num_inputs)
-             for p in ["lengths", "values"]],
-            ["merged_lengths", "merged_values"]
-        )
-        self.assertDeviceChecks(dc, op, inputs, [0])
-        self.assertReferenceChecks(gc, op, inputs, merge_id_lists_ref)
diff --git a/caffe2/python/operator_test/mkl_conv_op_test.py b/caffe2/python/operator_test/mkl_conv_op_test.py
deleted file mode 100644
index 595debf977fe..000000000000
--- a/caffe2/python/operator_test/mkl_conv_op_test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-
-
-
-
-import unittest
-import hypothesis.strategies as st
-from hypothesis import given, settings
-import numpy as np
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.mkl_test_util as mu
-
-
-@unittest.skipIf(not workspace.C.has_mkldnn,
-                 "Skipping as we do not have mkldnn.")
-class MKLConvTest(hu.HypothesisTestCase):
-    @given(stride=st.integers(1, 3),
-           pad=st.integers(0, 3),
-           kernel=st.integers(3, 5),
-           size=st.integers(8, 8),
-           input_channels=st.integers(1, 3),
-           output_channels=st.integers(1, 3),
-           batch_size=st.integers(1, 3),
-           **mu.gcs)
-    @settings(max_examples=2, deadline=100)
-    def test_mkl_convolution(self, stride, pad, kernel, size,
-                             input_channels, output_channels,
-                             batch_size, gc, dc):
-        op = core.CreateOperator(
-            "Conv",
-            ["X", "w", "b"],
-            ["Y"],
-            stride=stride,
-            pad=pad,
-            kernel=kernel,
-        )
-        X = np.random.rand(
-            batch_size, input_channels, size, size).astype(np.float32) - 0.5
-        w = np.random.rand(
-            output_channels, input_channels, kernel, kernel) \
-            .astype(np.float32) - 0.5
-        b = np.random.rand(output_channels).astype(np.float32) - 0.5
-
-        inputs = [X, w, b]
-        self.assertDeviceChecks(dc, op, inputs, [0])
-
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/mkl_packed_fc_op_test.py b/caffe2/python/operator_test/mkl_packed_fc_op_test.py
deleted file mode 100644
index 2f889d693444..000000000000
--- a/caffe2/python/operator_test/mkl_packed_fc_op_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-
-
-
-
-import unittest
-import hypothesis.strategies as st
-from hypothesis import given
-import numpy as np
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-
-
-@unittest.skipIf(not core.IsOperator("PackedFC"),
-                 "PackedFC is not supported in this caffe2 build.")
-class PackedFCTest(hu.HypothesisTestCase):
-    @given(seed=st.integers(0, 65536),
-           M=st.integers(16, 32),
-           K=st.integers(128, 1024),
-           N=st.integers(128, 1024),
-           **hu.gcs_cpu_only)
-    @unittest.skipIf(not core.C.builtin_cpu_supports_avx2(),
-                     "Intel MKL sgemm_pack has a known numerical issue with "
-                     "non-avx2 machines that will be fixed in a later build.")
-    def test_packed_fc(self, seed, M, K, N, gc, dc):
-        np.random.seed(seed)
-        X = np.random.rand(M, K).astype(np.float32) - 0.5
-        W = np.random.rand(N, K).astype(np.float32) - 0.5
-        b = np.random.rand(N).astype(np.float32) - 0.5
-
-        # If you are debugging, the following hard-coded ones might help.
-        # X = np.ones((24, 256)).astype(np.float32)
-        # W = np.ones((128, 256)).astype(np.float32)
-        # b = np.zeros(128).astype(np.float32)
-
-        def ref(X, W, b):
-            return (np.dot(X, W.T) + b,)
-
-        for name in ["FC", "PackedFC"]:
-            op = core.CreateOperator(
-                name,
-                ["X", "W", "b"],
-                ["Y"],
-            )
-            self.assertReferenceChecks(gc, op, [X, W, b], ref)
-
-    @unittest.skipIf(not core.C.builtin_cpu_supports_avx2(),
-                     "Intel MKL sgemm_pack has a known numerical issue with "
-                     "non-avx2 machines that will be fixed in a later build.")
-    @given(axis=st.integers(min_value=1, max_value=4),
-           num_output=st.integers(min_value=4, max_value=8),
-           **hu.gcs_cpu_only)
-    def test_packed_fc_axis(self, axis, num_output, gc, dc):
-        np.random.seed(1701)
-        X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32)
-        K = np.prod(X.shape[axis:])
-        N = num_output
-        W = np.random.randn(N, K).astype(np.float32)
-        b = np.random.randn(N).astype(np.float32)
-
-        op = core.CreateOperator(
-            "PackedFC",
-            ["X", "W", "b"],
-            ["Y"],
-            axis=axis)
-
-        def ref(X, W, b):
-            output_axes = list(X.shape[:axis]) + [N]
-            return (
-                np.dot(X.reshape(int(X.size / K), K), W.T).reshape(output_axes) + b,)
-
-        self.assertReferenceChecks(gc, op, [X, W, b], ref)
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/mod_op_test.py b/caffe2/python/operator_test/mod_op_test.py
deleted file mode 100644
index 03ff766c11e4..000000000000
--- a/caffe2/python/operator_test/mod_op_test.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import numpy
-
-from caffe2.python import core
-from hypothesis import given, settings
-
-import caffe2.python.hypothesis_test_util as hu
-import hypothesis.strategies as st
-import numpy as np
-
-
-@st.composite
-def _data(draw):
-    return draw(
-        hu.tensor(
-            dtype=np.int64,
-            elements=st.integers(
-                min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max
-            )
-        )
-    )
-
-
-class TestMod(hu.HypothesisTestCase):
-    @settings(deadline=None)
-    @given(
-        data=_data(),
-        divisor=st.integers(
-            min_value=np.iinfo(np.int64).min, max_value=np.iinfo(np.int64).max
-        ),
-        inplace=st.booleans(),
-        sign_follow_divisor=st.booleans(),
-        **hu.gcs
-    )
-    def test_mod(
-        self, data, divisor, inplace, sign_follow_divisor, gc, dc
-    ):
-        if divisor == 0:
-            # invalid test case
-            return None
-
-        def ref(data):
-            if sign_follow_divisor:
-                output = data % divisor
-            else:
-                output = numpy.fmod(data, divisor)
-            return [output]
-
-        op = core.CreateOperator(
-            'Mod',
-            ['data'],
-            ['data' if inplace else 'output'],
-            divisor=divisor,
-            sign_follow_divisor=sign_follow_divisor
-        )
-
-        self.assertReferenceChecks(gc, op, [data], ref)
-
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
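The sign_follow_divisor switch in the reference above maps onto the two remainder conventions NumPy exposes: Python-style % gives a result whose sign follows the divisor, while fmod follows the dividend. For example:

    import numpy as np

    a = np.array([7, -7], dtype=np.int64)
    print(a % 3)           # [ 1  2]  remainder takes the divisor's sign
    print(np.fmod(a, 3))   # [ 1 -1]  remainder takes the dividend's sign
    print(a % -3)          # [-2 -1]
    print(np.fmod(a, -3))  # [ 1 -1]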
diff --git a/caffe2/python/operator_test/moments_op_test.py b/caffe2/python/operator_test/moments_op_test.py
deleted file mode 100644
index bee44e360e3f..000000000000
--- a/caffe2/python/operator_test/moments_op_test.py
+++ /dev/null
@@ -1,53 +0,0 @@
-
-
-
-
-from caffe2.python import core
-
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import hypothesis.strategies as st
-import itertools as it
-import numpy as np
-
-
-class TestMomentsOp(serial.SerializedTestCase):
-    def run_moments_test(self, X, axes, keepdims, gc, dc):
-        if axes is None:
-            op = core.CreateOperator(
-                "Moments",
-                ["X"],
-                ["mean", "variance"],
-                keepdims=keepdims,
-            )
-        else:
-            op = core.CreateOperator(
-                "Moments",
-                ["X"],
-                ["mean", "variance"],
-                axes=axes,
-                keepdims=keepdims,
-            )
-
-        def ref(X):
-            mean = np.mean(X, axis=None if axes is None else tuple(
-                axes), keepdims=keepdims)
-            variance = np.var(X, axis=None if axes is None else tuple(
-                axes), keepdims=keepdims)
-            return [mean, variance]
-
-        self.assertReferenceChecks(gc, op, [X], ref)
-        self.assertDeviceChecks(dc, op, [X], [0, 1])
-        self.assertGradientChecks(gc, op, [X], 0, [0, 1])
-
-    @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(),
-                  num_axes=st.integers(1, 4), **hu.gcs)
-    def test_moments(self, X, keepdims, num_axes, gc, dc):
-        self.run_moments_test(X, None, keepdims, gc, dc)
-        num_dims = len(X.shape)
-        if num_dims < num_axes:
-            self.run_moments_test(X, range(num_dims), keepdims, gc, dc)
-        else:
-            for axes in it.combinations(range(num_dims), num_axes):
-                self.run_moments_test(X, axes, keepdims, gc, dc)
diff --git a/caffe2/python/operator_test/momentum_sgd_test.py b/caffe2/python/operator_test/momentum_sgd_test.py
deleted file mode 100644
index 58f16e87a21c..000000000000
--- a/caffe2/python/operator_test/momentum_sgd_test.py
+++ /dev/null
@@ -1,191 +0,0 @@
-
-
-
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-from hypothesis import given, assume, settings
-import hypothesis.strategies as st
-import numpy as np
-import unittest
-
-
-class TestMomentumSGD(serial.SerializedTestCase):
-    @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs)
-    @settings(deadline=10000)
-    def test_momentum_sgd(self, n, nesterov, gc, dc):
-        param = np.random.rand(n).astype(np.float32)
-        grad = np.random.rand(n).astype(np.float32)
-        lr = np.random.rand(1).astype(np.float32)
-        param_momentum = np.random.rand(n).astype(np.float32)
-        momentum = 0.9
-
-        def momentum_sgd(grad, param_momentum, lr, param=None):
-            if not nesterov:
-                adjusted_gradient = lr * grad + momentum * param_momentum
-                if param is None:
-                    return [adjusted_gradient, adjusted_gradient]
-                else:
-                    paramup = param - adjusted_gradient
-                    return [adjusted_gradient, adjusted_gradient, paramup]
-            else:
-                m_new = momentum * param_momentum + lr * grad
-                grad_new = (1 + momentum) * m_new - momentum * param_momentum
-                if param is None:
-                    return [grad_new, m_new]
-                else:
-                    paramup = param - grad_new
-                    return [grad_new, m_new, paramup]
-
-        op = core.CreateOperator(
-            "MomentumSGDUpdate",
-            ["grad", "param_momentum", "lr", "param"],
-            ["grad", "param_momentum", "param"],
-            momentum=momentum,
-            nesterov=int(nesterov),
-        )
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=[grad, param_momentum, lr, param],
-            reference=momentum_sgd
-        )
-
-        op_noparam = core.CreateOperator(
-            "MomentumSGD",
-            ["grad", "param_momentum", "lr"],
-            ["grad", "param_momentum"],
-            momentum=momentum,
-            nesterov=int(nesterov),
-        )
-
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op_noparam,
-            inputs=[grad, param_momentum, lr],
-            reference=momentum_sgd
-        )
-
-    @given(
-        inputs=hu.tensors(n=3),
-        momentum=st.floats(min_value=0.1, max_value=0.9),
-        nesterov=st.booleans(),
-        lr=st.floats(min_value=0.1, max_value=0.9),
-        data_strategy=st.data(),
-        **hu.gcs
-    )
-    @settings(deadline=10000)
-    def test_sparse_momentum_sgd(
-        self, inputs, momentum, nesterov, lr, data_strategy, gc, dc
-    ):
-        w, grad, m = inputs
-
-        # Create an indexing array containing values which index into grad
-        indices = data_strategy.draw(
-            hu.tensor(
-                max_dim=1,
-                min_value=1,
-                max_value=grad.shape[0],
-                dtype=np.int64,
-                elements=st.sampled_from(np.arange(grad.shape[0])),
-            ),
-        )
-
-        # Verify that the generated indices are unique
-        assume(
-            np.array_equal(
-                np.unique(indices.flatten()),
-                np.sort(indices.flatten())))
-
-        # Sparsify grad
-        grad = grad[indices]
-
-        # Make momentum >= 0
-        m = np.abs(m)
-
-        # Convert lr to a numpy array
-        lr = np.asarray([lr], dtype=np.float32)
-
-        op = core.CreateOperator(
-            "SparseMomentumSGDUpdate", ["grad", "m", "lr", "param", "indices"],
-            ["adjusted_grad", "m", "param"],
-            momentum=momentum,
-            nesterov=int(nesterov),
-            device_option=gc
-        )
-
-        # Reference
-        def momentum_sgd(grad, m, lr):
-            lr = lr[0]
-            if not nesterov:
-                adjusted_gradient = lr * grad + momentum * m
-                return (adjusted_gradient, adjusted_gradient)
-            else:
-                m_new = momentum * m + lr * grad
-                return ((1 + momentum) * m_new - momentum * m, m_new)
-
-        def sparse(grad, m, lr, param, i):
-            grad_new, m_new = momentum_sgd(grad, m[i], lr)
-            m[i] = m_new
-            param[i] -= grad_new
-            return (grad_new, m, param)
-
-        self.assertReferenceChecks(
-            gc,
-            op,
-            [grad, m, lr, w, indices],
-            sparse)
-
-    @unittest.skip("Test is flaky, see https://github.com/pytorch/pytorch/issues/31368")
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
-    @given(n=st.integers(4, 8), nesterov=st.booleans(), **hu.gcs)
-    def test_fp16momentum_sgd(self, n, nesterov, gc, dc):
-        assume(core.IsGPUDeviceType(gc.device_type))
-        gpuvers = workspace.GetDeviceProperties(0)["major"]
-        if gc.device_type == caffe2_pb2.CUDA and gpuvers < 6:
-            print("No FP16 support because major version {} < 6".format(gpuvers))
-            return
-
-        param = np.random.rand(n).astype(np.float16)
-        grad = np.random.rand(n).astype(np.float16)
-        lr = np.random.rand(1).astype(np.float32)
-        param_momentum = np.random.rand(n).astype(np.float16)
-        momentum = 0.9
-
-        def momentum_sgd(grad, param_momentum, lr, param=None):
-            if not nesterov:
-                adjusted_gradient = lr * grad + momentum * param_momentum
-                paramup = param - adjusted_gradient
-                return [adjusted_gradient, adjusted_gradient, paramup]
-            else:
-                m_new = momentum * param_momentum + lr * grad
-                grad_new = (1 + momentum) * m_new - momentum * param_momentum
-                paramup = param - grad_new
-                return [grad_new, m_new, paramup]
-
-        op = core.CreateOperator(
-            "FP16MomentumSGDUpdate",
-            ["grad", "param_momentum", "lr", "param"],
-            ["grad", "param_momentum", "param"],
-            momentum=momentum,
-            nesterov=int(nesterov),
-            weight_decay=0.0,
-        )
-
-        threshold = 1e-3 if (gc.device_type == caffe2_pb2.HIP) else 1e-4
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=op,
-            inputs=[grad, param_momentum, lr, param],
-            reference=momentum_sgd,
-            threshold=threshold
-        )
-
-
-if __name__ == "__main__":
-    unittest.main()
caffe2.python.hypothesis_test_util as hu - -dyndep.InitOpsLibrary("@/caffe2/caffe2/mpi:mpi_ops") - -_has_mpi =False -COMM = None -RANK = 0 -SIZE = 0 - -def SetupMPI(): - try: - # pyre-fixme[21]: undefined import - from mpi4py import MPI - global _has_mpi, COMM, RANK, SIZE - _has_mpi = core.IsOperatorWithEngine("CreateCommonWorld", "MPI") - COMM = MPI.COMM_WORLD - RANK = COMM.Get_rank() - SIZE = COMM.Get_size() - except ImportError: - _has_mpi = False - - -@unittest.skipIf(not _has_mpi, - "MPI is not available. Skipping.") -class TestMPI(hu.HypothesisTestCase): - @given(X=hu.tensor(), - root=st.integers(min_value=0, max_value=SIZE - 1), - device_option=st.sampled_from(hu.device_options), - **hu.gcs) - def test_broadcast(self, X, root, device_option, gc, dc): - # Use mpi4py's broadcast to make sure that all nodes inherit the - # same hypothesis test. - X = COMM.bcast(X) - root = COMM.bcast(root) - device_option = COMM.bcast(device_option) - X[:] = RANK - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateCommonWorld", [], "comm", engine="MPI", - device_option=device_option))) - self.assertTrue(workspace.FeedBlob("X", X, device_option)) - mpi_op = core.CreateOperator( - "Broadcast", ["comm", "X"], "X", engine="MPI", root=root, - device_option=device_option) - self.assertTrue(workspace.RunOperatorOnce(mpi_op)) - new_X = workspace.FetchBlob("X") - np.testing.assert_array_equal(new_X, root) - workspace.ResetWorkspace() - - @given(X=hu.tensor(), - root=st.integers(min_value=0, max_value=SIZE - 1), - device_option=st.sampled_from(hu.device_options), - **hu.gcs) - def test_reduce(self, X, root, device_option, gc, dc): - # Use mpi4py's broadcast to make sure that all nodes inherit the - # same hypothesis test. - X = COMM.bcast(X) - root = COMM.bcast(root) - device_option = COMM.bcast(device_option) - X[:] = RANK - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateCommonWorld", [], "comm", engine="MPI", - device_option=device_option))) - self.assertTrue(workspace.FeedBlob("X", X, device_option)) - mpi_op = core.CreateOperator( - "Reduce", ["comm", "X"], "X_reduced", engine="MPI", root=root, - device_option=device_option) - self.assertTrue(workspace.RunOperatorOnce(mpi_op)) - if (RANK == root): - new_X = workspace.FetchBlob("X") - np.testing.assert_array_equal(new_X, root) - workspace.ResetWorkspace() - - @given(X=hu.tensor(), - root=st.integers(min_value=0, max_value=SIZE - 1), - device_option=st.sampled_from(hu.device_options), - inplace=st.booleans(), - **hu.gcs) - def test_allreduce(self, X, root, device_option, inplace, gc, dc): - # Use mpi4py's broadcast to make sure that all nodes inherit the - # same hypothesis test. - X = COMM.bcast(X) - root = COMM.bcast(root) - device_option = COMM.bcast(device_option) - inplace = COMM.bcast(inplace) - X[:] = RANK - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateCommonWorld", [], "comm", engine="MPI", - device_option=device_option))) - # Use mpi4py's broadcast to make sure that all copies have the same - # tensor size. 
- X = COMM.bcast(X) - X[:] = RANK - self.assertTrue(workspace.FeedBlob("X", X, device_option)) - mpi_op = core.CreateOperator( - "Allreduce", ["comm", "X"], - "X" if inplace else "X_reduced", - engine="MPI", root=root, - device_option=device_option) - self.assertTrue(workspace.RunOperatorOnce(mpi_op)) - new_X = workspace.FetchBlob("X" if inplace else "X_reduced") - np.testing.assert_array_equal(new_X, SIZE * (SIZE - 1) / 2) - workspace.ResetWorkspace() - - @given(X=hu.tensor(), - device_option=st.sampled_from(hu.device_options), - specify_send_blob=st.booleans(), - specify_recv_blob=st.booleans(), - **hu.gcs) - def test_sendrecv( - self, X, device_option, specify_send_blob, specify_recv_blob, - gc, dc): - # Use mpi4py's broadcast to make sure that all nodes inherit the - # same hypothesis test. - X = COMM.bcast(X) - device_option = COMM.bcast(device_option) - specify_send_blob = COMM.bcast(specify_send_blob) - specify_recv_blob = COMM.bcast(specify_recv_blob) - X[:] = RANK - - self.assertTrue( - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateCommonWorld", [], "comm", engine="MPI", - device_option=device_option))) - self.assertTrue(workspace.FeedBlob("X", X, device_option)) - for src in range(SIZE): - for dst in range(SIZE): - tag = src * SIZE + dst - if src == dst: - continue - elif RANK == src: - X[:] = RANK - self.assertTrue(workspace.FeedBlob("X", X, device_option)) - if specify_send_blob: - self.assertTrue(workspace.FeedBlob( - "dst", np.array(dst, dtype=np.int32))) - self.assertTrue(workspace.FeedBlob( - "tag", np.array(tag, dtype=np.int32))) - mpi_op = core.CreateOperator( - "SendTensor", ["comm", "X", "dst", "tag"], [], - engine="MPI", raw_buffer=True, - device_option=device_option) - else: - mpi_op = core.CreateOperator( - "SendTensor", ["comm", "X"], [], engine="MPI", - dst=dst, tag=tag, raw_buffer=True, - device_option=device_option) - self.assertTrue(workspace.RunOperatorOnce(mpi_op)) - elif RANK == dst: - if specify_recv_blob: - self.assertTrue(workspace.FeedBlob( - "src", np.array(src, dtype=np.int32))) - self.assertTrue(workspace.FeedBlob( - "tag", np.array(tag, dtype=np.int32))) - mpi_op = core.CreateOperator( - "ReceiveTensor", ["comm", "X", "src", "tag"], - ["X", "src", "tag"], - engine="MPI", - src=src, tag=tag, raw_buffer=True, - device_option=device_option) - else: - mpi_op = core.CreateOperator( - "ReceiveTensor", ["comm", "X"], ["X", "src", "tag"], - engine="MPI", - src=src, tag=tag, raw_buffer=True, - device_option=device_option) - self.assertTrue(workspace.RunOperatorOnce(mpi_op)) - received = workspace.FetchBlob("X") - np.testing.assert_array_equal(received, src) - src_blob = workspace.FetchBlob("src") - np.testing.assert_array_equal(src_blob, src) - tag_blob = workspace.FetchBlob("tag") - np.testing.assert_array_equal(tag_blob, tag) - # simply wait for the guys to finish - COMM.barrier() - workspace.ResetWorkspace() - -if __name__ == "__main__": - SetupMPI() - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/mul_gradient_benchmark.py b/caffe2/python/operator_test/mul_gradient_benchmark.py deleted file mode 100644 index b0239b47de6e..000000000000 --- a/caffe2/python/operator_test/mul_gradient_benchmark.py +++ /dev/null @@ -1,50 +0,0 @@ - - - - - -import argparse -import numpy as np - -from caffe2.python import core, workspace - - -def benchmark_mul_gradient(args): - workspace.FeedBlob("dC", np.random.rand(args.m, args.n).astype(np.float32)) - workspace.FeedBlob("A", np.random.rand(args.m, args.n).astype(np.float32)) - 
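The math exercised by the MulGradient call just below is compact enough to state outright; a minimal NumPy sketch (shapes and names are illustrative, not the operator's API):

import numpy as np

# For C = A * B with B broadcast across A's rows (the broadcast=True,
# axis=1 case below), the elementwise-mul gradients reduce to:
#   dA = dC * B              (B broadcast back up to A's shape)
#   dB = sum_rows(dC * A)    (reduced over the broadcast dimension)
m, n = 4, 3
dC = np.random.rand(m, n).astype(np.float32)
A = np.random.rand(m, n).astype(np.float32)
B = np.random.rand(n).astype(np.float32)
dA = dC * B
dB = (dC * A).sum(axis=0)
assert dA.shape == (m, n) and dB.shape == (n,)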
workspace.FeedBlob("B", np.random.rand(args.n).astype(np.float32)) - - net = core.Net("mynet") - net.MulGradient( - ["dC", "A", "B"], - ["dC" if args.inplace else "dA", "dB"], - broadcast=True, - axis=1, - allow_broadcast_fastpath=args.allow_broadcast_fastpath, - ) - workspace.CreateNet(net) - - workspace.BenchmarkNet(net.Name(), 1, args.iteration, True) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="benchmark for MulGradient.") - parser.add_argument( - '-m', type=int, default=9508, - help="The number of rows of A") - parser.add_argument( - "-n", type=int, default=80, - help="The number of columns of A") - parser.add_argument( - '-i', "--iteration", type=int, default=100, - help="The number of iterations.") - parser.add_argument( - "--inplace", - action='store_true', help="Whether to perform the op in-place.") - parser.add_argument( - "--allow-broadcast-fastpath", - action='store_true', help="Whether the broadcast fastpath is enabled.") - args, extra_args = parser.parse_known_args() - core.GlobalInit(['python'] + extra_args) - benchmark_mul_gradient(args) diff --git a/caffe2/python/operator_test/negate_gradient_op_test.py b/caffe2/python/operator_test/negate_gradient_op_test.py deleted file mode 100644 index 137be1eece34..000000000000 --- a/caffe2/python/operator_test/negate_gradient_op_test.py +++ /dev/null @@ -1,44 +0,0 @@ - - - - - -from caffe2.python import workspace, core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - - -class TestNegateGradient(serial.SerializedTestCase): - - @given(X=hu.tensor(), inplace=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_forward(self, X, inplace, gc, dc): - def neg_grad_ref(X): - return (X,) - - op = core.CreateOperator("NegateGradient", ["X"], ["Y" if not inplace else "X"]) - self.assertReferenceChecks(gc, op, [X], neg_grad_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(size=st.lists(st.integers(min_value=1, max_value=20), - min_size=1, max_size=5)) - def test_grad(self, size): - X = np.random.random_sample(size) - workspace.ResetWorkspace() - workspace.FeedBlob("X", X.astype(np.float32)) - - net = core.Net("negate_grad_test") - Y = net.NegateGradient(["X"], ["Y"]) - - grad_map = net.AddGradientOperators([Y]) - workspace.RunNetOnce(net) - - # check X_grad == negate of Y_grad - x_val, y_val = workspace.FetchBlobs(['X', 'Y']) - x_grad_val, y_grad_val = workspace.FetchBlobs([grad_map['X'], - grad_map['Y']]) - np.testing.assert_array_equal(x_val, y_val) - np.testing.assert_array_equal(x_grad_val, y_grad_val * (-1)) diff --git a/caffe2/python/operator_test/ngram_ops_test.py b/caffe2/python/operator_test/ngram_ops_test.py deleted file mode 100644 index 3f4e57fa230b..000000000000 --- a/caffe2/python/operator_test/ngram_ops_test.py +++ /dev/null @@ -1,75 +0,0 @@ - - - - - -import hypothesis.strategies as st - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu - -import numpy as np - - -class TestNGramOps(hu.HypothesisTestCase): - @given( - seed=st.integers(0, 2**32 - 1), - N=st.integers(min_value=10, max_value=100), - D=st.integers(min_value=2, max_value=10), - out_of_vcb=st.floats(min_value=0, max_value=0.5), - max_categorical_limit=st.integers(min_value=5, max_value=20), - max_in_vcb_val=st.integers(min_value=1000, max_value=10000), - **hu.gcs_cpu_only - ) - 
def test_ngram_from_categorical_op( - self, - seed, - N, - D, - out_of_vcb, - max_categorical_limit, - max_in_vcb_val, - gc, - dc, - ): - np.random.seed(seed) - col_num = max(int(D / 2), 1) - col_ids = np.random.choice(D, col_num, False).astype(np.int32) - categorical_limits = np.random.randint( - 2, high=max_categorical_limit, size=col_num - ).astype(np.int32) - vcb = [ - np.random.choice(max_in_vcb_val, x, False) - for x in categorical_limits - ] - vals = np.array([x for l in vcb for x in l], dtype=np.int32) - - # Enforce round(floats) to be negative. - floats = np.random.rand(N, D).astype(np.float32) - 2 - expected_output = [] - for i in range(N): - val = 0 - for (k, j) in enumerate(col_ids): - base = np.prod(categorical_limits[:k]) - r = np.random.randint(categorical_limits[k]) - p = np.random.rand() - if p > out_of_vcb: - val += base * r - floats[i][j] = vcb[k][r] - expected_output.append(val) - expected_output = np.array(expected_output, dtype=np.int32) - - workspace.ResetWorkspace() - workspace.FeedBlob('floats', floats) - op = core.CreateOperator( - "NGramFromCategorical", - ['floats'], - ['output'], - col_ids=col_ids, - categorical_limits=categorical_limits, - vals=vals, - ) - workspace.RunOperatorOnce(op) - output = workspace.blobs['output'] - np.testing.assert_array_equal(output, expected_output) diff --git a/caffe2/python/operator_test/normalize_op_test.py b/caffe2/python/operator_test/normalize_op_test.py deleted file mode 100644 index 7a35e0bafa31..000000000000 --- a/caffe2/python/operator_test/normalize_op_test.py +++ /dev/null @@ -1,54 +0,0 @@ - - - - -import functools - -import numpy as np -from hypothesis import given, settings -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import copy - - -class TestNormalizeOp(hu.HypothesisTestCase): - @given( - X=hu.tensor( - min_dim=1, max_dim=5, elements=hu.floats(min_value=0.5, max_value=1.0) - ), - **hu.gcs - ) - @settings(max_examples=10, deadline=None) - def test_normalize(self, X, gc, dc): - def ref_normalize(X, axis): - x_normed = X / np.maximum( - np.sqrt((X ** 2).sum(axis=axis, keepdims=True)), 1e-12 - ) - return (x_normed,) - - for axis in range(-X.ndim, X.ndim): - x = copy.copy(X) - op = core.CreateOperator("Normalize", "X", "Y", axis=axis) - self.assertReferenceChecks( - gc, op, [x], functools.partial(ref_normalize, axis=axis) - ) - self.assertDeviceChecks(dc, op, [x], [0]) - self.assertGradientChecks(gc, op, [x], 0, [0]) - - @given( - X=hu.tensor( - min_dim=1, max_dim=5, elements=hu.floats(min_value=0.5, max_value=1.0) - ), - **hu.gcs - ) - @settings(max_examples=10, deadline=None) - def test_normalize_L1(self, X, gc, dc): - def ref(X, axis): - norm = abs(X).sum(axis=axis, keepdims=True) - return (X / norm,) - - for axis in range(-X.ndim, X.ndim): - print("axis: ", axis) - op = core.CreateOperator("NormalizeL1", "X", "Y", axis=axis) - self.assertReferenceChecks(gc, op, [X], functools.partial(ref, axis=axis)) - self.assertDeviceChecks(dc, op, [X], [0]) diff --git a/caffe2/python/operator_test/numpy_tile_op_test.py b/caffe2/python/operator_test/numpy_tile_op_test.py deleted file mode 100644 index c32aa99470db..000000000000 --- a/caffe2/python/operator_test/numpy_tile_op_test.py +++ /dev/null @@ -1,66 +0,0 @@ - - - - - -import numpy as np - -from hypothesis import given, settings -import hypothesis.strategies as st -import unittest - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - 
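As a reader's aid for the Normalize tests above, the reference semantics fit in a few NumPy lines; this is a sketch of the math, not the operator's implementation:

import numpy as np

def normalize_l2(x, axis):
    # Unit L2 norm along `axis`; the clamp mirrors the 1e-12
    # divide-by-zero guard in the reference above.
    norm = np.sqrt((x ** 2).sum(axis=axis, keepdims=True))
    return x / np.maximum(norm, 1e-12)

def normalize_l1(x, axis):
    # Unit L1 norm along `axis`.
    return x / np.abs(x).sum(axis=axis, keepdims=True)

x = np.random.uniform(0.5, 1.0, (2, 3)).astype(np.float32)
assert np.allclose((normalize_l2(x, 1) ** 2).sum(axis=1), 1.0, atol=1e-5)
assert np.allclose(np.abs(normalize_l1(x, 0)).sum(axis=0), 1.0, atol=1e-5)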
-class TestNumpyTile(serial.SerializedTestCase): - @given(ndim=st.integers(min_value=1, max_value=4), - seed=st.integers(min_value=0, max_value=65536), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_numpy_tile(self, ndim, seed, gc, dc): - np.random.seed(seed) - - input_dims = np.random.randint(1, 4, size=ndim) - input = np.random.randn(*input_dims) - repeats = np.random.randint(1, 5, size=ndim) - - op = core.CreateOperator( - 'NumpyTile', ['input', 'repeats'], 'out', - ) - - def tile_ref(input, repeats): - tiled_data = np.tile(input, repeats) - return (tiled_data,) - - # Check against numpy reference - self.assertReferenceChecks(gc, op, [input, repeats], - tile_ref) - - @given(ndim=st.integers(min_value=1, max_value=4), - seed=st.integers(min_value=0, max_value=65536), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_numpy_tile_zero_dim(self, ndim, seed, gc, dc): - np.random.seed(seed) - - input_dims = np.random.randint(0, 4, size=ndim) - input = np.random.randn(*input_dims) - repeats = np.random.randint(0, 5, size=ndim) - - op = core.CreateOperator( - 'NumpyTile', ['input', 'repeats'], 'out', - ) - - def tile_ref(input, repeats): - tiled_data = np.tile(input, repeats) - return (tiled_data,) - - # Check against numpy reference - self.assertReferenceChecks(gc, op, [input, repeats], - tile_ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/one_hot_ops_test.py b/caffe2/python/operator_test/one_hot_ops_test.py deleted file mode 100644 index e23e04434ab3..000000000000 --- a/caffe2/python/operator_test/one_hot_ops_test.py +++ /dev/null @@ -1,207 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from caffe2.proto import caffe2_pb2 -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -def _one_hots(): - index_size = st.integers(min_value=1, max_value=5) - lengths = st.lists( - elements=st.integers(min_value=0, max_value=5)) - return st.tuples(index_size, lengths).flatmap( - lambda x: st.tuples( - st.just(x[0]), - st.just(x[1]), - st.lists( - elements=st.integers(min_value=0, max_value=x[0] - 1), - min_size=sum(x[1]), - max_size=sum(x[1])))) - - -class TestOneHotOps(serial.SerializedTestCase): - @serial.given( - x=hu.tensor( - min_dim=2, max_dim=2, dtype=np.int32, - elements=st.integers(min_value=0, max_value=10)), - **hu.gcs_cpu_only) - def test_batch_one_hot(self, x, gc, dc): - d = x.shape[1] - lens = [] - vals = [] - for i in range(0, d): - val = np.unique(x[:, i]) - vals.extend(val) - lens.append(len(val)) - lens = np.array(lens, dtype=np.int32) - vals = np.array(vals, dtype=np.int32) - - def ref(x, lens, vals): - output_dim = vals.size - ret = np.zeros((x.shape[0], output_dim)).astype(x.dtype) - p = 0 - for i, l in enumerate(lens): - for j in range(0, l): - v = vals[p + j] - ret[x[:, i] == v, p + j] = 1 - p += lens[i] - return (ret, ) - - op = core.CreateOperator('BatchOneHot', ["X", "LENS", "VALS"], ["Y"]) - self.assertReferenceChecks(gc, op, [x, lens, vals], ref) - - @given( - x=hu.tensor( - min_dim=2, max_dim=2, dtype=np.float32, - elements=st.integers(min_value=-5, max_value=5)), - seed=st.integers(min_value=0, max_value=1000), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_batch_bucketized_one_hot(self, x, seed, gc, dc): - np.random.seed(seed) - d = x.shape[1] - lens = np.random.randint(low=1, high=5, size=d) - boundaries = [] - for i 
in range(d): - # add [0, 0] as duplicated boundary for duplicated bucketization - if lens[i] > 2: - cur_boundary = np.append( - np.random.randn(lens[i] - 2) * 5, [0, 0]) - else: - cur_boundary = np.random.randn(lens[i]) * 5 - cur_boundary.sort() - boundaries += cur_boundary.tolist() - - lens = np.array(lens, dtype=np.int32) - boundaries = np.array(boundaries, dtype=np.float32) - - def ref(x, lens, boundaries): - output_dim = lens.size + boundaries.size - ret = np.zeros((x.shape[0], output_dim)).astype(x.dtype) - boundary_offset = 0 - output_offset = 0 - for i, l in enumerate(lens): - bucket_idx_right = np.digitize( - x[:, i], - boundaries[boundary_offset:boundary_offset + l], - right=True - ) - bucket_idx_left = np.digitize( - x[:, i], - boundaries[boundary_offset:boundary_offset + l], - right=False - ) - bucket_idx = np.floor_divide( - np.add(bucket_idx_right, bucket_idx_left), 2) - for j in range(x.shape[0]): - ret[j, output_offset + bucket_idx[j]] = 1.0 - boundary_offset += lens[i] - output_offset += (lens[i] + 1) - return (ret, ) - - op = core.CreateOperator('BatchBucketOneHot', - ["X", "LENS", "BOUNDARIES"], ["Y"]) - self.assertReferenceChecks(gc, op, [x, lens, boundaries], ref) - - @serial.given( - hot_indices=hu.tensor( - min_dim=1, max_dim=1, dtype=np.int64, - elements=st.integers(min_value=0, max_value=42)), - end_padding=st.integers(min_value=0, max_value=2), - **hu.gcs) - def test_one_hot(self, hot_indices, end_padding, gc, dc): - - def one_hot_ref(hot_indices, size): - out = np.zeros([len(hot_indices), size], dtype=float) - x = enumerate(hot_indices) - for i, x in enumerate(hot_indices): - out[i, x] = 1. - return (out, ) - - size = np.array(max(hot_indices) + end_padding + 1, dtype=np.int64) - if size == 0: - size = 1 - op = core.CreateOperator('OneHot', ['hot_indices', 'size'], ['output']) - self.assertReferenceChecks( - gc, - op, - [hot_indices, size], - one_hot_ref, - input_device_options={'size': core.DeviceOption(caffe2_pb2.CPU)}) - - @serial.given(hot_indices=_one_hots()) - def test_segment_one_hot(self, hot_indices): - index_size, lengths, indices = hot_indices - - index_size = np.array(index_size, dtype=np.int64) - lengths = np.array(lengths, dtype=np.int32) - indices = np.array(indices, dtype=np.int64) - - def segment_one_hot_ref(lengths, hot_indices, size): - offset = 0 - out = np.zeros([len(lengths), size], dtype=float) - for i, length in enumerate(lengths): - for idx in hot_indices[offset:offset + length]: - out[i, idx] = 1. 
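The duplicated-boundary handling in the BatchBucketOneHot reference above is subtle enough to warrant a standalone check; a hand-picked NumPy example (values are made up):

import numpy as np

boundaries = np.array([-1.0, 0.0, 0.0, 2.0], dtype=np.float32)
x = np.array([-2.0, 0.0, 1.0, 3.0], dtype=np.float32)
right = np.digitize(x, boundaries, right=True)   # counts boundaries < x
left = np.digitize(x, boundaries, right=False)   # counts boundaries <= x
# Averaging the two indices collapses the duplicated boundary 0.0 into
# one bucket: a value equal to it gets (1 + 3) // 2 == 2.
bucket = (right + left) // 2
assert bucket.tolist() == [0, 2, 3, 4]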
- offset += length - return (out, ) - - op = core.CreateOperator( - 'SegmentOneHot', - ['lengths', 'hot_indices', 'size'], - ['output']) - self.assertReferenceChecks( - hu.cpu_do, - op, - [lengths, indices, index_size], - segment_one_hot_ref) - - @given( - x=hu.tensor( - min_dim=2, max_dim=2, dtype=np.float32, - elements=st.integers(min_value=-5, max_value=5)), - seed=st.integers(min_value=0, max_value=1000), - **hu.gcs_cpu_only) - def test_batch_bucket_one_hot_shape_inference(self, x, seed, gc, dc): - np.random.seed(seed) - d = x.shape[1] - lens = np.random.randint(low=1, high=5, size=d) - boundaries = [] - for i in range(d): - # add [0, 0] as duplicated boundary for duplicated bucketization - if lens[i] > 2: - cur_boundary = np.append( - np.random.randn(lens[i] - 2) * 5, [0, 0]) - else: - cur_boundary = np.random.randn(lens[i]) * 5 - cur_boundary.sort() - boundaries += cur_boundary.tolist() - - lens = np.array(lens, dtype=np.int32) - boundaries = np.array(boundaries, dtype=np.float32) - - workspace.FeedBlob('lens', lens) - workspace.FeedBlob('boundaries', boundaries) - workspace.FeedBlob('x', x) - - net = core.Net("batch_bucket_one_hot_test") - result = net.BatchBucketOneHot(["x", "lens", "boundaries"], 1) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - - self.assertEqual(shapes[result], list(workspace.blobs[result].shape)) - self.assertEqual( - shapes[result], [x.shape[0], lens.shape[0] + boundaries.shape[0]]) - self.assertEqual(types[result], core.DataType.FLOAT) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/onnx_while_test.py b/caffe2/python/operator_test/onnx_while_test.py deleted file mode 100644 index 8f76892e41a6..000000000000 --- a/caffe2/python/operator_test/onnx_while_test.py +++ /dev/null @@ -1,98 +0,0 @@ - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestONNXWhile(serial.SerializedTestCase): - @given( - condition=st.booleans(), - max_trip_count=st.integers(0, 100), - save_scopes=st.booleans(), - disable_scopes=st.booleans(), - seed=st.integers(0, 65535), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_onnx_while_fibb( - self, condition, max_trip_count, save_scopes, disable_scopes, seed, gc, dc): - np.random.seed(seed) - if disable_scopes: - save_scopes = False - - # Create body net - body_net = caffe2_pb2.NetDef() - # Two loop carried dependencies: first and second - body_net.external_input.extend(['i', 'cond', 'first', 'second']) - body_net.external_output.extend(['cond_new', 'second', 'third', 'third']) - add_op = core.CreateOperator( - 'Add', - ['first', 'second'], - ['third'], - ) - print3 = core.CreateOperator( - 'Print', - ['third'], - [], - ) - limit_const = core.CreateOperator( - 'ConstantFill', - [], - ['limit_const'], - shape=[1], - dtype=caffe2_pb2.TensorProto.FLOAT, - value=100.0, - ) - cond = core.CreateOperator( - 'LT', - ['third', 'limit_const'], - ['cond_new'], - ) - body_net.op.extend([add_op, print3, limit_const, cond]) - - while_op = core.CreateOperator( - 'ONNXWhile', - ['max_trip_count', 'condition', 'first_init', 'second_init'], - ['first_a', 'second_a', 'third_a'], - body=body_net, - has_cond=True, - has_trip_count=True, - save_scopes=save_scopes, - 
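- # disable_scopes=True runs the body without creating per-iteration scopes, which is why save_scopes is forced to False above when scopes are disabled.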
disable_scopes=disable_scopes, - ) - - condition_arr = np.array(condition).astype(bool) - max_trip_count_arr = np.array(max_trip_count).astype(np.int64) - first_init = np.array([1]).astype(np.float32) - second_init = np.array([1]).astype(np.float32) - - def ref(max_trip_count, condition, first_init, second_init): - first = 1 - second = 1 - results = [] - if condition: - for _ in range(max_trip_count): - third = first + second - first = second - second = third - results.append(third) - if third > 100: - break - return (first, second, np.array(results).astype(np.float32)) - - self.assertReferenceChecks( - gc, - while_op, - [max_trip_count_arr, condition_arr, first_init, second_init], - ref, - ) - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/order_switch_test.py b/caffe2/python/operator_test/order_switch_test.py deleted file mode 100644 index 7b3f40a27c97..000000000000 --- a/caffe2/python/operator_test/order_switch_test.py +++ /dev/null @@ -1,40 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -from caffe2.python import core, utils -from hypothesis import given, settings - - -class OrderSwitchOpsTest(hu.HypothesisTestCase): - @given( - X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs - ) - @settings(deadline=10000) - def test_nchw2nhwc(self, X, engine, gc, dc): - op = core.CreateOperator("NCHW2NHWC", ["X"], ["Y"], engine=engine) - - def nchw2nhwc_ref(X): - return (utils.NCHW2NHWC(X),) - - self.assertReferenceChecks(gc, op, [X], nchw2nhwc_ref) - self.assertGradientChecks(gc, op, [X], 0, [0]) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given( - X=hu.tensor(min_dim=3, max_dim=5, min_value=1, max_value=5), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs - ) - @settings(deadline=10000) - def test_nhwc2nchw(self, X, engine, gc, dc): - op = core.CreateOperator("NHWC2NCHW", ["X"], ["Y"], engine=engine) - - def nhwc2nchw_ref(X): - return (utils.NHWC2NCHW(X),) - - self.assertReferenceChecks(gc, op, [X], nhwc2nchw_ref) - self.assertGradientChecks(gc, op, [X], 0, [0]) - self.assertDeviceChecks(dc, op, [X], [0]) diff --git a/caffe2/python/operator_test/pack_ops_test.py b/caffe2/python/operator_test/pack_ops_test.py deleted file mode 100644 index 7a5f81bb30d1..000000000000 --- a/caffe2/python/operator_test/pack_ops_test.py +++ /dev/null @@ -1,377 +0,0 @@ - - - - - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -from hypothesis import strategies as st -import numpy as np -import time - - -class TestTensorPackOps(serial.SerializedTestCase): - - def pack_segments_ref(self, return_presence_mask=False, max_length=None): - def pack_segments_ref(lengths, data, max_length=max_length): - arr = [] - constant_values = 0 - if data.dtype.char == 'S': - constant_values = '' - if max_length is None: - max_length = np.max(lengths) - start = 0 - for idx in range(np.size(lengths)): - len = lengths[idx] if max_length >= lengths[idx] else max_length - chunk = data[start : start + len] - pad_length = max_length - len - - # ((0, pad_length), (0, 0)) says add pad_length rows of padding - # below chunk and 0 rows of padding elsewhere - arr.append( - np.pad( - chunk, ((0, pad_length), (0, 0)), - mode=str("constant"), - constant_values=constant_values - ) - ) - start += lengths[idx] - result = [arr] - if 
return_presence_mask: - presence_arr = [] - for length in lengths: - length = length if max_length >= length else max_length - pad_length = max_length - length - presence_arr.append( - np.pad( - np.ones((length), dtype=bool), ((0, pad_length)), - mode=str("constant") - ) - ) - result.append(presence_arr) - return result - - return pack_segments_ref - - @given( - num_seq=st.integers(10, 100), - cell_size=st.integers(1, 10), - max_length_buffer=st.integers(-5, 5), - **hu.gcs - ) - @settings(deadline=None, max_examples=50) - def test_pack_with_max_length_ops( - self, num_seq, cell_size, max_length_buffer, gc, dc - ): - # create data - lengths = np.arange(num_seq, dtype=np.int32) + 1 - num_cell = np.sum(lengths) - data = np.zeros(num_cell * cell_size, dtype=np.float32) - left = np.cumsum(np.arange(num_seq) * cell_size) - right = np.cumsum(lengths * cell_size) - for i in range(num_seq): - data[left[i]:right[i]] = i + 1.0 - data.resize(num_cell, cell_size) - print("\nnum seq:{}, num cell: {}, cell size:{}\n".format( - num_seq, num_cell, cell_size) - + "=" * 60 - ) - # run test - max_length = num_seq + max_length_buffer - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t'], max_length=max_length) - workspace.FeedBlob('l', lengths) - workspace.FeedBlob('d', data) - start = time.time() - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[lengths, data, max_length], - reference=self.pack_segments_ref(max_length=max_length), - ) - end = time.time() - print("{} used time: {}".format(gc, end - start).replace('\n', ' ')) - - with core.DeviceScope(gc): - workspace.FeedBlob('l', lengths) - workspace.FeedBlob('d', data) - workspace.RunOperatorOnce(core.CreateOperator( - 'PackSegments', - ['l', 'd'], - ['t'], - max_length=max_length, - device_option=gc)) - workspace.RunOperatorOnce(core.CreateOperator( - 'UnpackSegments', - ['l', 't'], - ['newd'], - max_length=max_length, - device_option=gc)) - assert(workspace.FetchBlob('t').shape[1] == max_length) - - def _cal_unpacked_data(data): - if max_length >= num_seq: - return data - output = None - start = 0 - for i, length in enumerate(lengths): - new_len = max_length if length > max_length else length - chunk = data[start: start + new_len] - if output is None: - output = chunk - else: - output = np.concatenate((output, chunk), axis=0) - start += length - return output - - true_newd = _cal_unpacked_data(workspace.FetchBlob('d')) - assert((workspace.FetchBlob('newd') == true_newd).all()) - - @given( - num_seq=st.integers(10, 500), - cell_size=st.integers(1, 10), - **hu.gcs - ) - @settings(deadline=10000) - def test_pack_ops(self, num_seq, cell_size, gc, dc): - # create data - lengths = np.arange(num_seq, dtype=np.int32) + 1 - num_cell = np.sum(lengths) - data = np.zeros(num_cell * cell_size, dtype=np.float32) - left = np.cumsum(np.arange(num_seq) * cell_size) - right = np.cumsum(lengths * cell_size) - for i in range(num_seq): - data[left[i]:right[i]] = i + 1.0 - data.resize(num_cell, cell_size) - print("\nnum seq:{}, num cell: {}, cell size:{}\n".format( - num_seq, num_cell, cell_size) - + "=" * 60 - ) - # run test - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t']) - workspace.FeedBlob('l', lengths) - workspace.FeedBlob('d', data) - - start = time.time() - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[lengths, data], - reference=self.pack_segments_ref(), - ) - end = time.time() - print("{} used time: {}".format(gc, end - start).replace('\n', ' ')) - - with core.DeviceScope(gc): - workspace.FeedBlob('l', 
lengths) - workspace.FeedBlob('d', data) - workspace.RunOperatorOnce(core.CreateOperator( - 'PackSegments', - ['l', 'd'], - ['t'], - device_option=gc)) - workspace.RunOperatorOnce(core.CreateOperator( - 'UnpackSegments', - ['l', 't'], - ['newd'], - device_option=gc)) - assert((workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()) - - @given( - **hu.gcs_cpu_only - ) - def test_pack_ops_str(self, gc, dc): - # GPU does not support string. Test CPU implementation only. - workspace.FeedBlob('l', np.array([1, 2, 3], dtype=np.int64)) - strs = np.array([ - ["a", "a"], - ["b", "b"], - ["bb", "bb"], - ["c", "c"], - ["cc", "cc"], - ["ccc", "ccc"]], - dtype='|S') - workspace.FeedBlob('d', strs) - workspace.RunOperatorOnce(core.CreateOperator( - 'PackSegments', - ['l', 'd'], - ['t'], - device_option=gc)) - workspace.RunOperatorOnce(core.CreateOperator( - 'UnpackSegments', - ['l', 't'], - ['newd'], - device_option=gc)) - assert((workspace.FetchBlob('newd') == workspace.FetchBlob('d')).all()) - - def test_pad_minf(self): - workspace.FeedBlob('l', np.array([1, 2, 3], dtype=np.int32)) - workspace.FeedBlob( - 'd', - np.array([ - [1.0, 1.1], - [2.0, 2.1], - [2.2, 2.2], - [3.0, 3.1], - [3.2, 3.3], - [3.4, 3.5]], - dtype=np.float32)) - workspace.RunOperatorOnce(core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t'], pad_minf=True)) - workspace.RunOperatorOnce(core.CreateOperator( - 'Exp', ['t'], ['r'] - )) - result = workspace.FetchBlob('t') - assert(result[0, -1, 0] < -1000.0) - - # The whole point of padding with -inf is that when we exponentiate it - # then it should be zero. - exponentiated = workspace.FetchBlob('r') - assert(exponentiated[0, -1, 0] == 0.0) - - def test_pad_no_minf(self): - workspace.FeedBlob('l', np.array([1, 2, 3], dtype=np.int32)) - workspace.FeedBlob( - 'd', - np.array([ - [1.0, 1.1], - [2.0, 2.1], - [2.2, 2.2], - [3.0, 3.1], - [3.2, 3.3], - [3.4, 3.5]], - dtype=np.float32)) - workspace.RunOperatorOnce( - core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t'], pad_minf=False), - ) - result = workspace.FetchBlob('t') - assert(result[0, -1, 0] == 0.0) - - workspace.FeedBlob( - 'i', - np.array([ - [1, 1], - [2, 2], - [2, 2], - [3, 3], - [3, 3], - [3, 3]], - dtype=np.int32)) - workspace.RunOperatorOnce( - core.CreateOperator( - 'PackSegments', ['l', 'i'], ['t2'], pad_minf=False), - ) - result = workspace.FetchBlob('t2') - assert(result[0, -1, 0] == 0) - - @given(**hu.gcs) - def test_presence_mask(self, gc, dc): - lengths = np.array([1, 2, 3], dtype=np.int32) - data = np.array( - [ - [1.0, 1.0], [2.0, 2.0], [2.0, 2.0], [3.0, 3.0], [3.0, 3.0], - [3.0, 3.0] - ], - dtype=np.float32 - ) - - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t', 'p'], return_presence_mask=True - ) - workspace.FeedBlob('l', lengths) - workspace.FeedBlob('d', data) - inputs = [lengths, data] - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=self.pack_segments_ref(return_presence_mask=True), - ) - - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t', 'p'], return_presence_mask=True - ) - workspace.RunOperatorOnce(op) - - output = workspace.FetchBlob('t') - expected_output_shape = (3, 3, 2) - self.assertEqual(output.shape, expected_output_shape) - - presence_mask = workspace.FetchBlob('p') - expected_presence_mask = np.array( - [[True, False, False], [True, True, False], [True, True, True]], - dtype=bool - ) - self.assertEqual(presence_mask.shape, expected_presence_mask.shape) - np.testing.assert_array_equal(presence_mask, expected_presence_mask) - 
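Stripped of the workspace plumbing, the packing semantics these tests pin down amount to the following self-contained sketch (names are illustrative; the string variant pads with empty strings rather than zeros):

import numpy as np

def pack_segments(lengths, data, max_length=None):
    # Split `data` into per-segment chunks and zero-pad each chunk to a
    # common length, returning the packed tensor plus a presence mask.
    if max_length is None:
        max_length = int(lengths.max()) if lengths.size else 0
    out = np.zeros((len(lengths), max_length) + data.shape[1:], data.dtype)
    mask = np.zeros((len(lengths), max_length), dtype=bool)
    start = 0
    for i, n in enumerate(lengths):
        keep = min(int(n), max_length)   # truncate when max_length is short
        out[i, :keep] = data[start:start + keep]
        mask[i, :keep] = True
        start += int(n)
    return out, mask

lengths = np.array([1, 2, 3], dtype=np.int32)
data = np.arange(12, dtype=np.float32).reshape(6, 2)
packed, present = pack_segments(lengths, data)
assert packed.shape == (3, 3, 2) and int(present.sum()) == 6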
- def test_presence_mask_empty(self): - lengths = np.array([], dtype=np.int32) - data = np.array([], dtype=np.float32) - - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t', 'p'], return_presence_mask=True - ) - workspace.FeedBlob('l', lengths) - workspace.FeedBlob('d', data) - workspace.RunOperatorOnce(op) - - output = workspace.FetchBlob('p') - expected_output_shape = (0, 0) - self.assertEqual(output.shape, expected_output_shape) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_out_of_bounds(self, gc, dc): - # Copy pasted from test_pack_ops but with 3 changed to 4 - lengths = np.array([1, 2, 4], dtype=np.int32) - data = np.array([ - [1.0, 1.0], - [2.0, 2.0], - [2.0, 2.0], - [3.0, 3.0], - [3.0, 3.0], - [3.0, 3.0]], dtype=np.float32) - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t']) - - inputs = [lengths, data] - self.assertRunOpRaises( - device_option=gc, - op=op, - inputs=inputs, - exception=RuntimeError - ) - - @given(**hu.gcs_cpu_only) - @settings(deadline=10000) - def test_under_bounds(self, gc, dc): - # Copy pasted from test_pack_ops but with 3 changed to 2 - lengths = np.array([1, 2, 2], dtype=np.int32) - data = np.array([ - [1.0, 1.0], - [2.0, 2.0], - [2.0, 2.0], - [3.0, 3.0], - [3.0, 3.0], - [3.0, 3.0]], dtype=np.float32) - op = core.CreateOperator( - 'PackSegments', ['l', 'd'], ['t']) - - inputs = [lengths, data] - self.assertRunOpRaises( - device_option=gc, - op=op, - inputs=inputs, - exception=RuntimeError - ) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py b/caffe2/python/operator_test/pack_rnn_sequence_op_test.py deleted file mode 100644 index eceb1e5ba6a9..000000000000 --- a/caffe2/python/operator_test/pack_rnn_sequence_op_test.py +++ /dev/null @@ -1,90 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestPackRNNSequenceOperator(serial.SerializedTestCase): - - @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), - dim=st.integers(1, 5), **hu.gcs_cpu_only) - def test_pack_rnn_seqence(self, n, k, dim, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - values = np.random.rand(sum(lengths), dim).astype(np.float32) - - def pack_op(values, lengths): - T = max(lengths) if any(lengths) else 0 - N = lengths.size - output = np.zeros((T, N) + values.shape[1:]).astype(np.float32) - offset = 0 - for c in range(N): - for r in range(lengths[c]): - output[r][c] = values[offset + r] - offset += lengths[c] - return [output] - - op = core.CreateOperator( - 'PackRNNSequence', - ['values', 'lengths'], - 'out' - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[values, lengths], - reference=pack_op, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [values, lengths], [0]) - # Gradient check - self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - - @serial.given(n=st.integers(0, 10), k=st.integers(2, 5), - dim=st.integers(1, 5), **hu.gcs_cpu_only) - def test_unpack_rnn_seqence(self, n, k, dim, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - T = max(lengths) if any(lengths) else 0 - N = lengths.size - values = np.random.rand(T, N, dim).astype(np.float32) - - def unpack_op(values, lengths): - M = sum(lengths) - output = np.zeros((M,) + 
values.shape[2:]).astype(np.float32) - N = lengths.size - offset = 0 - for c in range(N): - for r in range(lengths[c]): - output[offset + r] = values[r][c] - offset += lengths[c] - return [output] - - op = core.CreateOperator( - 'UnpackRNNSequence', - ['values', 'lengths'], - 'out' - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[values, lengths], - reference=unpack_op, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [values, lengths], [0]) - # Gradient check - self.assertGradientChecks(gc, op, [values, lengths], 0, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/pad_test.py b/caffe2/python/operator_test/pad_test.py deleted file mode 100644 index 788c4035dd5f..000000000000 --- a/caffe2/python/operator_test/pad_test.py +++ /dev/null @@ -1,49 +0,0 @@ - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestPad(serial.SerializedTestCase): - @serial.given(pad_t=st.integers(-5, 0), - pad_l=st.integers(-5, 0), - pad_b=st.integers(-5, 0), - pad_r=st.integers(-5, 0), - mode=st.sampled_from(["constant", "reflect", "edge"]), - size_w=st.integers(16, 128), - size_h=st.integers(16, 128), - size_c=st.integers(1, 4), - size_n=st.integers(1, 4), - **hu.gcs) - def test_crop(self, - pad_t, pad_l, pad_b, pad_r, - mode, - size_w, size_h, size_c, size_n, - gc, dc): - op = core.CreateOperator( - "PadImage", - ["X"], - ["Y"], - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - ) - X = np.random.rand( - size_n, size_c, size_h, size_w).astype(np.float32) - - def ref(X): - return (X[:, :, -pad_t:pad_b or None, -pad_l:pad_r or None],) - - self.assertReferenceChecks(gc, op, [X], ref) - self.assertDeviceChecks(dc, op, [X], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/partition_ops_test.py b/caffe2/python/operator_test/partition_ops_test.py deleted file mode 100644 index b600c302d83b..000000000000 --- a/caffe2/python/operator_test/partition_ops_test.py +++ /dev/null @@ -1,181 +0,0 @@ - - - - -import numpy as np -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase, rand_array - - -class TestPartitionOps(TestCase): - - def test_configs(self): - # (main dims, partitions, main type, [list of (extra dims, type)]) - configs = [ - ((10, ), 3), - ((4, ), 10), - ((10, 10), 4), - ((100, ), 2), - ((5, ), 1), - ((1, ), 1), - ((2, 10), 2), - ] - suffixes = [ - [], - [((2, 2), np.float32)], - [((3, ), np.int64), ((2, ), np.float32)], - ] - return [ - (main_dims, parts, main_type, extra, pack) - for main_dims, parts in configs - for main_type in [np.int32, np.int64] for extra in suffixes - for pack in [False, True] - ] - - def testPartition(self): - for main_dims, parts, main_type, extra_ins, pack in self.test_configs(): - ins = ['in' + str(i) for i in range(1 + len(extra_ins))] - outs = [ - 'in{}_p{}'.format(j, i) - for i in range(parts) for j in range(1 + len(extra_ins)) - ] - op = core.CreateOperator( - 'Partition', ins, outs, pack_first_input=(1 if pack else 0)) - x = [] - for i, (dims, t) in enumerate([((), main_type)] + extra_ins): - if t in [np.float32, np.float64]: - d = rand_array(*(main_dims + dims)) - else: - d = np.random.randint(-100, 100, (main_dims + dims)) - d = d.astype(t) - 
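The sharding reference a few lines below keys everything off a non-negative modulo; in isolation (a NumPy sketch with made-up keys):

import numpy as np

keys = np.array([-3, -1, 0, 2, 5, 7])
parts = 3
# NumPy's % is non-negative for a positive divisor, so negative ids
# still land in a valid shard (-1 % 3 == 2).
shards = keys % parts
buckets = [keys[shards == i] for i in range(parts)]
assert sorted(np.concatenate(buckets).tolist()) == sorted(keys.tolist())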
workspace.FeedBlob(ins[i], d) - x.append(d) - - def sharding(x): - # numpy has proper modulo op that yields non-negative results - shards = (x[0] % parts).reshape([-1]) - out = [] - for i in range(parts): - for ind, v in enumerate(x): - suffix_shape = v.shape[len(x[0].shape):] - accum = [] - data = v.reshape((-1, ) + suffix_shape) - - if pack and ind == 0: - data = data // parts - - for j, s in enumerate(shards): - if s == i: - accum.append(data[j]) - - def join(a): - if not a: - return np.empty(shape=(0, ) + suffix_shape) - return np.stack(a) - - out.append(join(accum)) - return out - - workspace.RunOperatorOnce(op) - ref = sharding(x) - print(x) - print(ref) - for name, expected in zip(outs, ref): - np.testing.assert_array_equal( - expected, workspace.FetchBlob(name) - ) - - # test inverse operation (GatherByKey) - if len(main_dims) == 1: - # currently only 1D key tensor supported - for i in range(len(extra_ins)): - expected_out = ins[i + 1] - gather_ins = [ins[0]] + [ - outs[len(ins) * p + i + 1] for p in range(parts)] - actual_out = expected_out + '_actual' - op = core.CreateOperator( - 'GatherByKey', gather_ins, actual_out) - workspace.RunOperatorOnce(op) - expected = workspace.FetchBlob(expected_out) - actual = workspace.FetchBlob(actual_out) - np.testing.assert_array_equal(expected, actual) - - - def testLengthsPartition(self): - for main_dims, parts, main_type, extra_ins, pack in self.test_configs(): - # For LengthsSharding only 1-D tensors supported as a first input - if len(main_dims) > 1: - continue - ins = ['in' + str(i) for i in range(2 + len(extra_ins))] - outs = [ - 'in{}_p{}'.format(j, i) - for i in range(parts) for j in range(2 + len(extra_ins)) - ] - op = core.CreateOperator( - 'LengthsPartition', ins, outs, - pack_first_input=(1 if pack else 0) - ) - x = [] - for i, (dims, t) in enumerate([((), main_type)] + extra_ins): - if t in [np.float32, np.float64]: - d = rand_array(*(main_dims + dims)) - else: - d = np.random.randint(-100, 100, (main_dims + dims)) - d = d.astype(t) - workspace.FeedBlob(ins[i + 1], d) - x.append(d) - - # Randomly generate length tensor as well - elements = np.random.randint(2, 10) - lengths = [] - total_length = 0 - for _ in range(elements - 1): - lengths.append(np.random.randint(main_dims[0] - total_length)) - total_length += lengths[-1] - lengths.append(main_dims[0] - total_length) - workspace.FeedBlob(ins[0], np.array(lengths, dtype=np.int32)) - - def sharding(x): - # numpy has proper modulo op that yields non-negative results - shards = (x[0] % parts).reshape([-1]) - out = [] - for i in range(parts): - idx = 0 - sharded_lengths = np.zeros(elements) - for ind, length in enumerate(lengths): - for _ in range(length): - if shards[idx] == i: - sharded_lengths[ind] += 1 - idx += 1 - out.append(sharded_lengths) - - for ind, v in enumerate(x): - suffix_shape = v.shape[len(x[0].shape):] - accum = [] - data = v.reshape((-1, ) + suffix_shape) - - if pack and ind == 0: - data = data // parts - - for j, s in enumerate(shards): - if s == i: - accum.append(data[j]) - - def join(a): - if not a: - return np.empty(shape=(0, ) + suffix_shape) - return np.stack(a) - - out.append(join(accum)) - return out - - workspace.RunOperatorOnce(op) - ref = sharding(x) - for name, expected in zip(outs, ref): - np.testing.assert_array_equal( - expected, workspace.FetchBlob(name) - ) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/percentile_op_test.py b/caffe2/python/operator_test/percentile_op_test.py deleted file 
mode 100644 index 40c4192e21e9..000000000000 --- a/caffe2/python/operator_test/percentile_op_test.py +++ /dev/null @@ -1,130 +0,0 @@ - - - - - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import numpy as np - - -class TestPercentileOp(hu.HypothesisTestCase): - def _test_percentile_op( - self, - original_inp, - value_to_pct_map, - dist_lengths, - expected_values - ): - op = core.CreateOperator( - 'Percentile', - ['original_values', 'value_to_pct_map', 'dist_lengths'], - ['percentile_values'] - ) - workspace.FeedBlob('original_values', np.array( - original_inp, dtype=np.float32)) - workspace.FeedBlob( - 'value_to_pct_map', np.array(value_to_pct_map, dtype=np.float32)) - workspace.FeedBlob('dist_lengths', np.array( - dist_lengths, dtype=np.int32)) - workspace.RunOperatorOnce(op) - np.testing.assert_array_almost_equal( - workspace.FetchBlob('percentile_values'), - np.array(expected_values), - decimal=5 - ) - self._test_shape_inference( - original_inp, - value_to_pct_map, - dist_lengths, - expected_values - ) - - def _test_shape_inference( - self, - original_inp, - value_to_pct_map, - dist_lengths, - expected_values - ): - net = core.Net('test_shape_inference') - result = net.Percentile( - ['original_values', 'value_to_pct_map', 'dist_lengths'], - ['percentile_values'] - ) - workspace.FeedBlob('original_values', np.array( - original_inp, dtype=np.float32)) - workspace.FeedBlob( - 'value_to_pct_map', np.array(value_to_pct_map, dtype=np.float32)) - workspace.FeedBlob('dist_lengths', np.array( - dist_lengths, dtype=np.int32)) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - self.assertEqual(shapes[result], list(workspace.blobs[result].shape)) - self.assertEqual(shapes[result], list(workspace.blobs['original_values'].shape)) - self.assertEqual(types[result], core.DataType.FLOAT) - - def test_percentile_op_with_only_one_dist(self): - self._test_percentile_op( - original_inp=[[5]], - value_to_pct_map=[[5, 0.4]], - dist_lengths=[1], - expected_values=[[0.4]] - ) - - def test_percentile_op_with_all_elements_in_map(self): - self._test_percentile_op( - original_inp=[[3, 4], [10, 4]], - value_to_pct_map=[[3, 0.3], [4, 0.6], - [10, 0.8], [4, 0.5], [5, 0.6]], - dist_lengths=[3, 2], - expected_values=[[0.3, 0.5], [0.8, 0.5]], - ) - - def test_percentile_op_with_same_value(self): - self._test_percentile_op( - original_inp=[[1, 1], [1, 2]], - value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.5]], - dist_lengths=[2, 1], - expected_values=[[0.1, 0.0], [0.1, 0.5]] - ) - - def test_percentile_op_with_elements_bigger_than_map_range(self): - self._test_percentile_op( - original_inp=[[1, 5], [3, 4]], - value_to_pct_map=[[1, 0.1], [4, 0.4], [2, 0.1], [3, 0.3]], - dist_lengths=[2, 2], - expected_values=[[0.1, 1.], [0.3, 1.0]] - ) - - def test_percentile_op_with_elements_smaller_than_map_range(self): - self._test_percentile_op( - original_inp=[[1], [5], [6]], - value_to_pct_map=[[2, 0.2], [5, 0.5], [7, 0.5]], - dist_lengths=[3], - expected_values=[[0.0], [0.5], [0.5]] - ) - - def test_percentile_op_with_interpolation(self): - self._test_percentile_op( - original_inp=[[3, 2, 5], [6, 7, 8]], - value_to_pct_map=[[1, 0.1], [4, 0.7], [4.5, 0.8], - [6, 0.5], [8, 0.9], - [8, 0.6]], - dist_lengths=[3, 2, 1], - expected_values=[[0.5, 0.0, 0.0], [1.0, 0.7, 0.6]] - ) - - def test_percentile_op_with_large_sample_size_per_dist(self): - self._test_percentile_op( - original_inp=[[3, 1], [5, 7]], - value_to_pct_map=[[3, 0.5], [4, 0.6], [5, 0.7], - [1, 0.2], 
[2, 0.3], [5, 0.8]], - dist_lengths=[3, 3], - expected_values=[[0.5, 0.2], [0.7, 1.0]] - ) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/piecewise_linear_transform_test.py b/caffe2/python/operator_test/piecewise_linear_transform_test.py deleted file mode 100644 index 0c260d944d81..000000000000 --- a/caffe2/python/operator_test/piecewise_linear_transform_test.py +++ /dev/null @@ -1,171 +0,0 @@ - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestPiecewiseLinearTransform(serial.SerializedTestCase): - def constrain(self, v, min_val, max_val): - def constrain_internal(x): - return min(max(x, min_val), max_val) - return np.array([constrain_internal(x) for x in v]) - - def transform(self, x, bounds, slopes, intercepts): - n = len(slopes) - x_ = self.constrain(x, bounds[0], bounds[-1]) - index = np.minimum( - np.maximum( - np.searchsorted(bounds, x_) - 1, - 0 - ), - n - 1 - ) - y = slopes[index] * x_ + intercepts[index] - return y - - @given(n=st.integers(1, 100), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_multi_predictions_params_from_arg(self, n, gc, dc): - slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) - intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) - bounds = np.random.uniform(0.1, 0.9, - (2, n + 1)).astype(np.float32) - bounds.sort() - X = np.random.uniform(0, 1, (n, 2)).astype(np.float32) - - op = core.CreateOperator( - "PiecewiseLinearTransform", ["X"], ["Y"], - bounds=bounds.flatten().tolist(), - slopes=slopes.flatten().tolist(), - intercepts=intercepts.flatten().tolist(), - ) - - def piecewise(x, *args, **kw): - x_0 = self.transform( - x[:, 0], bounds[0, :], slopes[0, :], intercepts[0, :]) - x_1 = self.transform( - x[:, 1], bounds[1, :], slopes[1, :], intercepts[1, :]) - - return [np.vstack((x_0, x_1)).transpose()] - - self.assertReferenceChecks(gc, op, [X], piecewise) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(n=st.integers(1, 100), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_binary_predictions_params_from_arg(self, n, gc, dc): - slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) - intercepts = np.random.uniform(-1, 1, size=n).astype(np.float32) - bounds = np.random.uniform(0.1, 0.9, n + 1).astype(np.float32) - bounds.sort() - - X = np.random.uniform(0, 1, (n, 2)).astype(np.float32) - X[:, 0] = 1 - X[:, 1] - - op = core.CreateOperator( - "PiecewiseLinearTransform", ["X"], ["Y"], - bounds=bounds.flatten().tolist(), - slopes=slopes.flatten().tolist(), - intercepts=intercepts.flatten().tolist(), - pieces=n, - binary=True, - ) - - def piecewise(x): - x_ = self.transform(x[:, 1], bounds, slopes, intercepts) - return [np.vstack((1 - x_, x_)).transpose()] - - self.assertReferenceChecks(gc, op, [X], piecewise) - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(n=st.integers(1, 100), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_multi_predictions_params_from_input(self, n, gc, dc): - slopes = np.random.uniform(-1, 1, (2, n)).astype(np.float32) - intercepts = np.random.uniform(-1, 1, (2, n)).astype(np.float32) - bounds = np.random.uniform(0.1, 0.9, - (2, n + 1)).astype(np.float32) - bounds.sort() - X = np.random.uniform(0, 1, (n, 2)).astype(np.float32) - - op = core.CreateOperator( - 
"PiecewiseLinearTransform", - ["X", "bounds", "slopes", "intercepts"], - ["Y"], - ) - - def piecewise(x, bounds, slopes, intercepts): - x_0 = self.transform( - x[:, 0], bounds[0, :], slopes[0, :], intercepts[0, :]) - x_1 = self.transform( - x[:, 1], bounds[1, :], slopes[1, :], intercepts[1, :]) - - return [np.vstack((x_0, x_1)).transpose()] - - self.assertReferenceChecks( - gc, op, [X, bounds, slopes, intercepts], piecewise) - self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - - @given(n=st.integers(1, 100), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_binary_predictions_params_from_input(self, n, gc, dc): - slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) - intercepts = np.random.uniform(-1, 1, size=n).astype(np.float32) - bounds = np.random.uniform(0.1, 0.9, n + 1).astype(np.float32) - bounds.sort() - - X = np.random.uniform(0, 1, (n, 2)).astype(np.float32) - X[:, 0] = 1 - X[:, 1] - - op = core.CreateOperator( - "PiecewiseLinearTransform", - ["X", "bounds", "slopes", "intercepts"], - ["Y"], - binary=True, - ) - - def piecewise(x, bounds, slopes, intercepts): - x_ = self.transform(x[:, 1], bounds, slopes, intercepts) - return [np.vstack((1 - x_, x_)).transpose()] - - self.assertReferenceChecks( - gc, op, [X, bounds, slopes, intercepts], piecewise) - self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - - @given(n=st.integers(1, 100), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_1D_predictions_params_from_input(self, n, gc, dc): - slopes = np.random.uniform(-1, 1, size=n).astype(np.float32) - intercepts = np.random.uniform(-1, 1, size=n).astype(np.float32) - bounds = np.random.uniform(0.1, 0.9, n + 1).astype(np.float32) - bounds.sort() - - X = np.random.uniform(0, 1, size=n).astype(np.float32) - - op = core.CreateOperator( - "PiecewiseLinearTransform", - ["X", "bounds", "slopes", "intercepts"], - ["Y"], - binary=True, - ) - - def piecewise(x, bounds, slopes, intercepts): - x_ = self.transform(x, bounds, slopes, intercepts) - return [x_] - - self.assertReferenceChecks( - gc, op, [X, bounds, slopes, intercepts], piecewise) - self.assertDeviceChecks(dc, op, [X, bounds, slopes, intercepts], [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py deleted file mode 100644 index 2954face6b85..000000000000 --- a/caffe2/python/operator_test/pooling_test.py +++ /dev/null @@ -1,467 +0,0 @@ - - - - -import numpy as np -from hypothesis import assume, given, settings -import hypothesis.strategies as st -import os -import unittest - -from caffe2.python import core, utils, workspace -import caffe2.python.hip_test_util as hiputl -import caffe2.python.hypothesis_test_util as hu - -class TestPooling(hu.HypothesisTestCase): - # CUDNN does NOT support different padding values and we skip it - @given(stride_h=st.integers(1, 3), - stride_w=st.integers(1, 3), - pad_t=st.integers(0, 3), - pad_l=st.integers(0, 3), - pad_b=st.integers(0, 3), - pad_r=st.integers(0, 3), - kernel=st.integers(3, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool", - "MaxPool2D", "AveragePool2D"]), - **hu.gcs) - @settings(deadline=10000) - def test_pooling_separate_stride_pad(self, stride_h, stride_w, - pad_t, pad_l, pad_b, - pad_r, kernel, size, - input_channels, - batch_size, order, - op_type, - gc, dc): - 
assume(np.max([pad_t, pad_l, pad_b, pad_r]) < kernel) - - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - stride_h=stride_h, - stride_w=stride_w, - pad_t=pad_t, - pad_l=pad_l, - pad_b=pad_b, - pad_r=pad_r, - kernel=kernel, - order=order, - ) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - - if order == "NCHW": - X = utils.NHWC2NCHW(X) - self.assertDeviceChecks(dc, op, [X], [0]) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - # This test is to check if CUDNN works for bigger batch size or not - @unittest.skipIf(not os.getenv('CAFFE2_DEBUG'), - "This is a test that reproduces a cudnn error. If you " - "want to run it, set env variable CAFFE2_DEBUG=1.") - @given(**hu.gcs_cuda_only) - def test_pooling_big_batch(self, gc, dc): - op = core.CreateOperator( - "AveragePool", - ["X"], - ["Y"], - stride=1, - kernel=7, - pad=0, - order="NHWC", - engine="CUDNN", - ) - X = np.random.rand(70000, 7, 7, 81).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", - "MaxPool1D", "AveragePool1D"]), - **hu.gcs) - @settings(deadline=10000) - def test_pooling_1d(self, stride, pad, kernel, size, input_channels, - batch_size, order, op_type, gc, dc): - assume(pad < kernel) - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - strides=[stride], - kernels=[kernel], - pads=[pad, pad], - order=order, - engine="", - ) - X = np.random.rand( - batch_size, size, input_channels).astype(np.float32) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0]) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 2), - kernel=st.integers(1, 6), - size=st.integers(3, 5), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", - "MaxPool3D", "AveragePool3D"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_pooling_3d(self, stride, pad, kernel, size, input_channels, - batch_size, order, op_type, engine, gc, dc): - assume(pad < kernel) - assume(size + pad + pad >= kernel) - # Currently MIOpen Pooling only supports pooling with NCHW order. - if hiputl.run_in_hip(gc, dc) and (workspace.GetHIPVersion() < 303 or order == "NHWC"): - assume(engine != "CUDNN") - # some case here could be calculated with global pooling, but instead - # calculated with general implementation, slower but should still - # be correct. 
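The assume() guards above (pad < kernel, and size plus both pads covering the kernel) keep the standard shape arithmetic valid; a floor-mode sketch, assuming dilation 1 as in these tests:

def pool_out_size(size, kernel, stride, pad_head, pad_tail):
    # Floor-mode pooling output length; only meaningful when the padded
    # extent covers the kernel.
    return (size + pad_head + pad_tail - kernel) // stride + 1

assert pool_out_size(9, 5, 3, 2, 2) == 3
assert pool_out_size(7, 3, 1, 0, 0) == 5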
- op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - strides=[stride] * 3, - kernels=[kernel] * 3, - pads=[pad] * 6, - order=order, - engine=engine, - ) - X = np.random.rand( - batch_size, size, size, size, input_channels).astype(np.float32) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0], threshold=0.001) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0], threshold=0.001) - - @given(kernel=st.integers(3, 6), - size=st.integers(3, 5), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", - "MaxPool3D", "AveragePool3D"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_global_pooling_3d(self, kernel, size, input_channels, - batch_size, order, op_type, engine, gc, dc): - # Currently MIOpen Pooling only supports pooling with NCHW order. - if hiputl.run_in_hip(gc, dc) and (workspace.GetHIPVersion() < 303 or order == "NHWC"): - assume(engine != "CUDNN") - # pad and stride ignored because they will be inferred in global_pooling - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - kernels=[kernel] * 3, - order=order, - global_pooling=True, - engine=engine, - ) - X = np.random.rand( - batch_size, size, size, size, input_channels).astype(np.float32) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0], threshold=0.001) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0], threshold=0.001) - - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - **hu.gcs_gpu_only) - def test_pooling_with_index(self, stride, pad, kernel, size, - input_channels, batch_size, gc, dc): - assume(pad < kernel) - op = core.CreateOperator( - "MaxPoolWithIndex", - ["X"], - ["Y", "Y_index"], - stride=stride, - kernel=kernel, - pad=pad, - order="NCHW", - deterministic=1, - ) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - - # transpose due to order = NCHW - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0]) - - @given(sz=st.integers(1, 20), - batch_size=st.integers(0, 4), - engine=st.sampled_from(["", "CUDNN"]), - op_type=st.sampled_from(["AveragePool", "AveragePool2D"]), - **hu.gcs) - @settings(max_examples=3, deadline=None) - def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): - ''' Special test to stress the fast path of NCHW average pool ''' - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - stride=1, - kernel=sz, - pad=0, - order="NCHW", - engine=engine, - ) - X = np.random.rand( - batch_size, 3, sz, sz).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(sz=st.integers(1, 20), - batch_size=st.integers(0, 4), - engine=st.sampled_from(["", "CUDNN"]), - op_type=st.sampled_from(["MaxPool", "MaxPool2D"]), - **hu.gcs) - @settings(max_examples=3, deadline=None) - def test_global_max_pool_nchw(self, op_type, sz, - batch_size, engine, gc, dc): - ''' Special test to stress the fast path of NCHW max pool ''' - # CuDNN 5 does not support deterministic max pooling. 
- assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - stride=1, - kernel=sz, - pad=0, - order="NCHW", - engine=engine, - deterministic=1, - ) - - np.random.seed(1234) - X = np.random.rand( - batch_size, 3, sz, sz).astype(np.float32) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-4) - - @given(stride=st.integers(1, 3), - pad=st.integers(0, 3), - kernel=st.integers(1, 5), - size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool", - "MaxPool2D", "AveragePool2D"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_pooling(self, stride, pad, kernel, size, - input_channels, batch_size, - order, op_type, engine, gc, dc): - assume(pad < kernel) - if hiputl.run_in_hip(gc, dc) and engine == "CUDNN": - assume(order == "NCHW" and op_type != "LpPool") - - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - stride=stride, - kernel=kernel, - pad=pad, - order=order, - engine=engine, - ) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0]) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(size=st.integers(7, 9), - input_channels=st.integers(1, 3), - batch_size=st.integers(0, 3), - order=st.sampled_from(["NCHW", "NHWC"]), - op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_global_pooling(self, size, input_channels, batch_size, - order, op_type, engine, gc, dc): - # CuDNN 5 does not support deterministic max pooling. 
- assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool") - - if hiputl.run_in_hip(gc, dc) and engine == "CUDNN": - assume(order == "NCHW" and op_type != "LpPool") - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - order=order, - engine=engine, - global_pooling=True, - ) - X = np.random.rand( - batch_size, size, size, input_channels).astype(np.float32) - if order == "NCHW": - X = utils.NHWC2NCHW(X) - - self.assertDeviceChecks(dc, op, [X], [0]) - if 'MaxPool' not in op_type: - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(op_type=st.sampled_from(["MaxPool", "MaxPoolND"]), - dim=st.integers(1, 3), - N=st.integers(1, 3), - C=st.integers(1, 3), - D=st.integers(3, 5), - H=st.integers(3, 5), - W=st.integers(3, 5), - kernel=st.integers(1, 3), - stride=st.integers(1, 3), - pad=st.integers(0, 2), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_max_pool_grad( - self, op_type, dim, N, C, D, H, W, kernel, stride, pad, order, - engine, gc, dc): - assume(pad < kernel) - assume(dim > 1 or engine == "") - if hiputl.run_in_hip(gc, dc): - if dim != 2: - assume(engine != "CUDNN") - elif engine == "CUDNN": - assume(order == "NCHW") - - if op_type.endswith("ND"): - op_type = op_type.replace("N", str(dim)) - - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - kernels=[kernel] * dim, - strides=[stride] * dim, - pads=[pad] * dim * 2, - order=order, - engine=engine, - ) - - if dim == 1: - size = W - dims = [N, C, W] - axes = [0, 2, 1] - elif dim == 2: - size = H * W - dims = [N, C, H, W] - axes = [0, 2, 3, 1] - else: - size = D * H * W - dims = [N, C, D, H, W] - axes = [0, 2, 3, 4, 1] - - X = np.zeros((N * C, size)).astype(np.float32) - for i in range(N * C): - X[i, :] = np.arange(size, dtype=np.float32) / size - np.random.shuffle(X[i, :]) - X = X.reshape(dims) - if order == "NHWC": - X = np.transpose(X, axes) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks( - gc, op, [X], 0, [0], threshold=0.05, stepsize=0.005) - - @given(op_type=st.sampled_from(["AveragePool", "AveragePoolND"]), - dim=st.integers(1, 3), - N=st.integers(1, 3), - C=st.integers(1, 3), - D=st.integers(3, 5), - H=st.integers(3, 5), - W=st.integers(3, 5), - kernel=st.integers(1, 3), - stride=st.integers(1, 3), - pad=st.integers(0, 2), - count_include_pad=st.booleans(), - order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_avg_pool_count_include_pad( - self, op_type, dim, N, C, D, H, W, kernel, stride, pad, - count_include_pad, order, engine, gc, dc): - assume(pad < kernel) - if hiputl.run_in_hip(gc, dc): - if dim != 2: - assume(engine != "CUDNN") - elif engine == "CUDNN": - assume(order == "NCHW") - - if op_type.endswith("ND"): - op_type = op_type.replace("N", str(dim)) - - op = core.CreateOperator( - op_type, - ["X"], - ["Y"], - kernels=[kernel] * dim, - strides=[stride] * dim, - pads=[pad] * dim * 2, - count_include_pad=count_include_pad, - order=order, - engine=engine, - ) - - if dim == 1: - dims = [N, C, W] - axes = [0, 2, 1] - elif dim == 2: - dims = [N, C, H, W] - axes = [0, 2, 3, 1] - else: - dims = [N, C, D, H, W] - axes = [0, 2, 3, 4, 1] - X = np.random.randn(*dims).astype(np.float32) - if order == "NHWC": - X = np.transpose(X, axes) - - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff 
--git a/caffe2/python/operator_test/prepend_dim_test.py b/caffe2/python/operator_test/prepend_dim_test.py deleted file mode 100644 index d794ba2162b9..000000000000 --- a/caffe2/python/operator_test/prepend_dim_test.py +++ /dev/null @@ -1,51 +0,0 @@ - - - - -import numpy as np - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -from caffe2.proto import caffe2_pb2 - - -class TestPrependDim(TestCase): - def _test_fwd_bwd(self): - old_shape = (128, 2, 4) - new_shape = (8, 16, 2, 4) - X = np.random.rand(*old_shape).astype(np.float32) - Y = np.random.rand(*new_shape).astype(np.float32) - - net = core.Net('net') - - net.GivenTensorFill([], 'X', shape=old_shape, values=X.flatten()) - net.GivenTensorFill([], 'Y', shape=new_shape, values=Y.flatten()) - - net.PrependDim(['X'], ['X_out'], dim_size=8) - net.DotProduct(['X_out', 'Y'], 'Z') - net.AddGradientOperators(['Z']) - - workspace.RunNetOnce(net) - - X_out = workspace.FetchBlob('X_out') - X_grad = workspace.FetchBlob('X_grad') - Y_grad = workspace.FetchBlob('Y_grad') - - # Check the shape of the gradient - np.testing.assert_array_equal(X_out.shape, Y.shape) - np.testing.assert_array_equal(X_grad.shape, X.shape) - np.testing.assert_array_equal(Y_grad.shape, Y.shape) - - def test_prepend_dim(self): - devices = [core.DeviceOption(caffe2_pb2.CPU, 0)] - if workspace.NumGpuDevices() > 0: - devices.append(core.DeviceOption(workspace.GpuDeviceType, 0)) - - for device_opt in devices: - with core.DeviceScope(device_opt): - self._test_fwd_bwd() - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/python_op_test.py b/caffe2/python/operator_test/python_op_test.py deleted file mode 100644 index 8f41815585dc..000000000000 --- a/caffe2/python/operator_test/python_op_test.py +++ /dev/null @@ -1,43 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.core import CreatePythonOperator -import caffe2.python.hypothesis_test_util as hu -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - -class PythonOpTest(hu.HypothesisTestCase): - @given(x=hu.tensor(), - n=st.integers(min_value=1, max_value=20), - w=st.integers(min_value=1, max_value=20)) - @settings(deadline=10000) - def test_simple_python_op(self, x, n, w): - def g(input_, output): - output[...] 
= input_ - - def f(inputs, outputs): - outputs[0].reshape(inputs[0].shape) - g(inputs[0].data, outputs[0].data) - - ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)] - net = core.Net("net") - net.Proto().op.extend(ops) - net.Proto().type = "dag" - net.Proto().num_workers = w - iters = 100 - plan = core.Plan("plan") - plan.AddStep(core.ExecutionStep("test-step", net, iters)) - workspace.FeedBlob("x", x) - workspace.RunPlan(plan.Proto().SerializeToString()) - for i in range(n): - y = workspace.FetchBlob(str(i)) - np.testing.assert_almost_equal(x, y) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/quantile_test.py b/caffe2/python/operator_test/quantile_test.py deleted file mode 100644 index 39f3728d8e81..000000000000 --- a/caffe2/python/operator_test/quantile_test.py +++ /dev/null @@ -1,86 +0,0 @@ - - -import unittest - -import caffe2.python.hypothesis_test_util as hu -import numpy as np -from caffe2.python import core, workspace - - -class TestQuantile(hu.HypothesisTestCase): - def _test_quantile(self, inputs, quantile, abs, tol): - net = core.Net("test_net") - net.Proto().type = "dag" - input_tensors = [] - for i, input in enumerate(inputs): - workspace.FeedBlob("t_{}".format(i), input) - input_tensors.append("t_{}".format(i)) - net.Quantile( - input_tensors, ["quantile_value"], quantile=quantile, abs=abs, tol=tol - ) - workspace.RunNetOnce(net) - quantile_value_blob = workspace.FetchBlob("quantile_value") - assert np.size(quantile_value_blob) == 1 - quantile_value = quantile_value_blob[0] - - input_cat = np.concatenate([input.flatten() for input in inputs]) - input_cat = np.abs(input_cat) if abs == 1 else input_cat - target_cnt = np.ceil(np.size(input_cat) * quantile) - actual_cnt = np.sum(input_cat <= quantile_value) - # prune with return value will remove no less than - # "quantile" portion of elements - assert actual_cnt >= target_cnt - # Expect that (hi-lo) < tol * (|lo| + |hi|) - # if tol < 1.0 -> hi * lo > 0, then we are expecting - # 1. if hi >0, - # |hi|-|lo| < tol * (|lo| + |hi|) - # hi - lo < (2 tol) /(1 + tol) |hi| < 2 tol |hi| - # 2. 
if hi < 0, - # |lo|- |hi| < tol * (|lo| + |hi|) - # hi - lo < (2 tol) /(1 - tol) |hi| < 2.5 tol |hi| if tol < 0.2 - quantile_value_lo = quantile_value - 2.5 * tol * np.abs(quantile_value) - lo_cnt = np.sum(input_cat <= quantile_value_lo) - # prune with a slightly smaller value will remove - # less than "quantile" portion of elements - assert lo_cnt <= target_cnt - - def test_quantile_1(self): - inputs = [] - num_tensors = 5 - for i in range(num_tensors): - dim = np.random.randint(5, 100) - inputs.append(np.random.rand(dim)) - self._test_quantile(inputs=inputs, quantile=0.2, abs=1, tol=1e-4) - - def test_quantile_2(self): - inputs = [] - num_tensors = 5 - for i in range(num_tensors): - dim = np.random.randint(5, 100) - inputs.append(np.random.rand(dim)) - self._test_quantile(inputs=inputs, quantile=1e-6, abs=0, tol=1e-3) - - def test_quantile_3(self): - inputs = [] - num_tensors = 5 - for i in range(num_tensors): - dim1 = np.random.randint(5, 100) - dim2 = np.random.randint(5, 100) - inputs.append(np.random.rand(dim1, dim2)) - self._test_quantile(inputs=inputs, quantile=1 - 1e-6, abs=1, tol=1e-5) - - def test_quantile_4(self): - inputs = [] - num_tensors = 5 - for i in range(num_tensors): - dim1 = np.random.randint(5, 100) - dim2 = np.random.randint(5, 100) - inputs.append(np.random.rand(dim1, dim2)) - inputs.append(np.random.rand(dim1)) - self._test_quantile(inputs=inputs, quantile=0.168, abs=1, tol=1e-4) - - -if __name__ == "__main__": - global_options = ["caffe2"] - core.GlobalInit(global_options) - unittest.main() diff --git a/caffe2/python/operator_test/rand_quantization_op_speed_test.py b/caffe2/python/operator_test/rand_quantization_op_speed_test.py deleted file mode 100644 index 1c56faff645f..000000000000 --- a/caffe2/python/operator_test/rand_quantization_op_speed_test.py +++ /dev/null @@ -1,103 +0,0 @@ - - -import time - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given, settings - - -np.set_printoptions(precision=6) - - -class TestSpeedFloatToFusedRandRowwiseQuantized(hu.HypothesisTestCase): - @given( - bitwidth_=st.sampled_from([1, 2, 4, 8]), - random_=st.sampled_from([True, False]), - data_shape_=st.sampled_from( - [ - np.array([32, 512]), - np.array([1, 1024]), - np.array([1024, 1024]), - np.array([1024, 1224]), - np.array([512, 969]), - ] - ), - **hu.gcs - ) - @settings(deadline=10000) - def test_speed_of_rand_quantization(self, bitwidth_, random_, data_shape_, gc, dc): - X1 = np.random.rand(data_shape_[0], data_shape_[1]).astype(np.float32) - X2 = np.random.rand(data_shape_[0], data_shape_[1]).astype(np.float32) - - sub_scale_sum_net = core.Net("sub_scale_sum") - sub_op = core.CreateOperator("Sub", ["X1", "X2"], ["dX"]) - scale_op = core.CreateOperator("Scale", ["dX"], ["dX"], scale=0.023) - sum_op = core.CreateOperator("Sum", ["X2", "dX"], ["X2"]) - sub_scale_sum_net.Proto().op.extend([sub_op, scale_op, sum_op]) - - enc_net = core.Net("enc") - enc_op = core.CreateOperator( - "FloatToFusedRandRowwiseQuantized", - ["dX"], - ["Y"], - bitwidth=bitwidth_, - random=random_, - ) - enc_net.Proto().op.extend([enc_op]) - - dec_net = core.Net("dec") - dec_op = core.CreateOperator( - "FusedRandRowwiseQuantizedToFloat", ["Y"], ["decX"] - ) - dec_net.Proto().op.extend([dec_op]) - - workspace.FeedBlob("X1", X1) - workspace.FeedBlob("X2", X2) - - workspace.CreateNet(sub_scale_sum_net) - workspace.CreateNet(enc_net) - workspace.CreateNet(dec_net) - 
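The three RunNet calls that follow act as warm-up iterations, so one-time costs (net instantiation, allocation) do not leak into the timed loop. A minimal sketch of the same measure-after-warm-up pattern (the helper name is hypothetical):

    import time

    def mean_latency_ms(run, iters=10):
        run()  # warm-up run, excluded from the measurement
        start = time.time()
        for _ in range(iters):
            run()
        return (time.time() - start) / iters * 1000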
workspace.RunNet(sub_scale_sum_net) - workspace.RunNet(enc_net) - workspace.RunNet(dec_net) - - sub_scale_sum_time = 0 - enc_time = 0 - dec_time = 0 - times = 10 - for _ in range(times): - start = time.time() - workspace.RunNet(sub_scale_sum_net) - end = time.time() - sub_scale_sum_time += end - start - - start = time.time() - workspace.RunNet(enc_net) - end = time.time() - enc_time += end - start - - start = time.time() - workspace.RunNet(dec_net) - end = time.time() - dec_time += end - start - - print("Sub+Scale+Sum time: {} ms".format(sub_scale_sum_time / times * 1000)) - print( - "Quantizing time: {} ms ({}X)".format( - enc_time / times * 1000, enc_time / sub_scale_sum_time - ) - ) - print( - "De-quantizing time: {} ms ({}X)".format( - dec_time / times * 1000, dec_time / sub_scale_sum_time - ) - ) - - -if __name__ == "__main__": - import unittest - - unittest.main() diff --git a/caffe2/python/operator_test/rank_loss_operator_test.py b/caffe2/python/operator_test/rank_loss_operator_test.py deleted file mode 100644 index 592ad4684c22..000000000000 --- a/caffe2/python/operator_test/rank_loss_operator_test.py +++ /dev/null @@ -1,146 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestPairWiseLossOps(serial.SerializedTestCase): - @given(X=hu.arrays(dims=[2, 1], - elements=hu.floats(min_value=0.0, max_value=10.0)), - label=hu.arrays(dims=[2, 1], - elements=st.integers(min_value=0, max_value=1), - dtype=np.float32), - **hu.gcs_cpu_only) - def test_pair_wise_loss_predictions(self, X, label, gc, dc): - workspace.FeedBlob('X', X) - workspace.FeedBlob('label', label) - new_label = np.array([label[1], label[0]]) - new_x = np.array([X[1], X[0]]) - workspace.FeedBlob('new_x', new_x) - workspace.FeedBlob('new_label', new_label) - net = core.Net('net') - net.PairWiseLoss(['X', 'label'], ['output']) - net.PairWiseLoss(['new_x', 'new_label'], ['new_output']) - plan = core.Plan('predict_data') - plan.AddStep(core.execution_step('predict_data', - [net], num_iter=1)) - workspace.RunPlan(plan) - output = workspace.FetchBlob('output') - new_output = workspace.FetchBlob('new_output') - sign = 1 if label[0] > label[1] else -1 - if label[0] == label[1]: - self.assertEqual(output.item(), 0) - return - - self.assertAlmostEqual( - output.item(), - np.log(1 + np.exp(sign * (X[1] - X[0]))).item(), - delta=1e-4 - ) - # check swapping row order doesn't alter overall loss - self.assertAlmostEqual(output, new_output) - - @given(X=hu.arrays(dims=[2, 1], - elements=hu.floats(min_value=0.0, max_value=10.0)), - label=hu.arrays(dims=[2, 1], - elements=st.integers(min_value=0, max_value=1), - dtype=np.float32), - dY=hu.arrays(dims=[1], - elements=hu.floats(min_value=1, max_value=10)), - **hu.gcs_cpu_only) - def test_pair_wise_loss_gradient(self, X, label, dY, gc, dc): - workspace.FeedBlob('X', X) - workspace.FeedBlob('dY', dY) - workspace.FeedBlob('label', label) - net = core.Net('net') - net.PairWiseLossGradient( - ['X', 'label', 'dY'], - ['dX'], - ) - plan = core.Plan('predict_data') - plan.AddStep(core.execution_step('predict_data', - [net], num_iter=1)) - workspace.RunPlan(plan) - dx = workspace.FetchBlob('dX') - sign = 1 if label[0] > label[1] else -1 - if label[0] == label[1]: - self.assertEqual(dx[0].item(), 0) - return - self.assertAlmostEqual( - dx[0].item(), - (-dY[0] * sign / (1 + 
np.exp(sign * (X[0] - X[1])))).item(), - delta=1e-2 * abs(dx[0].item())) - - self.assertEqual(dx[0].item(), (-dx[1]).item()) - delta = 1e-3 - up_x = np.array([[X[0] + delta], [X[1]]], dtype=np.float32) - down_x = np.array([[X[0] - delta], [X[1]]], dtype=np.float32) - workspace.FeedBlob('up_x', up_x) - workspace.FeedBlob('down_x', down_x) - new_net = core.Net('new_net') - new_net.PairWiseLoss(['up_x', 'label'], ['up_output']) - new_net.PairWiseLoss(['down_x', 'label'], ['down_output']) - - plan = core.Plan('predict_data') - plan.AddStep(core.execution_step('predict_data', [new_net], num_iter=1)) - workspace.RunPlan(plan) - down_output_pred = workspace.FetchBlob('down_output') - up_output_pred = workspace.FetchBlob('up_output') - np.testing.assert_allclose( - dx[0].item(), - (0.5 * dY[0] * - (up_output_pred[0] - down_output_pred[0]) / delta).item(), - rtol=1e-2, atol=1e-2) - - @serial.given(n=st.integers(0, 10), k=st.integers(1, 5), **hu.gcs_cpu_only) - def test_pair_wise_loss_batch(self, n, k, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - X = np.random.rand(sum(lengths)).astype(np.float32) - label = np.random.randint(k, size=sum(lengths)).astype(np.float32) - - def pair_wise_op(X, label, lengths): - N = lengths.size - output = np.zeros(N).astype(np.float32) - - def f(x): - return np.log(1 + np.exp(x)) - - offset = 0 - for idx in range(N): - offset += lengths[idx - 1] if idx > 0 else 0 - count = 0 - for i in range(offset, offset + lengths[idx]): - for j in range(offset, i): - if label[i] == label[j]: - continue - sign = 1 if label[i] > label[j] else -1 - output[idx] += f(sign * (X[j] - X[i])) - count += 1 - if count > 0: - output[idx] /= count - return [output] - - op = core.CreateOperator( - 'PairWiseLoss', - ['X', 'label', 'lengths'], - 'out' - ) - - # Check against numpy reference - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label, lengths], - reference=pair_wise_op, - ) - # Check over multiple devices - self.assertDeviceChecks(dc, op, [X, label, lengths], [0]) - # Gradient check - self.assertGradientChecks(gc, op, [X, label, lengths], 0, [0]) diff --git a/caffe2/python/operator_test/rebatching_queue_test.py b/caffe2/python/operator_test/rebatching_queue_test.py deleted file mode 100644 index 20f6f610e11c..000000000000 --- a/caffe2/python/operator_test/rebatching_queue_test.py +++ /dev/null @@ -1,288 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - -import numpy as np -import numpy.testing as npt - -from hypothesis import given, settings -import hypothesis.strategies as st - -import functools - - -def primefac(n): - ret = [] - divisor = 2 - while divisor * divisor <= n: - while (n % divisor) == 0: - ret.append(divisor) - n = n // divisor - divisor = divisor + 1 - if n > 1: - ret.append(n) - return ret - - -class TestReBatchingQueue(TestCase): - def test_rebatching_queue_single_enqueue_dequeue(self): - net = core.Net('net') - - tensors = [ - net.ConstantFill([], 1, value=1.0, run_once=False) - for times in range(3) - ] - - queue = net.CreateRebatchingQueue([], 1, capacity=10, num_blobs=1) - - net.EnqueueRebatchingQueue([queue, tensors[0]], []) - net.EnqueueRebatchingQueue([queue, tensors[1]], []) - net.EnqueueRebatchingQueue([queue, tensors[2]], []) - - results = [ - net.DequeueRebatchingQueue([queue], 1), - net.DequeueRebatchingQueue([queue], 1), - net.DequeueRebatchingQueue([queue], 1), - ] - - workspace.RunNetOnce(net) - - for idx in range(3): - 
self.assertEqual(workspace.FetchBlob(results[idx]), [1.0]) - - def test_rebatching_queue_multi_enqueue_dequeue(self): - net = core.Net('net') - workspace.FeedBlob( - "tensors", np.array([x for x in range(10)], np.int32) - ) - - queue = net.CreateRebatchingQueue([], 1, capacity=10, num_blobs=1) - - net.EnqueueRebatchingQueue([queue, "tensors"], [], enqueue_batch=True) - - results = [ - net.DequeueRebatchingQueue([queue], 1, num_elements=5), - net.DequeueRebatchingQueue([queue], 1, num_elements=5), - ] - - workspace.RunNetOnce(net) - - npt.assert_array_equal( - workspace.FetchBlob(results[0]), workspace.FetchBlob("tensors")[:5] - ) - npt.assert_array_equal( - workspace.FetchBlob(results[1]), workspace.FetchBlob("tensors")[5:] - ) - - def test_rebatching_queue_closes_properly(self): - net = core.Net('net') - workspace.FeedBlob( - "tensors", np.array([x for x in range(10)], np.int32) - ) - - queue = net.CreateRebatchingQueue([], 1, capacity=10, num_blobs=1) - - net.EnqueueRebatchingQueue([queue, "tensors"], 0, enqueue_batch=True) - - net.CloseRebatchingQueue([queue], 0) - - results = [ - net.DequeueRebatchingQueue([queue], 1, num_elements=5), - net.DequeueRebatchingQueue([queue], 1, num_elements=5), - ] - - workspace.RunNetOnce(net) - - npt.assert_array_equal( - workspace.FetchBlob(results[0]), workspace.FetchBlob("tensors")[:5] - ) - npt.assert_array_equal( - workspace.FetchBlob(results[1]), workspace.FetchBlob("tensors")[5:] - ) - - # Enqueuing more should fail now since the queue is closed - net.EnqueueRebatchingQueue([queue, "tensors"], [], enqueue_batch=True) - - with self.assertRaises(RuntimeError): - workspace.RunNetOnce(net) - - # Dequeuing more should fail now since the queue is closed - results = [ - net.DequeueRebatchingQueue([queue], 1, num_elements=5), - ] - - with self.assertRaises(RuntimeError): - workspace.RunNetOnce(net) - - def test_rebatching_queue_multiple_components(self): - NUM_BLOBS = 4 - NUM_ELEMENTS = 10 - - net = core.Net('net') - - workspace.blobs['complex_tensor'] = np.array( - [[x, x + 1] for x in range(NUM_ELEMENTS)], dtype=np.int32 - ) - - tensors = [ - net.GivenTensorIntFill( - [], - 1, - shape=[NUM_ELEMENTS], - values=[x for x in range(NUM_ELEMENTS)] - ), - net.GivenTensorFill( - [], - 1, - shape=[NUM_ELEMENTS], - values=[x * 1.0 for x in range(NUM_ELEMENTS)] - ), - net.GivenTensorBoolFill( - [], - 1, - shape=[NUM_ELEMENTS], - values=[(x % 2 == 0) for x in range(NUM_ELEMENTS)] - ), - 'complex_tensor', - ] - - queue = net.CreateRebatchingQueue( - [], 1, capacity=10, num_blobs=NUM_BLOBS - ) - - net.EnqueueRebatchingQueue([queue] + tensors, [], enqueue_batch=True) - - results = net.DequeueRebatchingQueue([queue], NUM_BLOBS, num_elements=5) - - workspace.RunNetOnce(net) - - for idx in range(NUM_BLOBS): - npt.assert_array_equal( - workspace.FetchBlob(results[idx]), - workspace.FetchBlob(tensors[idx])[:5] - ) - - @given( - num_producers=st.integers(1, 5), - num_consumers=st.integers(1, 5), - producer_input_size=st.integers(1, 10), - producer_num_iterations=st.integers(1, 10), - capacity=st.integers(1, 10) - ) - @settings(deadline=10000) - def test_rebatching_parallel_producer_consumer( - self, num_producers, num_consumers, producer_input_size, - producer_num_iterations, capacity - ): - ### Init ### - total_inputs = producer_num_iterations * producer_input_size * num_producers - inputs = [] - init_net = core.Net('init_net') - queue = init_net.CreateRebatchingQueue( - [], 1, capacity=capacity, num_blobs=1 - ) - - ### Producers ### - producer_steps = [] - for i in 
range(num_producers): - name = 'producer_%d' % i - net = core.Net(name) - values = [ - producer_input_size * i + x for x in range(producer_input_size) - ] - for _ in range(producer_num_iterations): - inputs.extend(values) - tensors = net.GivenTensorIntFill( - [], 1, shape=[producer_input_size], values=values - ) - - net.EnqueueRebatchingQueue([queue, tensors], [], enqueue_batch=True) - - step = core.execution_step( - name, net, num_iter=producer_num_iterations - ) - producer_steps.append(step) - - producer_step = core.execution_step( - 'producer', [ - core.execution_step( - 'producers', producer_steps, concurrent_substeps=True - ) - ] - ) - - ### Consumers ### - outputs = [] - - def append(ins, outs): - # Extend is atomic - outputs.extend(ins[0].data.tolist()) - - consumer_steps = [] - for i in range(num_consumers): - # This is just a way of deterministally read all the elements. - # We make `num_consumers` almost equal splits - # (the reminder goes to the last consumer). - num_elements_to_read = total_inputs // num_consumers - if i == num_consumers - 1: - num_elements_to_read = num_elements_to_read \ - + total_inputs % num_consumers - - # If we have nothing to read this consumer will be idle - if (num_elements_to_read == 0): - continue - - # Now we have to make a split on number of iterations and the read - # size for each iteration. This is again just one of many - # deterministic ways of doing it. We factorize the total number of - # elements we have to read and assign half of the factors to the - # iterations half to the read size. - factors = list(primefac(num_elements_to_read)) - - num_elements_per_iteration = functools.reduce( - lambda x, y: x * y, factors[len(factors) // 2:], 1 - ) - - num_iterations = functools.reduce( - lambda x, y: x * y, factors[:len(factors) // 2], 1 - ) - - name = 'consumer_%d' % i - net = core.Net(name) - blobs = net.DequeueRebatchingQueue( - [queue], 1, num_elements=num_elements_per_iteration - ) - net.Python(append)([blobs], 0) - consumer_steps.append( - core.execution_step(name, net, num_iter=num_iterations) - ) - - consumer_step = core.execution_step( - 'consumer', consumer_steps, concurrent_substeps=True - ) - - init_step = core.execution_step('init', init_net) - worker_step = core.execution_step( - 'worker', [consumer_step, producer_step], concurrent_substeps=True - ) - - ### Execute Plan ### - plan = core.Plan('test') - plan.AddStep(init_step) - plan.AddStep(worker_step) - - self.ws.run(plan) - - ### Check Results ### - # We check that the outputs are a permutation of inputs - inputs.sort() - outputs.sort() - self.assertEqual(inputs, outputs) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/record_queue_test.py b/caffe2/python/operator_test/record_queue_test.py deleted file mode 100644 index 00e47ed1cb68..000000000000 --- a/caffe2/python/operator_test/record_queue_test.py +++ /dev/null @@ -1,92 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.dataset import Dataset -from caffe2.python.schema import ( - Struct, Map, Scalar, from_blob_list, NewRecord, FeedRecord) -from caffe2.python.record_queue import RecordQueue -from caffe2.python.test_util import TestCase -import numpy as np - - -class TestRecordQueue(TestCase): - def test_record_queue(self): - num_prod = 8 - num_consume = 3 - schema = Struct( - ('floats', Map( - Scalar(np.int32), - Scalar(np.float32))), - ) - contents_raw = [ - [1, 2, 3], # len - [11, 21, 22, 31, 32, 33], # key - [1.1, 2.1, 2.2, 3.1, 3.2, 
3.3], # value - ] - contents = from_blob_list(schema, contents_raw) - ds = Dataset(schema) - net = core.Net('init') - ds.init_empty(net) - - content_blobs = NewRecord(net, contents) - FeedRecord(content_blobs, contents) - writer = ds.writer(init_net=net) - writer.write_record(net, content_blobs) - reader = ds.reader(init_net=net) - - # prepare receiving dataset - rec_dataset = Dataset(contents, name='rec') - rec_dataset.init_empty(init_net=net) - rec_dataset_writer = rec_dataset.writer(init_net=net) - - workspace.RunNetOnce(net) - - queue = RecordQueue(contents, num_threads=num_prod) - - def process(net, fields): - new_fields = [] - for f in fields.field_blobs(): - new_f = net.Copy(f) - new_fields.append(new_f) - new_fields = from_blob_list(fields, new_fields) - return new_fields - - q_reader, q_step, q_exit, fields = queue.build(reader, process) - producer_step = core.execution_step('producer', [q_step, q_exit]) - - consumer_steps = [] - for i in range(num_consume): - name = 'queue_reader_' + str(i) - net_consume = core.Net(name) - should_stop, fields = q_reader.read_record(net_consume) - step_consume = core.execution_step(name, net_consume) - - name = 'dataset_writer_' + str(i) - net_dataset = core.Net(name) - rec_dataset_writer.write(net_dataset, fields.field_blobs()) - step_dataset = core.execution_step(name, net_dataset) - - step = core.execution_step( - 'consumer_' + str(i), - [step_consume, step_dataset], - should_stop_blob=should_stop) - consumer_steps.append(step) - consumer_step = core.execution_step( - 'consumers', consumer_steps, concurrent_substeps=True) - - work_steps = core.execution_step( - 'work', [producer_step, consumer_step], concurrent_substeps=True) - - plan = core.Plan('test') - plan.AddStep(work_steps) - core.workspace.RunPlan(plan) - data = workspace.FetchBlobs(rec_dataset.get_blobs()) - self.assertEqual(6, sum(data[0])) - self.assertEqual(150, sum(data[1])) - self.assertAlmostEqual(15, sum(data[2]), places=5) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/recurrent_net_executor_test.py b/caffe2/python/operator_test/recurrent_net_executor_test.py deleted file mode 100644 index 7c21ee633168..000000000000 --- a/caffe2/python/operator_test/recurrent_net_executor_test.py +++ /dev/null @@ -1,307 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import model_helper, workspace, core, rnn_cell, test_util -from caffe2.python.attention import AttentionType - -import numpy as np - -import unittest -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -from hypothesis import given, settings - - -class TestRNNExecutor(test_util.TestCase): - - def setUp(self): - super().setUp() - self.batch_size = 8 - self.input_dim = 20 - self.hidden_dim = 30 - self.encoder_dim = 40 - - @given( - T=st.integers(10, 100), - forward_only=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_lstm_with_attention_equal_simplenet(self, T, forward_only, gc, dc): - self.Tseq = [T, T // 2, T // 2 + T // 4, T, T // 2 + 1] - workspace.ResetWorkspace() - with core.DeviceScope(gc): - print("Run with device: {}, forward only: {}".format( - gc, forward_only)) - - workspace.FeedBlob( - "seq_lengths", - np.array([T] * self.batch_size, dtype=np.int32) - ) - workspace.FeedBlob("target", np.random.rand( - T, self.batch_size, self.hidden_dim).astype(np.float32)) - workspace.FeedBlob("hidden_init", np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - 
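Note the layout convention in these RNN tests: sequence blobs are time-major, [T, batch, dim], while each initial state carries a leading axis of 1, i.e. [1, batch, dim]. A small shape sketch (illustration only; values are arbitrary):

    import numpy as np

    T, batch_size, hidden_dim = 5, 8, 30
    target = np.random.rand(T, batch_size, hidden_dim).astype(np.float32)
    hidden_init = np.zeros((1, batch_size, hidden_dim), dtype=np.float32)
    assert target.shape[1:] == hidden_init.shape[1:]  # states match per-step shape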
workspace.FeedBlob("cell_init", np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - - model = model_helper.ModelHelper(name="lstm") - model.net.AddExternalInputs(["input"]) - - init_blobs = [] - hidden_init, cell_init, encoder_outputs = model.net.AddExternalInputs( - "hidden_init", - "cell_init", - "encoder_outputs" - ) - - awec_init = model.net.AddExternalInputs([ - 'initial_attention_weighted_encoder_context', - ]) - init_blobs.extend([hidden_init, cell_init]) - - workspace.FeedBlob( - awec_init, - np.random.rand(1, self.batch_size, self.encoder_dim).astype( - np.float32), - ) - workspace.FeedBlob( - encoder_outputs, - np.random.rand(1, self.batch_size, self.encoder_dim).astype( - np.float32), - ) - - outputs = rnn_cell.LSTMWithAttention( - model=model, - decoder_inputs="input", - decoder_input_lengths="seq_lengths", - initial_decoder_hidden_state=hidden_init, - initial_decoder_cell_state=cell_init, - initial_attention_weighted_encoder_context=awec_init, - encoder_output_dim=self.encoder_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=None, - decoder_input_dim=self.input_dim, - decoder_state_dim=self.hidden_dim, - scope="", - attention_type=AttentionType.Recurrent, - forward_only=forward_only, - outputs_with_grads=[0], - ) - output = outputs[0] - - print(outputs) - loss = model.AveragedLoss( - model.SquaredL2Distance([output, "target"], "dist"), - "loss" - ) - # Add gradient ops - if not forward_only: - model.AddGradientOperators([loss]) - - # init - for init_blob in init_blobs: - workspace.FeedBlob(init_blob, np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - - self._compare(model, forward_only) - - def init_lstm_model(self, T, num_layers, forward_only, use_loss=True): - workspace.FeedBlob( - "seq_lengths", - np.array([T] * self.batch_size, dtype=np.int32) - ) - workspace.FeedBlob("target", np.random.rand( - T, self.batch_size, self.hidden_dim).astype(np.float32)) - workspace.FeedBlob("hidden_init", np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - workspace.FeedBlob("cell_init", np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - - model = model_helper.ModelHelper(name="lstm") - model.net.AddExternalInputs(["input"]) - - init_blobs = [] - for i in range(num_layers): - hidden_init, cell_init = model.net.AddExternalInputs( - "hidden_init_{}".format(i), - "cell_init_{}".format(i) - ) - init_blobs.extend([hidden_init, cell_init]) - - output, last_hidden, _, last_state = rnn_cell.LSTM( - model=model, - input_blob="input", - seq_lengths="seq_lengths", - initial_states=init_blobs, - dim_in=self.input_dim, - dim_out=[self.hidden_dim] * num_layers, - scope="", - drop_states=True, - forward_only=forward_only, - return_last_layer_only=True, - ) - - if use_loss: - loss = model.AveragedLoss( - model.SquaredL2Distance([output, "target"], "dist"), - "loss" - ) - # Add gradient ops - if not forward_only: - model.AddGradientOperators([loss]) - - # init - for init_blob in init_blobs: - workspace.FeedBlob(init_blob, np.zeros( - [1, self.batch_size, self.hidden_dim], dtype=np.float32 - )) - - return model, output - - def test_empty_sequence(self): - ''' - Test the RNN executor's handling of empty input sequences - ''' - Tseq = [0, 1, 2, 3, 0, 1] - workspace.ResetWorkspace() - with core.DeviceScope(caffe2_pb2.DeviceOption()): - model, output = self.init_lstm_model( - T=4, num_layers=1, forward_only=True, use_loss=False) - - workspace.RunNetOnce(model.param_init_net) - - 
self.enable_rnn_executor(model.net, 1, True) - - np.random.seed(10022015) - first_call = True - for seq_len in Tseq: - input_shape = [seq_len, self.batch_size, self.input_dim] - workspace.FeedBlob( - "input", np.random.rand(*input_shape).astype(np.float32)) - workspace.FeedBlob( - "target", - np.random.rand( - seq_len, self.batch_size, self.hidden_dim - ).astype(np.float32)) - if first_call: - workspace.CreateNet(model.net, overwrite=True) - first_call = False - - workspace.RunNet(model.net.Proto().name) - val = workspace.FetchBlob(output) - self.assertEqual(val.shape[0], seq_len) - - @given( - num_layers=st.integers(1, 8), - T=st.integers(4, 100), - forward_only=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_lstm_equal_simplenet(self, num_layers, T, forward_only, gc, dc): - ''' - Test that the RNN executor produces same results as - the non-executor (i.e running step nets as sequence of simple nets). - ''' - self.Tseq = [T, T // 2, T // 2 + T // 4, T, T // 2 + 1] - - workspace.ResetWorkspace() - with core.DeviceScope(gc): - print("Run with device: {}, forward only: {}".format( - gc, forward_only)) - - model, _ = self.init_lstm_model(T, num_layers, forward_only) - self._compare(model, forward_only) - - def _compare(self, model, forward_only): - # Store list of blobs that exist in the beginning - workspace.RunNetOnce(model.param_init_net) - init_ws = {k: workspace.FetchBlob(k) for k in workspace.Blobs()} - - # Run with executor - for enable_executor in [0, 1]: - self.enable_rnn_executor(model.net, enable_executor, forward_only) - workspace.ResetWorkspace() - - # Reset original state - for k, v in init_ws.items(): - workspace.FeedBlob(k, v) - - np.random.seed(10022015) - ws = {} - for j in range(len(self.Tseq)): - input_shape = [self.Tseq[j], self.batch_size, self.input_dim] - workspace.FeedBlob( - "input", np.random.rand(*input_shape).astype(np.float32)) - workspace.FeedBlob( - "target", - np.random.rand( - self.Tseq[j], self.batch_size, self.hidden_dim - ).astype(np.float32)) - if j == 0: - workspace.CreateNet(model.net, overwrite=True) - - workspace.RunNet(model.net.Proto().name) - - # Store results for each iteration - for k in workspace.Blobs(): - ws[k + "." + str(j)] = workspace.FetchBlob(k) - - if enable_executor: - rnn_exec_ws = ws - else: - non_exec_ws = ws - - # Test that all blobs are equal after running with executor - # or without. - self.assertEqual(list(non_exec_ws.keys()), list(rnn_exec_ws.keys())) - - mismatch = False - for k in rnn_exec_ws.keys(): - non_exec_v = non_exec_ws[k] - rnn_exec_v = rnn_exec_ws[k] - if type(non_exec_v) is np.ndarray: - if not np.allclose(non_exec_v, rnn_exec_v): - print("Mismatch: {}".format(k)) - nv = non_exec_v.flatten() - rv = rnn_exec_v.flatten() - c = 0 - for j in range(len(nv)): - if rv[j] != nv[j]: - print(j, rv[j], nv[j]) - c += 1 - if c == 10: - break - - mismatch = True - - self.assertFalse(mismatch) - - def enable_rnn_executor(self, net, value, forward_only): - num_found = 0 - for op in net.Proto().op: - if op.type.startswith("RecurrentNetwork"): - for arg in op.arg: - if arg.name == 'enable_rnn_executor': - arg.i = value - num_found += 1 - # This sanity check is so that if someone changes the - # enable_rnn_executor parameter name, the test will - # start failing as this function will become defective. 
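The expected counts in the assertion just below follow from the net structure: a forward-only net contains a single RecurrentNetwork op, while a net with gradients is assumed to also carry a RecurrentNetworkGradient op, whose type matches the startswith() test as well. A sketch of that counting logic:

    def count_rnn_ops(op_types):
        return sum(1 for t in op_types if t.startswith("RecurrentNetwork"))

    assert count_rnn_ops(["RecurrentNetwork"]) == 1  # forward only
    assert count_rnn_ops(["RecurrentNetwork", "RecurrentNetworkGradient"]) == 2  # with backward pass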
- self.assertEqual(1 if forward_only else 2, num_found) - -if __name__ == "__main__": - import random - random.seed(2603) - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_rnn_executor=1']) - unittest.main() diff --git a/caffe2/python/operator_test/recurrent_network_test.py b/caffe2/python/operator_test/recurrent_network_test.py deleted file mode 100644 index 33ada4d6881c..000000000000 --- a/caffe2/python/operator_test/recurrent_network_test.py +++ /dev/null @@ -1,383 +0,0 @@ - - - - - -from caffe2.python import recurrent, workspace -from caffe2.python.model_helper import ModelHelper -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - -class RecurrentNetworkTest(serial.SerializedTestCase): - @given(T=st.integers(1, 4), - n=st.integers(1, 5), - d=st.integers(1, 5)) - @settings(deadline=10000) - def test_sum_mul(self, T, n, d): - model = ModelHelper(name='external') - - input_blob, initial_input_blob = model.net.AddExternalInputs( - 'input', 'initial_input') - - step = ModelHelper(name='step', param_model=model) - input_t, output_t_prev = step.net.AddExternalInput( - 'input_t', 'output_t_prev') - output_t_internal = step.net.Sum([input_t, output_t_prev]) - output_t = step.net.Mul([input_t, output_t_internal]) - step.net.AddExternalOutput(output_t) - - self.simple_rnn(T, n, d, model, step, input_t, output_t, output_t_prev, - input_blob, initial_input_blob) - - @given(T=st.integers(1, 4), - n=st.integers(1, 5), - d=st.integers(1, 5)) - @settings(deadline=10000) - def test_mul(self, T, n, d): - model = ModelHelper(name='external') - - input_blob, initial_input_blob = model.net.AddExternalInputs( - 'input', 'initial_input') - - step = ModelHelper(name='step', param_model=model) - input_t, output_t_prev = step.net.AddExternalInput( - 'input_t', 'output_t_prev') - output_t = step.net.Mul([input_t, output_t_prev]) - step.net.AddExternalOutput(output_t) - - self.simple_rnn(T, n, d, model, step, input_t, output_t, output_t_prev, - input_blob, initial_input_blob) - - @given(T=st.integers(1, 4), - n=st.integers(1, 5), - d=st.integers(1, 5)) - def test_extract(self, T, n, d): - model = ModelHelper(name='external') - workspace.ResetWorkspace() - - input_blob, initial_input_blob = model.net.AddExternalInputs( - 'input', 'initial_input') - - step = ModelHelper(name='step', param_model=model) - input_t, output_t_prev = step.net.AddExternalInput( - 'input_t', 'output_t_prev') - output_t = step.net.Mul([input_t, output_t_prev]) - step.net.AddExternalOutput(output_t) - - inputs = np.random.randn(T, n, d).astype(np.float32) - initial_input = np.random.randn(1, n, d).astype(np.float32) - recurrent.recurrent_net( - net=model.net, - cell_net=step.net, - inputs=[(input_t, input_blob)], - initial_cell_inputs=[(output_t_prev, initial_input_blob)], - links={output_t_prev: output_t}, - scope="test_rnn_sum_mull", - ) - - workspace.blobs[input_blob] = inputs - workspace.blobs[initial_input_blob] = initial_input - - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net) - - prefix = "extractTest" - - workspace.RunNet(model.net.Proto().name, T) - retrieved_blobs = recurrent.retrieve_step_blobs( - model.net, prefix - ) - - # needed for python3.6, which returns bytearrays instead of str - retrieved_blobs = [x.decode() for x in retrieved_blobs] - - for i in range(T): - blob_name = prefix + "_" + "input_t" + str(i) - 
self.assertTrue( - blob_name in retrieved_blobs, - "blob extraction failed on timestep {}\ - . \n\n Extracted Blobs: {} \n\n Looking for {}\ - .".format(i, retrieved_blobs, blob_name) - ) - - def simple_rnn(self, T, n, d, model, step, input_t, output_t, output_t_prev, - input_blob, initial_input_blob): - - input = np.random.randn(T, n, d).astype(np.float32) - initial_input = np.random.randn(1, n, d).astype(np.float32) - print(locals()) - recurrent.recurrent_net( - net=model.net, - cell_net=step.net, - inputs=[(input_t, input_blob)], - initial_cell_inputs=[(output_t_prev, initial_input_blob)], - links={output_t_prev: output_t}, - scope="test_rnn_sum_mull", - ) - workspace.blobs[input_blob] = input - workspace.blobs[initial_input_blob] = initial_input - - op = model.net._net.op[-1] - # Just conviniently store all inputs in an array in the same - # order as op.input - inputs = [workspace.blobs[name] for name in op.input] - - def reference(input, initial_input): - global_ws_name = workspace.CurrentWorkspace() - input_all = workspace.blobs[input_blob] - - workspace.SwitchWorkspace("ref", create_if_missing=True) - workspace.blobs[input_blob] = input - workspace.blobs[output_t_prev] = initial_input.reshape(n, d) - res_all = np.zeros(shape=input.shape, dtype=np.float32) - - for t_cur in range(T): - workspace.blobs[input_t] = input_all[t_cur] - workspace.RunNetOnce(step.net) - result_t = workspace.blobs[output_t] - workspace.blobs[output_t_prev] = result_t - res_all[t_cur] = result_t - - workspace.SwitchWorkspace(global_ws_name) - - shape = list(input.shape) - shape[0] = 1 - return (res_all, res_all[-1].reshape(shape)) - - self.assertReferenceChecks( - device_option=hu.cpu_do, - op=op, - inputs=inputs, - reference=reference, - output_to_grad=op.output[0], - outputs_to_check=[0, 1], - ) - - self.assertGradientChecks( - device_option=hu.cpu_do, - op=op, - inputs=inputs, - outputs_to_check=0, - outputs_with_grads=[0], - threshold=0.01, - stepsize=0.005, - ) - - # Hacky version of 1-D convolution - def _convolution_1d( - self, - model, - inputs, - conv_window, - conv_filter, - conv_bias, - output_name, - left_pad, - ): - if left_pad: - padding_width = conv_window - 1 - else: - padding_width = 0 - - # [batch_size, inputs_length, state_size] - inputs_transposed = model.net.Transpose( - inputs, - 'inputs_transposed', - axes=[1, 0, 2], - ) - # [batch_size, 1, inputs_length, state_size] - inputs_transposed_4d = model.net.ExpandDims( - inputs_transposed, - 'inputs_transposed_4d', - dims=[1], - ) - # [batch_size, 1, inputs_length - conv_window + 1, state_size] - output_transposed_4d = model.net.Conv( - [inputs_transposed_4d, conv_filter, conv_bias], - output_name + '_transposed_4d', - kernel_h=1, - kernel_w=conv_window, - order='NHWC', - pad_t=0, - pad_l=padding_width, - pad_b=0, - pad_r=0, - ) - # [batch_size, inputs_length - conv_window + 1, state_size] - output_transposed = model.net.Squeeze( - output_transposed_4d, - output_name + '_transposed', - dims=[1], - ) - # [inputs_length - conv_window + 1, batch_size, state_size] - output = model.net.Transpose( - output_transposed, - output_name, - axes=[1, 0, 2], - ) - return output - - @given(sequence_length=st.integers(3, 7), - conv_window=st.integers(1, 3), - batch_size=st.integers(1, 5), - state_size=st.integers(1, 5)) - def test_stateful_convolution_forward_only( - self, - sequence_length, - conv_window, - batch_size, - state_size, - ): - ''' - This unit test demonstrates another ways of using RecurrentNetwork. 
- - Imagine that you want to compute a convolution over a sequence, - but the sequence elements are not all given to you from the beginning, - so you have to loop over the sequence and compute the convolution - for each element separately. This situation can occur - during the inference/generation step of a neural network. - - First of all, you have to provide the actual input via recurrent states, - since the input of RecurrentNetwork has to be known in advance. - Here, we use `fake_inputs` as the input, - and the op uses it only to extract the batch size and sequence length. - The actual input sequence is stored in the recurrent state - `input_state`. At every step we generate a new element via input_state_t - (in this example, input_state_t is generated at random, but - in a real situation it could be created from the convolution output - of the previous step). - - A few important differences from the regular RecurrentNetwork use case: - - 1. input_state_t_prev is not just the single previous element of the - input_state sequence. It is the last conv_window elements, including (!) - the current one, input_state_t. We specify that using the `link_window` - argument of RecurrentNetwork. We need that many elements to - compute a single convolution step. Also, note that `link_window` - specifies how many elements to link starting at - the `timestep` + `link_offset` position. - - 2. The first few steps may require additional zero padding on the left, - since not enough elements of the input_state sequence are available yet. - So the initial state for input_state contains several elements - (exactly as many pads as we need for the first step). Because of that, - all offsetting over the input_state sequence is shifted - by the length of initial_input_state: see the `link_offset` and `alias_offset` - arguments of RecurrentNetwork. - - In this test, we assert that we get the same result - if we apply the convolution over all elements simultaneously, - since by the end of the run the whole input_state sequence has been generated.
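A small index sketch of the linking arithmetic described above, using the link_offset/link_window values this test passes to RecurrentNetwork (the helper is illustrative, not part of the test):

    conv_window = 3
    pads = conv_window - 1  # length of initial_input_state

    def linked_indices(timestep, link_offset, link_window):
        start = timestep + link_offset
        return list(range(start, start + link_window))

    # input_state_t_prev: offset 0, window conv_window -> the element written
    # at this step (index timestep + pads) plus the conv_window - 1 before it.
    assert linked_indices(0, 0, conv_window) == [0, 1, 2]
    # input_state_t: offset conv_window - 1, window 1 -> exactly the slot this
    # step's freshly generated element occupies.
    assert linked_indices(0, conv_window - 1, 1) == [pads]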
- ''' - model = ModelHelper(name='model') - fake_inputs = model.param_init_net.UniformFill( - [], - 'fake_inputs', - min=-1.0, - max=1.0, - shape=[sequence_length, batch_size, state_size], - ) - initial_input_state = model.param_init_net.ConstantFill( - [], - 'initial_input_state', - value=0.0, - shape=[conv_window - 1, batch_size, state_size], - ) - initial_output_state = model.param_init_net.ConstantFill( - [], - 'initial_output_state', - value=0.0, - shape=[1, batch_size, state_size], - ) - step_model = ModelHelper(name='step_model', param_model=model) - ( - fake_input_t, - timestep, - input_state_t_prev, - ) = step_model.net.AddExternalInputs( - 'fake_input_t', - 'timestep', - 'input_state_t_prev', - ) - conv_filter = step_model.param_init_net.XavierFill( - [], - 'conv_filter', - shape=[state_size, 1, conv_window, state_size], - ) - conv_bias = step_model.param_init_net.ConstantFill( - [], - 'conv_bias', - shape=[state_size], - value=0.0, - ) - step_model.params.extend([conv_filter, conv_bias]) - input_state_t = step_model.net.UniformFill( - [], - 'input_state_t', - min=-1.0, - max=1.0, - shape=[1, batch_size, state_size], - ) - output_state_t = self._convolution_1d( - model=step_model, - inputs=input_state_t_prev, - conv_window=conv_window, - conv_filter=conv_filter, - conv_bias=conv_bias, - output_name='output_state_t', - left_pad=False, - ) - initial_recurrent_states = [initial_input_state, initial_output_state] - all_inputs = ( - [fake_inputs] + step_model.params + initial_recurrent_states - ) - all_outputs = ['input_state_all', 'output_state_all'] - recurrent_states = ['input_state', 'output_state'] - input_state_all, output_state_all, _ = model.net.RecurrentNetwork( - all_inputs, - all_outputs + ['step_workspaces'], - param=[all_inputs.index(p) for p in step_model.params], - alias_src=recurrent_states, - alias_dst=all_outputs, - alias_offset=[conv_window - 1, 1], - recurrent_states=recurrent_states, - initial_recurrent_state_ids=[ - all_inputs.index(s) for s in initial_recurrent_states - ], - link_internal=[ - str(input_state_t_prev), - str(input_state_t), - str(output_state_t), - ], - link_external=['input_state', 'input_state', 'output_state'], - link_offset=[0, conv_window - 1, 1], - link_window=[conv_window, 1, 1], - backward_link_internal=[], - backward_link_external=[], - backward_link_offset=[], - step_net=step_model.net.Proto(), - timestep='timestep' if timestep is None else str(timestep), - outputs_with_grads=[], - ) - - output_states_2 = self._convolution_1d( - model=model, - inputs=input_state_all, - conv_window=conv_window, - conv_filter=conv_filter, - conv_bias=conv_bias, - output_name='output_states_2', - left_pad=True, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - np.testing.assert_almost_equal( - workspace.FetchBlob(output_state_all), - workspace.FetchBlob(output_states_2), - decimal=3, - ) diff --git a/caffe2/python/operator_test/reduce_ops_test.py b/caffe2/python/operator_test/reduce_ops_test.py deleted file mode 100644 index 66f5dd4aa99c..000000000000 --- a/caffe2/python/operator_test/reduce_ops_test.py +++ /dev/null @@ -1,449 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import itertools as it - - -class TestReduceOps(serial.SerializedTestCase): - def run_reduce_op_test_impl( - self, 
op_name, X, axes, keepdims, ref_func, gc, dc, allow_broadcast_fastpath): - extra_args = dict(allow_broadcast_fastpath=True) if allow_broadcast_fastpath else {} - if axes is None: - op = core.CreateOperator( - op_name, - ["X"], - ["Y"], - keepdims=keepdims, - **extra_args, - ) - else: - op = core.CreateOperator( - op_name, - ["X"], - ["Y"], - axes=axes, - keepdims=keepdims, - **extra_args, - ) - - def ref(X): - return [ref_func( - X, axis=None if axes is None else tuple(axes), - keepdims=keepdims)] - - with self.set_disable_serialized_check(allow_broadcast_fastpath): - self.assertReferenceChecks(gc, op, [X], ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - def run_reduce_op_test( - self, op_name, X, keepdims, num_axes, ref_func, gc, dc, allow_broadcast_fastpath=False): - self.run_reduce_op_test_impl( - op_name, X, None, keepdims, ref_func, gc, dc, allow_broadcast_fastpath) - - num_dims = len(X.shape) - if num_dims < num_axes: - self.run_reduce_op_test_impl( - op_name, X, range(num_dims), keepdims, ref_func, gc, dc, allow_broadcast_fastpath) - else: - for axes in it.combinations(range(num_dims), num_axes): - self.run_reduce_op_test_impl( - op_name, X, axes, keepdims, ref_func, gc, dc, allow_broadcast_fastpath) - - @serial.given( - X=hu.tensor(max_dim=3, dtype=np.float32), - keepdims=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs) - def test_reduce_min(self, X, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - X_dims = X.shape - X_size = X.size - X = np.arange(X_size, dtype=np.float32) - np.random.shuffle(X) - X = X.reshape(X_dims) - self.run_reduce_op_test( - "ReduceMin", X, keepdims, num_axes, np.min, gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - @serial.given( - X=hu.tensor(max_dim=3, dtype=np.float32), - keepdims=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs) - def test_reduce_max(self, X, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - X_dims = X.shape - X_size = X.size - X = np.arange(X_size, dtype=np.float32) - np.random.shuffle(X) - X = X.reshape(X_dims) - self.run_reduce_op_test( - "ReduceMax", X, keepdims, num_axes, np.max, gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - @given(n=st.integers(0, 5), m=st.integers(0, 5), k=st.integers(0, 5), - t=st.integers(0, 5), keepdims=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs) - @settings(deadline=10000) - def test_reduce_sum(self, n, m, k, t, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - X = np.random.randn(n, m, k, t).astype(np.float32) - self.run_reduce_op_test( - "ReduceSum", X, keepdims, num_axes, np.sum, gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - @serial.given(X=hu.tensor(dtype=np.float32), keepdims=st.booleans(), - allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 4), **hu.gcs) - def test_reduce_mean(self, X, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - self.run_reduce_op_test( - "ReduceMean", X, keepdims, num_axes, np.mean, gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - @given(n=st.integers(1, 3), m=st.integers(1, 3), k=st.integers(1, 3), - keepdims=st.booleans(), allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_reduce_l1(self, n, m, k, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - X = np.arange(n * m * k, 
dtype=np.float32) - 0.5 - np.random.shuffle(X) - X = X.reshape((m, n, k)) - self.run_reduce_op_test( - "ReduceL1", X, keepdims, num_axes, getNorm(1), gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - @serial.given(n=st.integers(1, 5), m=st.integers(1, 5), k=st.integers(1, 5), - keepdims=st.booleans(), allow_broadcast_fastpath=st.booleans(), - num_axes=st.integers(1, 3), **hu.gcs_cpu_only) - def test_reduce_l2(self, n, m, k, keepdims, allow_broadcast_fastpath, num_axes, gc, dc): - X = np.random.randn(n, m, k).astype(np.float32) - self.run_reduce_op_test( - "ReduceL2", X, keepdims, num_axes, getNorm(2), gc, dc, - allow_broadcast_fastpath=allow_broadcast_fastpath) - - -def getNorm(p): - if p == 1: - def norm(X, axis, keepdims): - return np.sum(np.abs(X), axis=axis, keepdims=keepdims) - elif p == 2: - def norm(X, axis, keepdims): - return np.sqrt(np.sum(np.power(X, 2), axis=axis, keepdims=keepdims)) - else: - raise RuntimeError("Only L1 and L2 norms supported") - return norm - - -class TestReduceFrontReductions(serial.SerializedTestCase): - def grad_variant_input_test(self, grad_op_name, X, ref, num_reduce_dim): - workspace.ResetWorkspace() - - Y = np.array(ref(X)[0]).astype(np.float32) - dY = np.array(np.random.rand(*Y.shape)).astype(np.float32) - shape = np.array(X.shape).astype(np.int64) - - workspace.FeedBlob("X", X) - workspace.FeedBlob("dY", dY) - workspace.FeedBlob("shape", shape) - - grad_op = core.CreateOperator( - grad_op_name, ["dY", "X"], ["dX"], num_reduce_dim=num_reduce_dim) - - grad_op1 = core.CreateOperator( - grad_op_name, ["dY", "shape"], ["dX1"], - num_reduce_dim=num_reduce_dim) - - workspace.RunOperatorOnce(grad_op) - workspace.RunOperatorOnce(grad_op1) - - dX = workspace.FetchBlob("dX") - dX1 = workspace.FetchBlob("dX1") - np.testing.assert_array_equal(dX, dX1) - - def max_op_test( - self, op_name, num_reduce_dim, gc, dc, in_data, in_names, ref_max): - - op = core.CreateOperator( - op_name, - in_names, - ["outputs"], - num_reduce_dim=num_reduce_dim - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=in_data, - reference=ref_max, - ) - - # Skip gradient check because it is too unreliable with max. 
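
[editor note] For anyone porting these reduce tests off caffe2: each reference closure above is plain NumPy, and run_reduce_op_test exercises it over every combination of num_axes axes. A minimal standalone sketch of that pattern, with the getNorm helper folded in (reduce_ref is my name, not the suite's):

import numpy as np
import itertools as it

def reduce_ref(X, axes=None, keepdims=False, p=None):
    # Reduce over a tuple of axes (all axes when axes is None); p=1 / p=2
    # reproduce the L1/L2 references returned by getNorm above.
    ax = None if axes is None else tuple(axes)
    if p == 1:
        return np.sum(np.abs(X), axis=ax, keepdims=keepdims)
    if p == 2:
        return np.sqrt(np.sum(np.square(X), axis=ax, keepdims=keepdims))
    return np.sum(X, axis=ax, keepdims=keepdims)

X = np.arange(24, dtype=np.float32).reshape(2, 3, 4)
for axes in it.combinations(range(X.ndim), 2):  # every 2-axis subset
    print(axes, reduce_ref(X, axes=axes, p=2).shape)
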
- # Just check CPU and CUDA have same results - Y = np.array(ref_max(*in_data)[0]).astype(np.float32) - dY = np.array(np.random.rand(*Y.shape)).astype(np.float32) - if len(in_data) == 2: - grad_in_names = ["dY", in_names[0], "Y", in_names[1]] - grad_in_data = [dY, in_data[0], Y, in_data[1]] - else: - grad_in_names = ["dY", in_names[0], "Y"] - grad_in_data = [dY, in_data[0], Y] - - grad_op = core.CreateOperator( - op_name + "Gradient", - grad_in_names, - ["dX"], - num_reduce_dim=num_reduce_dim - ) - self.assertDeviceChecks(dc, grad_op, grad_in_data, [0]) - - def reduce_op_test(self, op_name, op_ref, in_data, in_names, - num_reduce_dims, device): - op = core.CreateOperator( - op_name, - in_names, - ["outputs"], - num_reduce_dim=num_reduce_dims - ) - - self.assertReferenceChecks( - device_option=device, - op=op, - inputs=in_data, - reference=op_ref - ) - - self.assertGradientChecks( - device, op, in_data, 0, [0], stepsize=1e-2, threshold=1e-2) - - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) - @settings(deadline=10000) - def test_reduce_front_sum(self, num_reduce_dim, gc, dc): - X = np.random.rand(7, 4, 3, 5).astype(np.float32) - - def ref_sum(X): - return [np.sum(X, axis=(tuple(range(num_reduce_dim))))] - - self.reduce_op_test( - "ReduceFrontSum", ref_sum, [X], ["input"], num_reduce_dim, gc) - self.grad_variant_input_test( - "ReduceFrontSumGradient", X, ref_sum, num_reduce_dim) - - @given(num_reduce_dim=st.integers(0, 4), seed=st.integers(0, 4), **hu.gcs) - def test_reduce_front_sum_empty_batch(self, num_reduce_dim, seed, gc, dc): - np.random.seed(seed) - X = np.random.rand(0, 4, 3, 5).astype(np.float32) - - def ref_sum(X): - return [np.sum(X, axis=(tuple(range(num_reduce_dim))))] - - self.reduce_op_test( - "ReduceFrontSum", ref_sum, [X], ["input"], num_reduce_dim, gc) - self.grad_variant_input_test( - "ReduceFrontSumGradient", X, ref_sum, num_reduce_dim) - - # test the second iteration - not_empty_X = np.random.rand(2, 4, 3, 5).astype(np.float32) - net = core.Net('test') - with core.DeviceScope(gc): - net.ReduceFrontSum( - ['X'], ['output'], - num_reduce_dim=num_reduce_dim - ) - workspace.CreateNet(net) - - workspace.FeedBlob('X', not_empty_X) - workspace.RunNet(workspace.GetNetName(net)) - output = workspace.FetchBlob('output') - np.testing.assert_allclose( - output, ref_sum(not_empty_X)[0], atol=1e-3) - - workspace.FeedBlob('X', X) - workspace.RunNet(workspace.GetNetName(net)) - output = workspace.FetchBlob('output') - np.testing.assert_allclose(output, ref_sum(X)[0], atol=1e-3) - - @given(**hu.gcs) - @settings(deadline=None) - def test_reduce_front_sum_with_length(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][num_reduce_dim:])) - d = 120 // batch_size - lengths = np.random.randint(1, d, size=batch_size).astype(np.int32) - - def ref_sum(X, lengths): - Y = X.reshape(d, lengths.size) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.sum(Y[:lengths[ii], ii]) - return [rv.reshape((2, 3, 4, 5)[num_reduce_dim:])] - - self.reduce_op_test( - "ReduceFrontSum", ref_sum, [X, lengths], ["input", "lengths"], - num_reduce_dim, gc) - - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) - @settings(deadline=10000) - def test_reduce_front_mean(self, num_reduce_dim, gc, dc): - X = np.random.rand(6, 7, 8, 2).astype(np.float32) - - def ref_mean(X): - return [np.mean(X, axis=(tuple(range(num_reduce_dim))))] - - self.reduce_op_test( - "ReduceFrontMean", ref_mean, [X], 
["input"], num_reduce_dim, gc) - self.grad_variant_input_test( - "ReduceFrontMeanGradient", X, ref_mean, num_reduce_dim) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_reduce_front_mean_with_length(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][num_reduce_dim:])) - d = 120 // batch_size - lengths = np.random.randint(1, d, size=batch_size).astype(np.int32) - - def ref_mean(X, lengths): - Y = X.reshape(d, lengths.size) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.mean(Y[:lengths[ii], ii]) - return [rv.reshape((2, 3, 4, 5)[num_reduce_dim:])] - - self.reduce_op_test( - "ReduceFrontMean", ref_mean, [X, lengths], ["input", "lengths"], - num_reduce_dim, gc) - - @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) - def test_reduce_front_max(self, num_reduce_dim, gc, dc): - X = np.random.rand(6, 7, 8, 2).astype(np.float32) - - def ref_frontmax(X): - return [np.max(X, axis=(tuple(range(num_reduce_dim))))] - - self.max_op_test( - "ReduceFrontMax", num_reduce_dim, gc, dc, [X], ["X"], ref_frontmax) - - @given(**hu.gcs) - def test_reduce_front_max_with_length(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][num_reduce_dim:])) - d = 120 // batch_size - lengths = np.random.randint(1, d, size=batch_size).astype(np.int32) - - def ref_max(X, lengths): - Y = X.reshape(d, lengths.size) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.max(Y[:lengths[ii], ii]) - return [rv.reshape((2, 3, 4, 5)[num_reduce_dim:])] - - self.max_op_test( - "ReduceFrontMax", num_reduce_dim, gc, dc, [X, lengths], - ["X", "lengths"], ref_max) - - @serial.given(num_reduce_dim=st.integers(0, 4), **hu.gcs) - def test_reduce_back_max(self, num_reduce_dim, gc, dc): - X = np.random.rand(6, 7, 8, 2).astype(np.float32) - - def ref_backmax(X): - return [np.max(X, axis=(0, 1, 2, 3)[4 - num_reduce_dim:])] - - self.max_op_test( - "ReduceBackMax", num_reduce_dim, gc, dc, [X], ["X"], ref_backmax) - - @given(**hu.gcs) - def test_reduce_back_max_with_length(self, gc, dc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][:4 - num_reduce_dim])) - d = 120 // batch_size - lengths = np.random.randint(1, d, size=batch_size).astype(np.int32) - - def ref_max(X, lengths): - Y = X.reshape(lengths.size, d) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.max(Y[ii, :lengths[ii]]) - return [rv.reshape((2, 3, 4, 5)[:4 - num_reduce_dim])] - - self.max_op_test( - "ReduceBackMax", num_reduce_dim, gc, dc, [X, lengths], - ["X", "lengths"], ref_max) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_reduce_back_sum(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(6, 7, 8, 2).astype(np.float32) - - def ref_sum(X): - return [np.sum(X, axis=(0, 1, 2, 3)[4 - num_reduce_dim:])] - - self.reduce_op_test( - "ReduceBackSum", ref_sum, [X], ["input"], num_reduce_dim, gc) - self.grad_variant_input_test( - "ReduceBackSumGradient", X, ref_sum, num_reduce_dim) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_reduce_back_sum_with_length(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][:4 - num_reduce_dim])) - d = 120 // batch_size - lengths = np.random.randint(1, d, 
size=batch_size).astype(np.int32) - - def ref_sum(X, lengths): - Y = X.reshape(lengths.size, d) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.sum(Y[ii, :lengths[ii]]) - return [rv.reshape((2, 3, 4, 5)[:4 - num_reduce_dim])] - - self.reduce_op_test( - "ReduceBackSum", ref_sum, [X, lengths], ["input", "lengths"], - num_reduce_dim, gc) - - @given(num_reduce_dim=st.integers(0, 4), **hu.gcs) - @settings(deadline=10000) - def test_reduce_back_mean(self, num_reduce_dim, dc, gc): - X = np.random.rand(6, 7, 8, 2).astype(np.float32) - - def ref_mean(X): - return [np.mean(X, axis=(0, 1, 2, 3)[4 - num_reduce_dim:])] - - self.reduce_op_test( - "ReduceBackMean", ref_mean, [X], ["input"], num_reduce_dim, gc) - self.grad_variant_input_test( - "ReduceBackMeanGradient", X, ref_mean, num_reduce_dim) - - @given(**hu.gcs) - @settings(deadline=None) - def test_reduce_back_mean_with_length(self, dc, gc): - num_reduce_dim = 1 - X = np.random.rand(2, 3, 4, 5).astype(np.float32) - batch_size = int(np.prod([2, 3, 4, 5][:4 - num_reduce_dim])) - d = 120 // batch_size - lengths = np.random.randint(1, d, size=batch_size).astype(np.int32) - - def ref_mean(X, lengths): - Y = X.reshape(lengths.size, d) - rv = np.zeros((lengths.size, 1)).astype(np.float32) - for ii in range(lengths.size): - rv[ii] = np.mean(Y[ii, :lengths[ii]]) - return [rv.reshape((2, 3, 4, 5)[:4 - num_reduce_dim])] - - self.reduce_op_test( - "ReduceBackMean", ref_mean, [X, lengths], ["input", "lengths"], - num_reduce_dim, gc) diff --git a/caffe2/python/operator_test/reduction_ops_test.py b/caffe2/python/operator_test/reduction_ops_test.py deleted file mode 100644 index 745ab63a29fd..000000000000 --- a/caffe2/python/operator_test/reduction_ops_test.py +++ /dev/null @@ -1,180 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import assume, given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestReductionOps(serial.SerializedTestCase): - - @serial.given(n=st.integers(5, 8), **hu.gcs) - def test_elementwise_sum(self, n, gc, dc): - X = np.random.rand(n).astype(np.float32) - - def sum_op(X): - return [np.sum(X)] - - op = core.CreateOperator( - "SumElements", - ["X"], - ["y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sum_op, - ) - - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=[X], - outputs_to_check=0, - outputs_with_grads=[0], - ) - - @given(n=st.integers(5, 8), **hu.gcs) - @settings(deadline=10000) - def test_elementwise_int_sum(self, n, gc, dc): - X = np.random.rand(n).astype(np.int32) - - def sum_op(X): - return [np.sum(X)] - - op = core.CreateOperator( - "SumElementsInt", - ["X"], - ["y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sum_op, - ) - - @given(n=st.integers(1, 65536), - dtype=st.sampled_from([np.float32, np.float16]), - **hu.gcs) - @settings(deadline=10000) - def test_elementwise_sqrsum(self, n, dtype, gc, dc): - if dtype == np.float16: - # fp16 is only supported with CUDA/HIP - assume(gc.device_type == workspace.GpuDeviceType) - dc = [d for d in dc if d.device_type == workspace.GpuDeviceType] - - X = np.random.rand(n).astype(dtype) - - def sumsqr_op(X): - return [np.sum(X * X)] - - op = core.CreateOperator( - "SumSqrElements", - ["X"], - ["y"] - ) - - threshold = 0.01 if dtype == np.float16 else 0.005 - 
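
[editor note] The *_with_length reference functions in the reduce tests above all rely on the same reshape trick: flatten the non-reduced axes into a batch dimension, then reduce each column only over its first lengths[i] entries. A standalone sketch of the ReduceFrontSum case (helper and variable names are mine, under the same layout assumptions as ref_sum above):

import numpy as np

def reduce_front_sum_with_lengths_ref(X, lengths, num_reduce_dim):
    # Reduce the leading num_reduce_dim axes, masking each flattened
    # column by its per-position length -- the same math as ref_sum.
    kept_shape = X.shape[num_reduce_dim:]
    batch = int(np.prod(kept_shape))
    d = X.size // batch
    Y = X.reshape(d, batch)
    out = np.array([np.sum(Y[:lengths[i], i]) for i in range(batch)],
                   dtype=X.dtype)
    return out.reshape(kept_shape)

X = np.random.rand(4, 3, 5).astype(np.float32)
lengths = np.random.randint(1, 5, size=3 * 5).astype(np.int32)
print(reduce_front_sum_with_lengths_ref(X, lengths, 1).shape)  # (3, 5)
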
- self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=sumsqr_op, - threshold=threshold, - ) - - @given(n=st.integers(5, 8), **hu.gcs) - def test_elementwise_avg(self, n, gc, dc): - X = np.random.rand(n).astype(np.float32) - - def avg_op(X): - return [np.mean(X)] - - op = core.CreateOperator( - "SumElements", - ["X"], - ["y"], - average=1 - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=avg_op, - ) - - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=[X], - outputs_to_check=0, - outputs_with_grads=[0], - ) - - @serial.given(batch_size=st.integers(1, 3), - m=st.integers(1, 3), - n=st.integers(1, 4), - **hu.gcs) - def test_rowwise_max(self, batch_size, m, n, gc, dc): - X = np.random.rand(batch_size, m, n).astype(np.float32) - - def rowwise_max(X): - return [np.max(X, axis=2)] - - op = core.CreateOperator( - "RowwiseMax", - ["x"], - ["y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=rowwise_max, - ) - - @serial.given(batch_size=st.integers(1, 3), - m=st.integers(1, 3), - n=st.integers(1, 4), - **hu.gcs) - def test_columnwise_max(self, batch_size, m, n, gc, dc): - X = np.random.rand(batch_size, m, n).astype(np.float32) - - def columnwise_max(X): - return [np.max(X, axis=1)] - - op = core.CreateOperator( - "ColwiseMax", - ["x"], - ["y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=columnwise_max, - ) - - # Test shape inference logic - net = core.Net("test_shape_inference") - workspace.FeedBlob("x", X) - output = net.ColwiseMax(["x"], ["y"]) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - - self.assertEqual(shapes[output], list(workspace.blobs[output].shape)) - self.assertEqual(shapes[output], [X.shape[0]] + [X.shape[2]]) - self.assertEqual(types[output], core.DataType.FLOAT) diff --git a/caffe2/python/operator_test/reshape_ops_test.py b/caffe2/python/operator_test/reshape_ops_test.py deleted file mode 100644 index dc90b6815f01..000000000000 --- a/caffe2/python/operator_test/reshape_ops_test.py +++ /dev/null @@ -1,211 +0,0 @@ - - - - -import numpy as np -from numpy.testing import assert_array_equal - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -from caffe2.proto import caffe2_pb2 - - -class TestLengthsToShapeOps(TestCase): - def test_lengths_to_shape_ops(self): - workspace.FeedBlob('l', np.array([200, 200, 200], dtype=np.int32)) - workspace.RunOperatorOnce(core.CreateOperator( - 'LengthsToShape', ['l'], ['s'])) - workspace.FeedBlob('res', np.array([3, 200], dtype=np.int32)) - assert_array_equal(workspace.FetchBlob('s'), workspace.FetchBlob('res')) - - def test_reshape_ops(self): - workspace.FeedBlob('res', np.array([[0, 0, 0, 0]], dtype=np.float32)) - workspace.FeedBlob('shape', np.array([1, 4], dtype=np.int32)) - workspace.FeedBlob('input', np.zeros((2, 2), dtype=np.float32)) - workspace.RunOperatorOnce(core.CreateOperator( - 'Reshape', ['input', 'shape'], ['output', 'old_shape'])) - assert_array_equal(workspace.FetchBlob('output'), - workspace.FetchBlob('res')) - - def test_basic_reshape(self): - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(2, 4)) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(2, 4), arg_shape=False) - - def test_missing_dim(self): - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(-1, 8)) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(-1, 
8), arg_shape=False) - - def test_in_place(self): - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(-1, 8), in_place=True) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(-1, 8), - in_place=True, arg_shape=False) - - def test_zero_dim(self): - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, 0, 0), - expected_shape=(4, 2, 1)) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, 0, 0), - expected_shape=(4, 2, 1), arg_shape=False) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, 2, 1), - expected_shape=(4, 2, 1)) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, 2, 1), - expected_shape=(4, 2, 1), arg_shape=False) - _test_reshape_output_and_gradient(old_shape=(0, 0), new_shape=(0, 0, 0), - expected_shape=(0, 0, 0), arg_shape=False) - - def test_zero_dim_and_missing_dim(self): - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, -1, 0), - expected_shape=(4, 2, 1)) - _test_reshape_output_and_gradient(old_shape=(4, 2, 1), new_shape=(0, -1, 0), - expected_shape=(4, 2, 1), arg_shape=False) - _test_reshape_output_and_gradient(old_shape=(4, 3, 2), new_shape=(-1, 0), - expected_shape=(8, 3)) - _test_reshape_output_and_gradient(old_shape=(4, 3, 2), new_shape=(-1, 0), - expected_shape=(8, 3), arg_shape=False) - - # empty tensor will just have -1 dim = 0 - _test_reshape_output_and_gradient( - old_shape=(2, 0), - new_shape=(-1, 0), - expected_shape=(0, 0), - arg_shape=False - ) - - def test_backprop(self): - old_shape = (4, 2, 1) - new_shape = (1, 8) - X = np.random.rand(*old_shape).astype(np.float32) - Y = np.random.rand(*new_shape).astype(np.float32) - - net = core.Net('net') - - net.GivenTensorFill([], 'X', shape=old_shape, values=X.flatten()) - net.GivenTensorFill([], 'Y', shape=new_shape, values=Y.flatten()) - - net.Reshape(['X'], ['X_out', 'old_shape'], shape=new_shape) - net.DotProduct(['X_out', 'Y'], 'Z') - net.AddGradientOperators(['Z']) - - workspace.RunNetOnce(net) - - Z = workspace.FetchBlob('Z') - X_grad = workspace.FetchBlob('X_grad') - - # Check forward computation - np.testing.assert_allclose( - Z.squeeze(), X.reshape(new_shape).dot(Y.T).squeeze(), rtol=1e-5) - - # Check the shape of the gradient - np.testing.assert_array_equal(X_grad.shape, X.shape) - - # Check the gradient - np.testing.assert_allclose(X_grad, Y.reshape(old_shape), rtol=1e-5) - - def test_input_shape_changes(self): - workspace.FeedBlob( - 'input_blob', - np.array(np.random.rand(10, 20, 10), dtype=np.float32)) - net = core.Net('mynet') - z, _ = net.Reshape('input_blob', - ['z_reshape', 'dummy_size'], - shape=(-1, 10)) - workspace.CreateNet(net) - workspace.RunNet(net) - workspace.FeedBlob( - 'input_blob', - np.array(np.random.rand(10, 40, 10), dtype=np.float32)) - workspace.RunNet(net) - - def test_nonempty_tensor_gradient(self): - old_shape = [4, 2] - new_shape = [1, 2, -1] - expected_new_shape = [1, 2, 4] - _test_reshape_output_and_gradient( - old_shape=old_shape, - new_shape=new_shape, - expected_shape=expected_new_shape, - expected_gradient=np.ones(shape=old_shape) - ) - - def test_empty_tensor(self): - old_shape = [4, 0] - new_shape = [1, -1] - expected_new_shape = [1, 0] - _test_reshape_output_and_gradient( - old_shape=old_shape, - new_shape=new_shape, - expected_shape=expected_new_shape, - expected_gradient=np.empty(shape=old_shape) - ) - - def test_one_dim_empty_tensor_gradient(self): - old_shape = (0,) - new_shape = [1, -1] - expected_new_shape = [1, 0] - 
_test_reshape_output_and_gradient( - old_shape=old_shape, - new_shape=new_shape, - expected_shape=expected_new_shape, - expected_gradient=np.empty(shape=old_shape) - ) - - def test_one_dim_and_empty_tensor(self): - old_shape = (0,) - new_shape = [0, -1] - expected_new_shape = [0, 0] - _test_reshape_output_and_gradient(old_shape=old_shape, new_shape=new_shape, expected_shape=expected_new_shape) - - def test_scalar_to_tensor(self): - old_shape = () - new_shape = [1, -1] - expected_new_shape = [1, 1] - _test_reshape_output_and_gradient(old_shape=old_shape, new_shape=new_shape, expected_shape=expected_new_shape) - - -def _test_reshape_output_and_gradient( - old_shape, - new_shape, - expected_shape=None, - arg_shape=True, - in_place=False, - expected_gradient=None -): - devices = [core.DeviceOption(caffe2_pb2.CPU, 0)] - if workspace.NumGpuDevices() > 0: - devices.append(core.DeviceOption(workspace.GpuDeviceType, 0)) - - for device_opt in devices: - with core.DeviceScope(device_opt): - if expected_shape is None: - expected_shape = new_shape - net = core.Net('net') - - if len(old_shape) == 0: - # scalar, convert to tensor before feeding to blob - X = np.atleast_1d(np.random.rand(*old_shape)) - else: - X = np.random.rand(*old_shape).astype(np.float32) - blob_in = 'X' - blob_out = blob_in if in_place else blob_in + '_out' - - if arg_shape: - out, _ = net.Reshape([blob_in], [blob_out, 'old_shape'], shape=new_shape) - else: - out, _ = net.Reshape([blob_in, 'new_shape'], [blob_out, 'old_shape']) - workspace.FeedBlob('new_shape', np.asarray(new_shape)) - - workspace.FeedBlob(blob_in, X) - if expected_gradient is not None: - net.AddGradientOperators([out]) - workspace.CreateNet(net) - workspace.RunNetOnce(net) - - Y = workspace.FetchBlob(blob_out) - np.testing.assert_allclose(Y, X.reshape(expected_shape)) - if expected_gradient is not None: - data_grad = workspace.FetchBlob(blob_in + '_grad') - np.testing.assert_array_equal(data_grad, expected_gradient) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/resize_op_test.py b/caffe2/python/operator_test/resize_op_test.py deleted file mode 100644 index cd90656f607d..000000000000 --- a/caffe2/python/operator_test/resize_op_test.py +++ /dev/null @@ -1,254 +0,0 @@ - - - - -import numpy as np -import hypothesis.strategies as st -import unittest -import caffe2.python.hypothesis_test_util as hu -from caffe2.python import core -from caffe2.proto import caffe2_pb2 -from hypothesis import assume, given, settings - - -class TestResize(hu.HypothesisTestCase): - @given(height_scale=st.floats(0.25, 4.0) | st.just(2.0), - width_scale=st.floats(0.25, 4.0) | st.just(2.0), - height=st.integers(4, 32), - width=st.integers(4, 32), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - @settings(max_examples=10, deadline=None) - def test_nearest(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, order, - gc, dc): - - assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - # NHWC currently only supported for CPU. Ignore other devices. 
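
[editor note] Stepping back from the harness: the reshape cases above pin down the semantics of the shape argument, where an entry of 0 copies the corresponding input dimension and -1 is inferred from the remaining size (degenerating to 0 for empty tensors). A rough sketch of that rule, reconstructed from the expected shapes in the deleted tests rather than from the operator's actual C++ implementation:

import numpy as np

def caffe2_reshape_shape(old_shape, new_shape):
    # 0 -> copy the input dim at that position; -1 -> infer the extent.
    out = [old_shape[i] if d == 0 else d for i, d in enumerate(new_shape)]
    if -1 in out:
        known = int(np.prod([d for d in out if d != -1]))
        total = int(np.prod(old_shape))
        out[out.index(-1)] = 0 if known == 0 else total // known
    return out

print(caffe2_reshape_shape((4, 2, 1), (0, -1, 0)))  # [4, 2, 1]
print(caffe2_reshape_shape((4, 3, 2), (-1, 0)))     # [8, 3]
print(caffe2_reshape_shape((2, 0), (-1, 0)))        # [0, 0]
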
- if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - np.random.seed(seed) - op = core.CreateOperator( - "ResizeNearest", - ["X"], - ["Y"], - width_scale=width_scale, - height_scale=height_scale, - order=order, - ) - - X = np.random.rand( - batch_size, num_channels, height, width).astype(np.float32) - if order == "NHWC": - X = X.transpose([0, 2, 3, 1]) - - def ref(X): - output_height = np.int32(height * height_scale) - output_width = np.int32(width * width_scale) - - output_h_idxs, output_w_idxs = np.meshgrid(np.arange(output_height), - np.arange(output_width), - indexing='ij') - - input_h_idxs = np.minimum( - output_h_idxs / height_scale, height - 1).astype(np.int32) - input_w_idxs = np.minimum( - output_w_idxs / width_scale, width - 1).astype(np.int32) - - if order == "NCHW": - Y = X[:, :, input_h_idxs, input_w_idxs] - else: - Y = X[:, input_h_idxs, input_w_idxs, :] - - return Y, - - self.assertReferenceChecks(gc, op, [X], ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.1, threshold=1e-2) - - @given(height_scale=st.floats(0.25, 4.0) | st.just(2.0), - width_scale=st.floats(0.25, 4.0) | st.just(2.0), - height=st.integers(4, 32), - width=st.integers(4, 32), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - def test_nearest_grad(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, order, gc, dc): - - assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - # NHWC currently only supported for CPU. Ignore other devices. - if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - np.random.seed(seed) - - output_height = np.int32(height * height_scale) - output_width = np.int32(width * width_scale) - X = np.random.rand(batch_size, - num_channels, - height, - width).astype(np.float32) - - dY = np.random.rand(batch_size, - num_channels, - output_height, - output_width).astype(np.float32) - if order == "NHWC": - X = X.transpose([0, 2, 3, 1]) - dY = dY.transpose([0, 2, 3, 1]) - - op = core.CreateOperator( - "ResizeNearestGradient", - ["dY", "X"], - ["dX"], - width_scale=width_scale, - height_scale=height_scale, - order=order, - ) - - def ref(dY, X): - dX = np.zeros_like(X) - - for i in range(output_height): - for j in range(output_width): - input_i = np.minimum(i / height_scale, height - 1).astype(np.int32) - input_j = np.minimum(j / width_scale, width - 1).astype(np.int32) - if order == "NCHW": - dX[:, :, input_i, input_j] += dY[:, :, i, j] - else: - dX[:, input_i, input_j, :] += dY[:, i, j, :] - return dX, - - self.assertDeviceChecks(dc, op, [dY, X], [0]) - self.assertReferenceChecks(gc, op, [dY, X], ref) - - @given(height_scale=st.floats(0.25, 4.0) | st.just(2.0), - width_scale=st.floats(0.25, 4.0) | st.just(2.0), - height=st.integers(4, 8), - width=st.integers(4, 8), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - @settings(deadline=10000) - def test_nearest_onnx(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, order, - gc, dc): - - assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - # NHWC currently only supported for CPU. Ignore other devices. 
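
[editor note] For reference outside the test harness, the forward nearest-neighbor math that ref() above implements is simply: each output pixel reads input[min(floor(out_idx / scale), in_size - 1)], and the gradient scatter-adds back along the same indices. A standalone NumPy sketch of the forward pass, NCHW layout only (the helper name is mine):

import numpy as np

def resize_nearest_ref(X, height_scale, width_scale):
    # Same index math as the deleted ref(): truncating division of the
    # output coordinate by the scale, clamped to the input extent.
    N, C, H, W = X.shape
    out_h, out_w = np.int32(H * height_scale), np.int32(W * width_scale)
    hs, ws = np.meshgrid(np.arange(out_h), np.arange(out_w), indexing="ij")
    in_h = np.minimum(hs / height_scale, H - 1).astype(np.int32)
    in_w = np.minimum(ws / width_scale, W - 1).astype(np.int32)
    return X[:, :, in_h, in_w]

X = np.random.rand(1, 3, 4, 4).astype(np.float32)
print(resize_nearest_ref(X, 2.0, 2.0).shape)  # (1, 3, 8, 8)
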
- if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - np.random.seed(seed) - op = core.CreateOperator( - "ResizeNearest", - ["X", "scales"], - ["Y"], - order=order, - ) - - X = np.random.rand( - batch_size, num_channels, height, width).astype(np.float32) - if order == "NHWC": - X = X.transpose([0, 2, 3, 1]) - - scales = np.array([height_scale, width_scale]).astype(np.float32) - - def ref(X, scales): - output_height = np.int32(height * scales[0]) - output_width = np.int32(width * scales[1]) - - output_h_idxs, output_w_idxs = np.meshgrid(np.arange(output_height), - np.arange(output_width), - indexing='ij') - - input_h_idxs = np.minimum( - output_h_idxs / scales[0], height - 1).astype(np.int32) - input_w_idxs = np.minimum( - output_w_idxs / scales[1], width - 1).astype(np.int32) - - if order == "NCHW": - Y = X[:, :, input_h_idxs, input_w_idxs] - else: - Y = X[:, input_h_idxs, input_w_idxs, :] - - return Y, - - self.assertReferenceChecks(gc, op, [X, scales], ref) - self.assertDeviceChecks(dc, op, [X, scales], [0]) - self.assertGradientChecks(gc, op, [X, scales], 0, [0], stepsize=0.1, - threshold=1e-2) - - @given(height_scale=st.floats(0.25, 4.0) | st.just(2.0), - width_scale=st.floats(0.25, 4.0) | st.just(2.0), - height=st.integers(4, 8), - width=st.integers(4, 8), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - **hu.gcs) - def test_nearest_onnx_grad(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, order, gc, dc): - - assume(order == "NCHW" or gc.device_type == caffe2_pb2.CPU) - # NHWC currently only supported for CPU. Ignore other devices. - if order == "NHWC": - dc = [d for d in dc if d.device_type == caffe2_pb2.CPU] - - np.random.seed(seed) - - output_height = np.int32(height * height_scale) - output_width = np.int32(width * width_scale) - X = np.random.rand(batch_size, - num_channels, - height, - width).astype(np.float32) - dY = np.random.rand(batch_size, - num_channels, - output_height, - output_width).astype(np.float32) - if order == "NHWC": - X = X.transpose([0, 2, 3, 1]) - dY = dY.transpose([0, 2, 3, 1]) - - scales = np.array([height_scale, width_scale]).astype(np.float32) - - op = core.CreateOperator( - "ResizeNearestGradient", - ["dY", "X", "scales"], - ["dX"], - order=order, - ) - - def ref(dY, X, scales): - dX = np.zeros_like(X) - - for i in range(output_height): - for j in range(output_width): - input_i = np.minimum(i / scales[0], height - 1).astype(np.int32) - input_j = np.minimum(j / scales[1], width - 1).astype(np.int32) - - if order == "NCHW": - dX[:, :, input_i, input_j] += dY[:, :, i, j] - else: - dX[:, input_i, input_j, :] += dY[:, i, j, :] - - return dX, - - self.assertDeviceChecks(dc, op, [dY, X, scales], [0]) - self.assertReferenceChecks(gc, op, [dY, X, scales], ref) - - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/rmac_regions_op_test.py b/caffe2/python/operator_test/rmac_regions_op_test.py deleted file mode 100644 index 084d7402df5f..000000000000 --- a/caffe2/python/operator_test/rmac_regions_op_test.py +++ /dev/null @@ -1,99 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - - -class RMACRegionsOpTest(hu.HypothesisTestCase): - @given( - n=st.integers(500, 500), - h=st.integers(1, 10), - w=st.integers(1, 10), - 
scales=st.integers(1, 3), - **hu.gcs - ) - @settings(deadline=10000) - def test(self, n, h, w, scales, gc, dc): - X = np.random.rand(n, 64, h, w).astype(np.float32) - overlap = 0.4 - - def ref_op(X): - N, H, W = X.shape[0], X.shape[2], X.shape[3] - - # Possible regions for the long dimension - steps = np.array((2, 3, 4, 5, 6, 7), dtype=np.float32) - minW = np.minimum(H, W) - - # steps(idx) regions for long dimension - b = (np.maximum(H, W) - minW) / (steps - 1) - idx = np.argmin( - np.abs(((minW**2 - minW * b) / minW**2) - overlap)) + 1 - - # Region overplus per dimension - Wd = 0 - Hd = 0 - if H < W: - Wd = idx - elif H > W: - Hd = idx - - regions_xywh = [] - for l in range(1, scales + 1): - wl = np.floor(2 * minW / (l + 1)) - - # Center coordinates - if l + Wd - 1 > 0: - b = (W - wl) / (l + Wd - 1) - else: - b = 0 - cenW = np.floor(b * np.arange(l - 1 + Wd + 1)) - - # Center coordinates - if l + Hd - 1 > 0: - b = (H - wl) / (l + Hd - 1) - else: - b = 0 - cenH = np.floor(b * np.arange(l - 1 + Hd + 1)) - - for i_ in cenW: - for j_ in cenH: - regions_xywh.append([i_, j_, wl, wl]) - - # Round the regions. Careful with the borders! - for i in range(len(regions_xywh)): - for j in range(4): - regions_xywh[i][j] = int(round(regions_xywh[i][j])) - if regions_xywh[i][0] + regions_xywh[i][2] > W: - regions_xywh[i][0] -= ( - (regions_xywh[i][0] + regions_xywh[i][2]) - W - ) - if regions_xywh[i][1] + regions_xywh[i][3] > H: - regions_xywh[i][1] -= ( - (regions_xywh[i][1] + regions_xywh[i][3]) - H - ) - # Filter out 0-sized regions - regions_xywh = [r for r in regions_xywh if r[2] * r[3] > 0] - - # Convert to ROIPoolOp format: (batch_index x1 y1 x2 y2) - regions = [ - [i, x, y, x + w - 1, y + h - 1] - for i in np.arange(N) for x, y, w, h in regions_xywh - ] - return (np.array(regions).astype(np.float32), ) - - op = core.CreateOperator( - 'RMACRegions', - ['X'], - ['RMAC_REGIONS'], - scales=scales, - overlap=overlap, - ) - - # Check against numpy reference - self.assertReferenceChecks(gc, op, [X], ref_op) diff --git a/caffe2/python/operator_test/rms_norm_op_test.py b/caffe2/python/operator_test/rms_norm_op_test.py deleted file mode 100644 index 797b3c9a01c3..000000000000 --- a/caffe2/python/operator_test/rms_norm_op_test.py +++ /dev/null @@ -1,47 +0,0 @@ - - -from caffe2.python import core -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestRMSNormOp(hu.HypothesisTestCase): - @given( - M=st.integers(0, 8), - N=st.integers(1, 16), - eps=st.floats(0, 1e-3), - dtype=st.sampled_from([np.float32, np.float64]), - **hu.gcs, - ) - @settings(deadline=None) - def test_rms_norm(self, M, N, eps, dtype, gc, dc): - X = (np.random.randn(M, N) * 2.0 + 1.0).astype(dtype) - gamma = np.random.randn(N).astype(dtype) - beta = np.random.randn(N).astype(dtype) - - op = core.CreateOperator( - "RMSNorm", - ["X", "gamma", "beta"], - ["Y", "rrms"], - eps=eps, - ) - - def rms_norm_ref(X, gamma, beta): - rrms = 1.0 / np.sqrt(np.mean(np.square(X), axis=1) + eps) - Y = X * np.expand_dims(rrms, axis=1) * gamma + beta - return Y, rrms - - inputs = [X, gamma, beta] - self.assertReferenceChecks(gc, op, inputs, rms_norm_ref) - self.assertDeviceChecks(dc, op, inputs, [0, 1]) - for i in range(len(inputs)): - self.assertGradientChecks(gc, op, inputs, i, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py 
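
[editor note] The last and largest deletion below, rnn_cell_test.py, removes hand-written NumPy references for LSTM, MILSTM and attention cells. For orientation, here is a minimal sketch of the per-step LSTM math that its lstm_unit reference implements, omitting the sequence-length masking and drop_states handling; the gate order (i, f, o, g) matches the deleted code, but the simplified signature and names are mine:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(h_prev, c_prev, gates, forget_bias=0.0):
    # One step of a standard LSTM cell; gates has shape (N, 4*D).
    N, D = c_prev.shape
    g = gates.reshape(N, 4, D)
    i_t = sigmoid(g[:, 0])
    f_t = sigmoid(g[:, 1] + forget_bias)
    o_t = sigmoid(g[:, 2])
    g_t = np.tanh(g[:, 3])
    c_t = f_t * c_prev + i_t * g_t
    h_t = o_t * np.tanh(c_t)
    return h_t, c_t

h, c = np.zeros((2, 3)), np.zeros((2, 3))
h, c = lstm_step(h, c, np.random.randn(2, 12).astype(np.float32))
print(h.shape, c.shape)  # (2, 3) (2, 3)
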
deleted file mode 100644 index 8fe037ccb70c..000000000000 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ /dev/null @@ -1,1776 +0,0 @@ - - - - - -from caffe2.python import ( - core, gradient_checker, rnn_cell, workspace, scope, utils -) -from caffe2.python.attention import AttentionType -from caffe2.python.model_helper import ModelHelper, ExtractPredictorNet -from caffe2.python.rnn.rnn_cell_test_util import sigmoid, tanh, _prepare_rnn -from caffe2.proto import caffe2_pb2 -import caffe2.python.hypothesis_test_util as hu - -from functools import partial -from hypothesis import assume, given -from hypothesis import settings as ht_settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -def lstm_unit(*args, **kwargs): - forget_bias = kwargs.get('forget_bias', 0.0) - drop_states = kwargs.get('drop_states', False) - sequence_lengths = kwargs.get('sequence_lengths', True) - - if sequence_lengths: - hidden_t_prev, cell_t_prev, gates, seq_lengths, timestep = args - else: - hidden_t_prev, cell_t_prev, gates, timestep = args - D = cell_t_prev.shape[2] - G = gates.shape[2] - N = gates.shape[1] - t = (timestep * np.ones(shape=(N, D))).astype(np.int32) - assert t.shape == (N, D) - assert G == 4 * D - # Resize to avoid broadcasting inconsistencies with NumPy - gates = gates.reshape(N, 4, D) - cell_t_prev = cell_t_prev.reshape(N, D) - i_t = gates[:, 0, :].reshape(N, D) - f_t = gates[:, 1, :].reshape(N, D) - o_t = gates[:, 2, :].reshape(N, D) - g_t = gates[:, 3, :].reshape(N, D) - i_t = sigmoid(i_t) - f_t = sigmoid(f_t + forget_bias) - o_t = sigmoid(o_t) - g_t = tanh(g_t) - if sequence_lengths: - seq_lengths = (np.ones(shape=(N, D)) * - seq_lengths.reshape(N, 1)).astype(np.int32) - assert seq_lengths.shape == (N, D) - valid = (t < seq_lengths).astype(np.int32) - else: - valid = np.ones(shape=(N, D)) - assert valid.shape == (N, D) - cell_t = ((f_t * cell_t_prev) + (i_t * g_t)) * (valid) + \ - (1 - valid) * cell_t_prev * (1 - drop_states) - assert cell_t.shape == (N, D) - hidden_t = (o_t * tanh(cell_t)) * valid + hidden_t_prev * ( - 1 - valid) * (1 - drop_states) - hidden_t = hidden_t.reshape(1, N, D) - cell_t = cell_t.reshape(1, N, D) - return hidden_t, cell_t - - -def layer_norm_with_scale_and_bias_ref(X, scale, bias, axis=-1, epsilon=1e-4): - left = np.prod(X.shape[:axis]) - reshaped = np.reshape(X, [left, -1]) - mean = np.mean(reshaped, axis=1).reshape([left, 1]) - stdev = np.sqrt( - np.mean(np.square(reshaped), axis=1).reshape([left, 1]) - - np.square(mean) + epsilon - ) - norm = (reshaped - mean) / stdev - norm = np.reshape(norm, X.shape) - adjusted = scale * norm + bias - - return adjusted - - -def layer_norm_lstm_reference( - input, - hidden_input, - cell_input, - gates_w, - gates_b, - gates_t_norm_scale, - gates_t_norm_bias, - seq_lengths, - forget_bias, - drop_states=False -): - T = input.shape[0] - N = input.shape[1] - G = input.shape[2] - D = hidden_input.shape[hidden_input.ndim - 1] - hidden = np.zeros(shape=(T + 1, N, D)) - cell = np.zeros(shape=(T + 1, N, D)) - assert hidden.shape[0] == T + 1 - assert cell.shape[0] == T + 1 - assert hidden.shape[1] == N - assert cell.shape[1] == N - cell[0, :, :] = cell_input - hidden[0, :, :] = hidden_input - for t in range(T): - input_t = input[t].reshape(1, N, G) - print(input_t.shape) - hidden_t_prev = hidden[t].reshape(1, N, D) - cell_t_prev = cell[t].reshape(1, N, D) - gates = np.dot(hidden_t_prev, gates_w.T) + gates_b - gates = gates + input_t - - gates = layer_norm_with_scale_and_bias_ref( - gates, 
gates_t_norm_scale, gates_t_norm_bias - ) - - hidden_t, cell_t = lstm_unit( - hidden_t_prev, - cell_t_prev, - gates, - seq_lengths, - t, - forget_bias=forget_bias, - drop_states=drop_states, - ) - hidden[t + 1] = hidden_t - cell[t + 1] = cell_t - return ( - hidden[1:], - hidden[-1].reshape(1, N, D), - cell[1:], - cell[-1].reshape(1, N, D) - ) - - -def lstm_reference(input, hidden_input, cell_input, - gates_w, gates_b, seq_lengths, forget_bias, - drop_states=False): - T = input.shape[0] - N = input.shape[1] - G = input.shape[2] - D = hidden_input.shape[hidden_input.ndim - 1] - hidden = np.zeros(shape=(T + 1, N, D)) - cell = np.zeros(shape=(T + 1, N, D)) - assert hidden.shape[0] == T + 1 - assert cell.shape[0] == T + 1 - assert hidden.shape[1] == N - assert cell.shape[1] == N - cell[0, :, :] = cell_input - hidden[0, :, :] = hidden_input - for t in range(T): - input_t = input[t].reshape(1, N, G) - hidden_t_prev = hidden[t].reshape(1, N, D) - cell_t_prev = cell[t].reshape(1, N, D) - gates = np.dot(hidden_t_prev, gates_w.T) + gates_b - gates = gates + input_t - hidden_t, cell_t = lstm_unit( - hidden_t_prev, - cell_t_prev, - gates, - seq_lengths, - t, - forget_bias=forget_bias, - drop_states=drop_states, - ) - hidden[t + 1] = hidden_t - cell[t + 1] = cell_t - return ( - hidden[1:], - hidden[-1].reshape(1, N, D), - cell[1:], - cell[-1].reshape(1, N, D) - ) - - -def multi_lstm_reference(input, hidden_input_list, cell_input_list, - i2h_w_list, i2h_b_list, gates_w_list, gates_b_list, - seq_lengths, forget_bias, drop_states=False): - num_layers = len(hidden_input_list) - assert len(cell_input_list) == num_layers - assert len(i2h_w_list) == num_layers - assert len(i2h_b_list) == num_layers - assert len(gates_w_list) == num_layers - assert len(gates_b_list) == num_layers - - for i in range(num_layers): - layer_input = np.dot(input, i2h_w_list[i].T) + i2h_b_list[i] - h_all, h_last, c_all, c_last = lstm_reference( - layer_input, - hidden_input_list[i], - cell_input_list[i], - gates_w_list[i], - gates_b_list[i], - seq_lengths, - forget_bias, - drop_states=drop_states, - ) - input = h_all - return h_all, h_last, c_all, c_last - - -def compute_regular_attention_logits( - hidden_t, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - attention_v, - weighted_encoder_outputs, - encoder_outputs_for_dot_product, - coverage_prev, - coverage_weights, -): - weighted_hidden_t = np.dot( - hidden_t, - weighted_decoder_hidden_state_t_w.T, - ) + weighted_decoder_hidden_state_t_b - attention_v = attention_v.reshape([-1]) - return np.sum( - attention_v * np.tanh(weighted_encoder_outputs + weighted_hidden_t), - axis=2, - ) - - -def compute_recurrent_attention_logits( - hidden_t, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - attention_v, - weighted_encoder_outputs, - encoder_outputs_for_dot_product, - coverage_prev, - coverage_weights, -): - weighted_hidden_t = np.dot( - hidden_t, - weighted_decoder_hidden_state_t_w.T, - ) + weighted_decoder_hidden_state_t_b - weighted_prev_attention_context = np.dot( - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w.T - ) + weighted_prev_attention_context_b - attention_v = attention_v.reshape([-1]) - return np.sum( - attention_v * np.tanh( - weighted_encoder_outputs + 
weighted_hidden_t + - weighted_prev_attention_context - ), - axis=2, - ) - - -def compute_dot_attention_logits( - hidden_t, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - attention_v, - weighted_encoder_outputs, - encoder_outputs_for_dot_product, - coverage_prev, - coverage_weights, -): - hidden_t_for_dot_product = np.transpose(hidden_t, axes=[1, 2, 0]) - if ( - weighted_decoder_hidden_state_t_w is not None and - weighted_decoder_hidden_state_t_b is not None - ): - hidden_t_for_dot_product = np.matmul( - weighted_decoder_hidden_state_t_w, - hidden_t_for_dot_product, - ) + np.expand_dims(weighted_decoder_hidden_state_t_b, axis=1) - attention_logits_t = np.sum( - np.matmul( - encoder_outputs_for_dot_product, - hidden_t_for_dot_product, - ), - axis=2, - ) - return np.transpose(attention_logits_t) - - -def compute_coverage_attention_logits( - hidden_t, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - attention_v, - weighted_encoder_outputs, - encoder_outputs_for_dot_product, - coverage_prev, - coverage_weights, -): - weighted_hidden_t = np.dot( - hidden_t, - weighted_decoder_hidden_state_t_w.T, - ) + weighted_decoder_hidden_state_t_b - coverage_part = coverage_prev.T * coverage_weights - encoder_part = weighted_encoder_outputs + coverage_part - attention_v = attention_v.reshape([-1]) - return np.sum( - attention_v * np.tanh(encoder_part + weighted_hidden_t), - axis=2, - ) - - -def lstm_with_attention_reference( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - encoder_outputs_transposed, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs, - coverage_weights, - attention_v, - attention_zeros, - compute_attention_logits, -): - encoder_outputs = np.transpose(encoder_outputs_transposed, axes=[2, 0, 1]) - encoder_outputs_for_dot_product = np.transpose( - encoder_outputs_transposed, - [0, 2, 1], - ) - decoder_input_length = input.shape[0] - batch_size = input.shape[1] - decoder_input_dim = input.shape[2] - decoder_state_dim = initial_hidden_state.shape[2] - encoder_output_dim = encoder_outputs.shape[2] - hidden = np.zeros( - shape=(decoder_input_length + 1, batch_size, decoder_state_dim)) - cell = np.zeros( - shape=(decoder_input_length + 1, batch_size, decoder_state_dim)) - attention_weighted_encoder_context = np.zeros( - shape=(decoder_input_length + 1, batch_size, encoder_output_dim)) - cell[0, :, :] = initial_cell_state - hidden[0, :, :] = initial_hidden_state - attention_weighted_encoder_context[0, :, :] = ( - initial_attention_weighted_encoder_context - ) - encoder_length = encoder_outputs.shape[0] - coverage = np.zeros( - shape=(decoder_input_length + 1, batch_size, encoder_length)) - for t in range(decoder_input_length): - input_t = input[t].reshape(1, batch_size, decoder_input_dim) - hidden_t_prev = hidden[t].reshape(1, batch_size, decoder_state_dim) - cell_t_prev = cell[t].reshape(1, batch_size, decoder_state_dim) - attention_weighted_encoder_context_t_prev = ( - attention_weighted_encoder_context[t].reshape( - 1, batch_size, encoder_output_dim) - ) - gates_input = np.concatenate( 
- (hidden_t_prev, attention_weighted_encoder_context_t_prev), - axis=2, - ) - gates = np.dot(gates_input, gates_w.T) + gates_b - gates = gates + input_t - hidden_t, cell_t = lstm_unit(hidden_t_prev, cell_t_prev, gates, - decoder_input_lengths, t) - hidden[t + 1] = hidden_t - cell[t + 1] = cell_t - - coverage_prev = coverage[t].reshape(1, batch_size, encoder_length) - - attention_logits_t = compute_attention_logits( - hidden_t, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - attention_weighted_encoder_context_t_prev, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - attention_v, - weighted_encoder_outputs, - encoder_outputs_for_dot_product, - coverage_prev, - coverage_weights, - ) - - attention_logits_t_exp = np.exp(attention_logits_t) - attention_weights_t = ( - attention_logits_t_exp / - np.sum(attention_logits_t_exp, axis=0).reshape([1, -1]) - ) - coverage[t + 1, :, :] = coverage[t, :, :] + attention_weights_t.T - attention_weighted_encoder_context[t + 1] = np.sum( - ( - encoder_outputs * - attention_weights_t.reshape([-1, batch_size, 1]) - ), - axis=0, - ) - return ( - hidden[1:], - hidden[-1].reshape(1, batch_size, decoder_state_dim), - cell[1:], - cell[-1].reshape(1, batch_size, decoder_state_dim), - attention_weighted_encoder_context[1:], - attention_weighted_encoder_context[-1].reshape( - 1, - batch_size, - encoder_output_dim, - ) - ) - - -def lstm_with_regular_attention_reference( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs, - attention_v, - attention_zeros, - encoder_outputs_transposed, -): - return lstm_with_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_prev_attention_context_w=None, - weighted_prev_attention_context_b=None, - weighted_decoder_hidden_state_t_w=weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b=weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs=weighted_encoder_outputs, - coverage_weights=None, - attention_v=attention_v, - attention_zeros=attention_zeros, - compute_attention_logits=compute_regular_attention_logits, - ) - - -def lstm_with_recurrent_attention_reference( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - weighted_prev_attention_context_w, - weighted_prev_attention_context_b, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs, - attention_v, - attention_zeros, - encoder_outputs_transposed, -): - return lstm_with_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_prev_attention_context_w=weighted_prev_attention_context_w, - 
weighted_prev_attention_context_b=weighted_prev_attention_context_b, - weighted_decoder_hidden_state_t_w=weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b=weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs=weighted_encoder_outputs, - coverage_weights=None, - attention_v=attention_v, - attention_zeros=attention_zeros, - compute_attention_logits=compute_recurrent_attention_logits, - ) - - -def lstm_with_dot_attention_reference( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - encoder_outputs_transposed, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, -): - return lstm_with_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_prev_attention_context_w=None, - weighted_prev_attention_context_b=None, - weighted_decoder_hidden_state_t_w=weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b=weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs=None, - coverage_weights=None, - attention_v=None, - attention_zeros=None, - compute_attention_logits=compute_dot_attention_logits, - ) - - -def lstm_with_dot_attention_reference_same_dim( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - encoder_outputs_transposed, -): - return lstm_with_dot_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_decoder_hidden_state_t_w=None, - weighted_decoder_hidden_state_t_b=None, - ) - - -def lstm_with_dot_attention_reference_different_dim( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - gates_w, - gates_b, - decoder_input_lengths, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - encoder_outputs_transposed, -): - return lstm_with_dot_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_decoder_hidden_state_t_w=weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b=weighted_decoder_hidden_state_t_b, - ) - - -def lstm_with_coverage_attention_reference( - input, - initial_hidden_state, - initial_cell_state, - initial_attention_weighted_encoder_context, - initial_coverage, - gates_w, - gates_b, - decoder_input_lengths, - weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs, - coverage_weights, - attention_v, - attention_zeros, - encoder_outputs_transposed, -): - return lstm_with_attention_reference( - input=input, - initial_hidden_state=initial_hidden_state, - 
initial_cell_state=initial_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - gates_w=gates_w, - gates_b=gates_b, - decoder_input_lengths=decoder_input_lengths, - encoder_outputs_transposed=encoder_outputs_transposed, - weighted_prev_attention_context_w=None, - weighted_prev_attention_context_b=None, - weighted_decoder_hidden_state_t_w=weighted_decoder_hidden_state_t_w, - weighted_decoder_hidden_state_t_b=weighted_decoder_hidden_state_t_b, - weighted_encoder_outputs=weighted_encoder_outputs, - coverage_weights=coverage_weights, - attention_v=attention_v, - attention_zeros=attention_zeros, - compute_attention_logits=compute_coverage_attention_logits, - ) - - -def milstm_reference( - input, - hidden_input, - cell_input, - gates_w, - gates_b, - alpha, - beta1, - beta2, - b, - seq_lengths, - forget_bias, - drop_states=False): - T = input.shape[0] - N = input.shape[1] - G = input.shape[2] - D = hidden_input.shape[hidden_input.ndim - 1] - hidden = np.zeros(shape=(T + 1, N, D)) - cell = np.zeros(shape=(T + 1, N, D)) - assert hidden.shape[0] == T + 1 - assert cell.shape[0] == T + 1 - assert hidden.shape[1] == N - assert cell.shape[1] == N - cell[0, :, :] = cell_input - hidden[0, :, :] = hidden_input - for t in range(T): - input_t = input[t].reshape(1, N, G) - hidden_t_prev = hidden[t].reshape(1, N, D) - cell_t_prev = cell[t].reshape(1, N, D) - gates = np.dot(hidden_t_prev, gates_w.T) + gates_b - gates = (alpha * gates * input_t) + \ - (beta1 * gates) + \ - (beta2 * input_t) + \ - b - hidden_t, cell_t = lstm_unit( - hidden_t_prev, - cell_t_prev, - gates, - seq_lengths, - t, - forget_bias=forget_bias, - drop_states=drop_states, - ) - hidden[t + 1] = hidden_t - cell[t + 1] = cell_t - return ( - hidden[1:], - hidden[-1].reshape(1, N, D), - cell[1:], - cell[-1].reshape(1, N, D) - ) - - -def layer_norm_milstm_reference( - input, - hidden_input, - cell_input, - gates_w, - gates_b, - alpha, - beta1, - beta2, - b, - gates_t_norm_scale, - gates_t_norm_bias, - seq_lengths, - forget_bias, - drop_states=False): - T = input.shape[0] - N = input.shape[1] - G = input.shape[2] - D = hidden_input.shape[hidden_input.ndim - 1] - hidden = np.zeros(shape=(T + 1, N, D)) - cell = np.zeros(shape=(T + 1, N, D)) - assert hidden.shape[0] == T + 1 - assert cell.shape[0] == T + 1 - assert hidden.shape[1] == N - assert cell.shape[1] == N - cell[0, :, :] = cell_input - hidden[0, :, :] = hidden_input - for t in range(T): - input_t = input[t].reshape(1, N, G) - hidden_t_prev = hidden[t].reshape(1, N, D) - cell_t_prev = cell[t].reshape(1, N, D) - gates = np.dot(hidden_t_prev, gates_w.T) + gates_b - gates = (alpha * gates * input_t) + \ - (beta1 * gates) + \ - (beta2 * input_t) + \ - b - gates = layer_norm_with_scale_and_bias_ref( - gates, gates_t_norm_scale, gates_t_norm_bias - ) - hidden_t, cell_t = lstm_unit( - hidden_t_prev, - cell_t_prev, - gates, - seq_lengths, - t, - forget_bias=forget_bias, - drop_states=drop_states, - ) - hidden[t + 1] = hidden_t - cell[t + 1] = cell_t - return ( - hidden[1:], - hidden[-1].reshape(1, N, D), - cell[1:], - cell[-1].reshape(1, N, D) - ) - - -def lstm_input(): - ''' - Create input tensor where each dimension is from 1 to 4, ndim=3 and - last dimension size is a factor of 4 - ''' - dims_ = st.tuples( - st.integers(min_value=1, max_value=4), # t - st.integers(min_value=1, max_value=4), # n - st.integers(min_value=1, max_value=4), # d - ) - - def create_input(dims): - dims = list(dims) - dims[2] *= 4 - return hu.arrays(dims) - - return 
dims_.flatmap(create_input) - - -def _prepare_attention(t, n, dim_in, encoder_dim, - forward_only=False, T=None, - dim_out=None, residual=False, - final_dropout=False): - if dim_out is None: - dim_out = [dim_in] - print("Dims: t={} n={} dim_in={} dim_out={}".format(t, n, dim_in, dim_out)) - - model = ModelHelper(name='external') - - def generate_input_state(shape): - return np.random.random(shape).astype(np.float32) - - initial_states = [] - for layer_id, d in enumerate(dim_out): - h, c = model.net.AddExternalInputs( - "hidden_init_{}".format(layer_id), - "cell_init_{}".format(layer_id), - ) - initial_states.extend([h, c]) - workspace.FeedBlob(h, generate_input_state((1, n, d))) - workspace.FeedBlob(c, generate_input_state((1, n, d))) - - awec_init = model.net.AddExternalInputs([ - 'initial_attention_weighted_encoder_context', - ]) - initial_states.append(awec_init) - workspace.FeedBlob( - awec_init, - generate_input_state((1, n, encoder_dim)), - ) - - # Due to convoluted RNN scoping logic we make sure that things - # work from a namescope - with scope.NameScope("test_name_scope"): - ( - input_blob, - seq_lengths, - encoder_outputs, - weighted_encoder_outputs, - ) = model.net.AddScopedExternalInputs( - 'input_blob', - 'seq_lengths', - 'encoder_outputs', - 'weighted_encoder_outputs', - ) - - layer_input_dim = dim_in - cells = [] - for layer_id, d in enumerate(dim_out): - - cell = rnn_cell.MILSTMCell( - name='decoder_{}'.format(layer_id), - forward_only=forward_only, - input_size=layer_input_dim, - hidden_size=d, - forget_bias=0.0, - memory_optimization=False, - ) - cells.append(cell) - layer_input_dim = d - - decoder_cell = rnn_cell.MultiRNNCell( - cells, - name='decoder', - residual_output_layers=range(1, len(cells)) if residual else None, - ) - - attention_cell = rnn_cell.AttentionCell( - encoder_output_dim=encoder_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=None, - decoder_cell=decoder_cell, - decoder_state_dim=dim_out[-1], - name='attention_decoder', - attention_type=AttentionType.Recurrent, - weighted_encoder_outputs=weighted_encoder_outputs, - attention_memory_optimization=True, - ) - if final_dropout: - # dropout ratio of 0.0 used to test mechanism but not interfere - # with numerical tests - attention_cell = rnn_cell.DropoutCell( - internal_cell=attention_cell, - dropout_ratio=0.0, - name='dropout', - forward_only=forward_only, - is_test=False, - ) - - attention_cell = ( - attention_cell if T is None - else rnn_cell.UnrolledCell(attention_cell, T) - ) - - output_indices = decoder_cell.output_indices - output_indices.append(2 * len(cells)) - outputs_with_grads = [2 * i for i in output_indices] - - final_output, state_outputs = attention_cell.apply_over_sequence( - model=model, - inputs=input_blob, - seq_lengths=seq_lengths, - initial_states=initial_states, - outputs_with_grads=outputs_with_grads, - ) - - workspace.RunNetOnce(model.param_init_net) - - workspace.FeedBlob( - seq_lengths, - np.random.randint(1, t + 1, size=(n,)).astype(np.int32) - ) - - return { - 'final_output': final_output, - 'net': model.net, - 'initial_states': initial_states, - 'input_blob': input_blob, - 'encoder_outputs': encoder_outputs, - 'weighted_encoder_outputs': weighted_encoder_outputs, - 'outputs_with_grads': outputs_with_grads, - } - - -class MulCell(rnn_cell.RNNCell): - def _apply(self, model, input_t, - seq_lengths, states, timestep, extra_inputs): - assert len(states) == 1 - result = model.net.Mul([input_t, states[0]]) - model.net.AddExternalOutput(result) - return [result] - - def 
get_state_names(self): - return [self.scope("state")] - - -def prepare_mul_rnn(model, input_blob, shape, T, outputs_with_grad, num_layers): - print("Shape: ", shape) - t, n, d = shape - cells = [MulCell(name="layer_{}".format(i)) for i in range(num_layers)] - cell = rnn_cell.MultiRNNCell(name="multi_mul_rnn", cells=cells) - if T is not None: - cell = rnn_cell.UnrolledCell(cell, T=T) - states = [ - model.param_init_net.ConstantFill( - [], "initial_state_{}".format(i), value=1.0, shape=[1, n, d]) - for i in range(num_layers)] - _, results = cell.apply_over_sequence( - model=model, - inputs=input_blob, - initial_states=states, - outputs_with_grads=[ - x + 2 * (num_layers - 1) for x in outputs_with_grad - ], - seq_lengths=None, - ) - return results[-2:] - - -class RNNCellTest(hu.HypothesisTestCase): - @given( - input_tensor=hu.tensor(min_dim=3, max_dim=3, max_value=3), - num_layers=st.integers(1, 4), - outputs_with_grad=st.sampled_from( - [[0], [1], [0, 1]] - ), - ) - @ht_settings(max_examples=10, deadline=None) - def test_unroll_mul(self, input_tensor, num_layers, outputs_with_grad): - outputs = [] - nets = [] - input_blob = None - for T in [input_tensor.shape[0], None]: - model = ModelHelper("rnn_mul_{}".format( - "unroll" if T else "dynamic")) - input_blob = model.net.AddExternalInputs("input_blob") - outputs.append( - prepare_mul_rnn(model, input_blob, input_tensor.shape, T, - outputs_with_grad, num_layers)) - workspace.RunNetOnce(model.param_init_net) - nets.append(model.net) - workspace.blobs[input_blob] = input_tensor - - gradient_checker.NetGradientChecker.CompareNets( - nets, outputs, outputs_with_grad_ids=outputs_with_grad, - inputs_with_grads=[input_blob], - ) - - @given( - input_tensor=hu.tensor(min_dim=3, max_dim=3, max_value=3), - forget_bias=st.floats(-10.0, 10.0), - drop_states=st.booleans(), - dim_out=st.lists( - elements=st.integers(min_value=1, max_value=3), - min_size=1, max_size=3, - ), - outputs_with_grads=st.sampled_from( - [[0], [1], [0, 1], [0, 2], [0, 1, 2, 3]] - ) - ) - @ht_settings(max_examples=10, deadline=None) - @utils.debug - def test_unroll_lstm(self, input_tensor, dim_out, outputs_with_grads, - **kwargs): - lstms = [ - _prepare_rnn( - *input_tensor.shape, - create_rnn=rnn_cell.LSTM, - outputs_with_grads=outputs_with_grads, - T=T, - two_d_initial_states=False, - dim_out=dim_out, - **kwargs - ) for T in [input_tensor.shape[0], None] - ] - outputs, nets, inputs = zip(*lstms) - workspace.FeedBlob(inputs[0][-1], input_tensor) - - assert inputs[0] == inputs[1] - gradient_checker.NetGradientChecker.CompareNets( - nets, outputs, outputs_with_grads, - inputs_with_grads=inputs[0], - ) - - @given( - input_tensor=hu.tensor(min_dim=3, max_dim=3, max_value=3), - encoder_length=st.integers(min_value=1, max_value=3), - encoder_dim=st.integers(min_value=1, max_value=3), - hidden_units=st.integers(min_value=1, max_value=3), - num_layers=st.integers(min_value=1, max_value=3), - residual=st.booleans(), - final_dropout=st.booleans(), - ) - @ht_settings(max_examples=10, deadline=None) - @utils.debug - def test_unroll_attention(self, input_tensor, encoder_length, - encoder_dim, hidden_units, - num_layers, residual, - final_dropout): - - dim_out = [hidden_units] * num_layers - encoder_tensor = np.random.random( - (encoder_length, input_tensor.shape[1], encoder_dim), - ).astype('float32') - - print('Decoder input shape: {}'.format(input_tensor.shape)) - print('Encoder output shape: {}'.format(encoder_tensor.shape)) - - # Necessary because otherwise test fails for networks with fewer - 
# layers than previous test. TODO: investigate why. - workspace.ResetWorkspace() - - net, unrolled = [ - _prepare_attention( - t=input_tensor.shape[0], - n=input_tensor.shape[1], - dim_in=input_tensor.shape[2], - encoder_dim=encoder_dim, - T=T, - dim_out=dim_out, - residual=residual, - final_dropout=final_dropout, - ) for T in [input_tensor.shape[0], None] - ] - - workspace.FeedBlob(net['input_blob'], input_tensor) - workspace.FeedBlob(net['encoder_outputs'], encoder_tensor) - workspace.FeedBlob( - net['weighted_encoder_outputs'], - np.random.random(encoder_tensor.shape).astype('float32'), - ) - - for input_name in [ - 'input_blob', - 'encoder_outputs', - 'weighted_encoder_outputs', - ]: - assert net[input_name] == unrolled[input_name] - for state_name, unrolled_state_name in zip( - net['initial_states'], - unrolled['initial_states'], - ): - assert state_name == unrolled_state_name - - inputs_with_grads = net['initial_states'] + [ - net['input_blob'], - net['encoder_outputs'], - net['weighted_encoder_outputs'], - ] - - gradient_checker.NetGradientChecker.CompareNets( - [net['net'], unrolled['net']], - [[net['final_output']], [unrolled['final_output']]], - [0], - inputs_with_grads=inputs_with_grads, - threshold=0.000001, - ) - - @given( - input_tensor=hu.tensor(min_dim=3, max_dim=3), - forget_bias=st.floats(-10.0, 10.0), - forward_only=st.booleans(), - drop_states=st.booleans(), - ) - @ht_settings(max_examples=10, deadline=None) - def test_layered_lstm(self, input_tensor, **kwargs): - for outputs_with_grads in [[0], [1], [0, 1, 2, 3]]: - for memory_optim in [False, True]: - _, net, inputs = _prepare_rnn( - *input_tensor.shape, - create_rnn=rnn_cell.LSTM, - outputs_with_grads=outputs_with_grads, - memory_optim=memory_optim, - **kwargs - ) - workspace.FeedBlob(inputs[-1], input_tensor) - workspace.RunNetOnce(net) - workspace.ResetWorkspace() - - def test_lstm(self): - self.lstm_base(lstm_type=(rnn_cell.LSTM, lstm_reference)) - - def test_milstm(self): - self.lstm_base(lstm_type=(rnn_cell.MILSTM, milstm_reference)) - - @unittest.skip("This is currently numerically unstable") - def test_norm_lstm(self): - self.lstm_base( - lstm_type=(rnn_cell.LayerNormLSTM, layer_norm_lstm_reference), - ) - - @unittest.skip("This is currently numerically unstable") - def test_norm_milstm(self): - self.lstm_base( - lstm_type=(rnn_cell.LayerNormMILSTM, layer_norm_milstm_reference) - ) - - @given( - seed=st.integers(0, 2**32 - 1), - input_tensor=lstm_input(), - forget_bias=st.floats(-10.0, 10.0), - fwd_only=st.booleans(), - drop_states=st.booleans(), - memory_optim=st.booleans(), - outputs_with_grads=st.sampled_from([[0], [1], [0, 1, 2, 3]]), - ) - @ht_settings(max_examples=10, deadline=None) - def lstm_base(self, seed, lstm_type, outputs_with_grads, memory_optim, - input_tensor, forget_bias, fwd_only, drop_states): - np.random.seed(seed) - create_lstm, ref = lstm_type - ref = partial(ref, forget_bias=forget_bias) - - t, n, d = input_tensor.shape - assert d % 4 == 0 - d = d // 4 - ref = partial(ref, forget_bias=forget_bias, drop_states=drop_states) - - net = _prepare_rnn(t, n, d, create_lstm, - outputs_with_grads=outputs_with_grads, - memory_optim=memory_optim, - forget_bias=forget_bias, - forward_only=fwd_only, - drop_states=drop_states)[1] - # here we don't provide a real input for the net but just for one of - # its ops (RecurrentNetworkOp). 
So have to hardcode this name - workspace.FeedBlob("test_name_scope/external/recurrent/i2h", - input_tensor) - op = net._net.op[-1] - inputs = [workspace.FetchBlob(name) for name in op.input] - - # Validate forward only mode is in effect - if fwd_only: - for arg in op.arg: - self.assertFalse(arg.name == 'backward_step_net') - - self.assertReferenceChecks( - hu.cpu_do, - op, - inputs, - ref, - outputs_to_check=list(range(4)), - ) - - # Checking for input, gates_t_w and gates_t_b gradients - if not fwd_only: - for param in range(5): - self.assertGradientChecks( - device_option=hu.cpu_do, - op=op, - inputs=inputs, - outputs_to_check=param, - outputs_with_grads=outputs_with_grads, - threshold=0.01, - stepsize=0.005, - ) - - def test_lstm_extract_predictor_net(self): - model = ModelHelper(name="lstm_extract_test") - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - output, _, _, _ = rnn_cell.LSTM( - model=model, - input_blob="input", - seq_lengths="seqlengths", - initial_states=("hidden_init", "cell_init"), - dim_in=20, - dim_out=40, - scope="test", - drop_states=True, - return_last_layer_only=True, - ) - # Run param init net to get the shapes for all inputs - shapes = {} - workspace.RunNetOnce(model.param_init_net) - for b in workspace.Blobs(): - shapes[b] = workspace.FetchBlob(b).shape - - # But export in CPU - (predict_net, export_blobs) = ExtractPredictorNet( - net_proto=model.net.Proto(), - input_blobs=["input"], - output_blobs=[output], - device=core.DeviceOption(caffe2_pb2.CPU, 1), - ) - - # Create the net and run once to see it is valid - # Populate external inputs with correctly shaped random input - # and also ensure that the export_blobs was constructed correctly. - workspace.ResetWorkspace() - shapes['input'] = [10, 4, 20] - shapes['cell_init'] = [1, 4, 40] - shapes['hidden_init'] = [1, 4, 40] - - print(predict_net.Proto().external_input) - self.assertTrue('seqlengths' in predict_net.Proto().external_input) - for einp in predict_net.Proto().external_input: - if einp == 'seqlengths': - workspace.FeedBlob( - "seqlengths", - np.array([10] * 4, dtype=np.int32) - ) - else: - workspace.FeedBlob( - einp, - np.zeros(shapes[einp]).astype(np.float32), - ) - if einp != 'input': - self.assertTrue(einp in export_blobs) - - print(str(predict_net.Proto())) - self.assertTrue(workspace.CreateNet(predict_net.Proto())) - self.assertTrue(workspace.RunNet(predict_net.Proto().name)) - - # Validate device options set correctly for the RNNs - for op in predict_net.Proto().op: - if op.type == 'RecurrentNetwork': - for arg in op.arg: - if arg.name == "step_net": - for step_op in arg.n.op: - self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.device_id) - elif arg.name == 'backward_step_net': - self.assertEqual(caffe2_pb2.NetDef(), arg.n) - - def test_lstm_params(self): - model = ModelHelper(name="lstm_params_test") - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - output, _, _, _ = rnn_cell.LSTM( - model=model, - input_blob="input", - seq_lengths="seqlengths", - initial_states=None, - dim_in=20, - dim_out=40, - scope="test", - drop_states=True, - return_last_layer_only=True, - ) - for param in model.GetParams(): - self.assertNotEqual(model.get_param_info(param), None) - - def test_milstm_params(self): - model = ModelHelper(name="milstm_params_test") - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - output, _, _, _ = rnn_cell.MILSTM( - model=model, - input_blob="input", - seq_lengths="seqlengths", - 
initial_states=None, - dim_in=20, - dim_out=[40, 20], - scope="test", - drop_states=True, - return_last_layer_only=True, - ) - for param in model.GetParams(): - self.assertNotEqual(model.get_param_info(param), None) - - def test_layer_norm_lstm_params(self): - model = ModelHelper(name="layer_norm_lstm_params_test") - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): - output, _, _, _ = rnn_cell.LayerNormLSTM( - model=model, - input_blob="input", - seq_lengths="seqlengths", - initial_states=None, - dim_in=20, - dim_out=40, - scope="test", - drop_states=True, - return_last_layer_only=True, - ) - for param in model.GetParams(): - self.assertNotEqual(model.get_param_info(param), None) - - @given(encoder_output_length=st.integers(1, 3), - encoder_output_dim=st.integers(1, 3), - decoder_input_length=st.integers(1, 3), - decoder_state_dim=st.integers(1, 3), - batch_size=st.integers(1, 3), - **hu.gcs) - @ht_settings(max_examples=10, deadline=None) - def test_lstm_with_regular_attention( - self, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - gc, - dc, - ): - self.lstm_with_attention( - partial( - rnn_cell.LSTMWithAttention, - attention_type=AttentionType.Regular, - ), - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - lstm_with_regular_attention_reference, - gc, - ) - - @given(encoder_output_length=st.integers(1, 3), - encoder_output_dim=st.integers(1, 3), - decoder_input_length=st.integers(1, 3), - decoder_state_dim=st.integers(1, 3), - batch_size=st.integers(1, 3), - **hu.gcs) - @ht_settings(max_examples=10, deadline=None) - def test_lstm_with_recurrent_attention( - self, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - gc, - dc, - ): - self.lstm_with_attention( - partial( - rnn_cell.LSTMWithAttention, - attention_type=AttentionType.Recurrent, - ), - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - lstm_with_recurrent_attention_reference, - gc, - ) - - @given(encoder_output_length=st.integers(2, 2), - encoder_output_dim=st.integers(4, 4), - decoder_input_length=st.integers(3, 3), - decoder_state_dim=st.integers(4, 4), - batch_size=st.integers(5, 5), - **hu.gcs) - @ht_settings(max_examples=2, deadline=None) - def test_lstm_with_dot_attention_same_dim( - self, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - gc, - dc, - ): - self.lstm_with_attention( - partial( - rnn_cell.LSTMWithAttention, - attention_type=AttentionType.Dot, - ), - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - lstm_with_dot_attention_reference_same_dim, - gc, - ) - - @given(encoder_output_length=st.integers(1, 3), - encoder_output_dim=st.integers(4, 4), - decoder_input_length=st.integers(1, 3), - decoder_state_dim=st.integers(5, 5), - batch_size=st.integers(1, 3), - **hu.gcs) - @ht_settings(max_examples=2, deadline=None) - def test_lstm_with_dot_attention_different_dim( - self, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - gc, - dc, - ): - self.lstm_with_attention( - partial( - rnn_cell.LSTMWithAttention, - attention_type=AttentionType.Dot, - ), - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - lstm_with_dot_attention_reference_different_dim, - gc, - ) - 
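The two dot-attention cases above differ only in whether the decoder state dimension matches the encoder output dimension (4 vs. 4, then 5 vs. 4); when they differ, the decoder state has to be projected into the encoder space before the dot product. A minimal NumPy sketch of that scoring step, with illustrative names and shapes rather than the reference implementations these tests actually use:

    import numpy as np

    def dot_attention_sketch(encoder_outputs, decoder_h, proj=None):
        # encoder_outputs: (T_enc, N, D_enc); decoder_h: (N, D_dec).
        # When D_dec != D_enc, a learned projection (here a plain matrix)
        # maps the decoder state into the encoder dimension first.
        if proj is not None:
            decoder_h = decoder_h @ proj                    # (N, D_enc)
        scores = np.einsum('tnd,nd->tn', encoder_outputs, decoder_h)
        weights = np.exp(scores - scores.max(axis=0))
        weights /= weights.sum(axis=0)                      # softmax over T_enc
        context = np.einsum('tn,tnd->nd', weights, encoder_outputs)
        return weights, context

    enc = np.random.randn(2, 5, 4).astype(np.float32)       # T_enc=2, N=5, D_enc=4
    h = np.random.randn(5, 5).astype(np.float32)            # D_dec=5 != D_enc
    P = np.random.randn(5, 4).astype(np.float32)
    weights, context = dot_attention_sketch(enc, h, proj=P)
    assert context.shape == (5, 4)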
- @given(encoder_output_length=st.integers(2, 3), - encoder_output_dim=st.integers(1, 3), - decoder_input_length=st.integers(1, 3), - decoder_state_dim=st.integers(1, 3), - batch_size=st.integers(1, 3), - **hu.gcs) - @ht_settings(max_examples=5, deadline=None) - def test_lstm_with_coverage_attention( - self, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - gc, - dc, - ): - self.lstm_with_attention( - partial( - rnn_cell.LSTMWithAttention, - attention_type=AttentionType.SoftCoverage, - ), - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - lstm_with_coverage_attention_reference, - gc, - ) - - def lstm_with_attention( - self, - create_lstm_with_attention, - encoder_output_length, - encoder_output_dim, - decoder_input_length, - decoder_state_dim, - batch_size, - ref, - gc, - ): - model = ModelHelper(name='external') - with core.DeviceScope(gc): - ( - encoder_outputs, - decoder_inputs, - decoder_input_lengths, - initial_decoder_hidden_state, - initial_decoder_cell_state, - initial_attention_weighted_encoder_context, - ) = model.net.AddExternalInputs( - 'encoder_outputs', - 'decoder_inputs', - 'decoder_input_lengths', - 'initial_decoder_hidden_state', - 'initial_decoder_cell_state', - 'initial_attention_weighted_encoder_context', - ) - create_lstm_with_attention( - model=model, - decoder_inputs=decoder_inputs, - decoder_input_lengths=decoder_input_lengths, - initial_decoder_hidden_state=initial_decoder_hidden_state, - initial_decoder_cell_state=initial_decoder_cell_state, - initial_attention_weighted_encoder_context=( - initial_attention_weighted_encoder_context - ), - encoder_output_dim=encoder_output_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=None, - decoder_input_dim=decoder_state_dim, - decoder_state_dim=decoder_state_dim, - scope='external/LSTMWithAttention', - ) - op = model.net._net.op[-2] - workspace.RunNetOnce(model.param_init_net) - - # This is original decoder_inputs after linear layer - decoder_input_blob = op.input[0] - - workspace.FeedBlob( - decoder_input_blob, - np.random.randn( - decoder_input_length, - batch_size, - decoder_state_dim * 4, - ).astype(np.float32)) - workspace.FeedBlob( - 'external/LSTMWithAttention/encoder_outputs_transposed', - np.random.randn( - batch_size, - encoder_output_dim, - encoder_output_length, - ).astype(np.float32), - ) - workspace.FeedBlob( - 'external/LSTMWithAttention/weighted_encoder_outputs', - np.random.randn( - encoder_output_length, - batch_size, - encoder_output_dim, - ).astype(np.float32), - ) - workspace.FeedBlob( - 'external/LSTMWithAttention/coverage_weights', - np.random.randn( - encoder_output_length, - batch_size, - encoder_output_dim, - ).astype(np.float32), - ) - workspace.FeedBlob( - decoder_input_lengths, - np.random.randint( - 0, - decoder_input_length + 1, - size=(batch_size,) - ).astype(np.int32)) - workspace.FeedBlob( - initial_decoder_hidden_state, - np.random.randn(1, batch_size, decoder_state_dim).astype(np.float32) - ) - workspace.FeedBlob( - initial_decoder_cell_state, - np.random.randn(1, batch_size, decoder_state_dim).astype(np.float32) - ) - workspace.FeedBlob( - initial_attention_weighted_encoder_context, - np.random.randn( - 1, batch_size, encoder_output_dim).astype(np.float32) - ) - workspace.FeedBlob( - 'external/LSTMWithAttention/initial_coverage', - np.zeros((1, batch_size, encoder_output_length)).astype(np.float32), - ) - inputs = [workspace.FetchBlob(name) for name in op.input] - 
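With all operator inputs materialized, the check below compares the fused attention operator against the NumPy reference passed in as ref. Conceptually the harness does something like the following simplified sketch; the real assertReferenceChecks in hypothesis_test_util also handles device options, gradients, and thresholds:

    import numpy as np

    def reference_check_sketch(run_op, reference, inputs, outputs_to_check, atol=1e-4):
        # Run the real operator and the pure-NumPy reference on identical
        # inputs, then compare the requested outputs elementwise.
        op_outputs = run_op(*inputs)
        ref_outputs = reference(*inputs)
        for i in outputs_to_check:
            np.testing.assert_allclose(op_outputs[i], ref_outputs[i], atol=atol)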
self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=ref, - grad_reference=None, - output_to_grad=None, - outputs_to_check=list(range(6)), - ) - gradients_to_check = [ - index for (index, input_name) in enumerate(op.input) - if input_name != 'decoder_input_lengths' - ] - for param in gradients_to_check: - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=inputs, - outputs_to_check=param, - outputs_with_grads=[0, 4], - threshold=0.01, - stepsize=0.001, - ) - - @given(seed=st.integers(0, 2**32 - 1), - n=st.integers(1, 10), - d=st.integers(1, 10), - t=st.integers(1, 10), - dtype=st.sampled_from([np.float32, np.float16]), - use_sequence_lengths=st.booleans(), - **hu.gcs) - @ht_settings(max_examples=10, deadline=None) - def test_lstm_unit_recurrent_network( - self, seed, n, d, t, dtype, dc, use_sequence_lengths, gc): - np.random.seed(seed) - if dtype == np.float16: - # only supported with CUDA/HIP - assume(gc.device_type == workspace.GpuDeviceType) - dc = [do for do in dc if do.device_type == workspace.GpuDeviceType] - - if use_sequence_lengths: - op_inputs = ['hidden_t_prev', 'cell_t_prev', 'gates_t', - 'seq_lengths', 'timestep'] - else: - op_inputs = ['hidden_t_prev', 'cell_t_prev', 'gates_t', 'timestep'] - op = core.CreateOperator( - 'LSTMUnit', - op_inputs, - ['hidden_t', 'cell_t'], - sequence_lengths=use_sequence_lengths, - ) - cell_t_prev = np.random.randn(1, n, d).astype(dtype) - hidden_t_prev = np.random.randn(1, n, d).astype(dtype) - gates = np.random.randn(1, n, 4 * d).astype(dtype) - seq_lengths = np.random.randint(1, t + 1, size=(n,)).astype(np.int32) - timestep = np.random.randint(0, t, size=(1,)).astype(np.int32) - if use_sequence_lengths: - inputs = [hidden_t_prev, cell_t_prev, gates, seq_lengths, timestep] - else: - inputs = [hidden_t_prev, cell_t_prev, gates, timestep] - input_device_options = {'timestep': hu.cpu_do} - self.assertDeviceChecks( - dc, op, inputs, [0], - input_device_options=input_device_options) - - kwargs = {} - if dtype == np.float16: - kwargs['threshold'] = 1e-1 # default is 1e-4 - - def lstm_unit_reference(*args, **kwargs): - return lstm_unit(*args, sequence_lengths=use_sequence_lengths, **kwargs) - - self.assertReferenceChecks( - gc, op, inputs, lstm_unit_reference, - input_device_options=input_device_options, - **kwargs) - - kwargs = {} - if dtype == np.float16: - kwargs['threshold'] = 0.5 # default is 0.005 - - for i in range(2): - self.assertGradientChecks( - gc, op, inputs, i, [0, 1], - input_device_options=input_device_options, - **kwargs) - - @given(input_length=st.integers(2, 5), - dim_in=st.integers(1, 3), - max_num_units=st.integers(1, 3), - num_layers=st.integers(2, 3), - batch_size=st.integers(1, 3)) - @ht_settings(max_examples=10, deadline=None) - def test_multi_lstm( - self, - input_length, - dim_in, - max_num_units, - num_layers, - batch_size, - ): - model = ModelHelper(name='external') - ( - input_sequence, - seq_lengths, - ) = model.net.AddExternalInputs( - 'input_sequence', - 'seq_lengths', - ) - dim_out = [ - np.random.randint(1, max_num_units + 1) - for _ in range(num_layers) - ] - h_all, h_last, c_all, c_last = rnn_cell.LSTM( - model=model, - input_blob=input_sequence, - seq_lengths=seq_lengths, - initial_states=None, - dim_in=dim_in, - dim_out=dim_out, - # scope='test', - outputs_with_grads=(0,), - return_params=False, - memory_optimization=False, - forget_bias=0.0, - forward_only=False, - return_last_layer_only=True, - ) - - workspace.RunNetOnce(model.param_init_net) - - seq_lengths_val = 
np.random.randint( - 1, - input_length + 1, - size=(batch_size,), - ).astype(np.int32) - input_sequence_val = np.random.randn( - input_length, - batch_size, - dim_in, - ).astype(np.float32) - workspace.FeedBlob(seq_lengths, seq_lengths_val) - workspace.FeedBlob(input_sequence, input_sequence_val) - - hidden_input_list = [] - cell_input_list = [] - i2h_w_list = [] - i2h_b_list = [] - gates_w_list = [] - gates_b_list = [] - - for i in range(num_layers): - hidden_input_list.append( - workspace.FetchBlob( - 'layer_{}/initial_hidden_state'.format(i)), - ) - cell_input_list.append( - workspace.FetchBlob( - 'layer_{}/initial_cell_state'.format(i)), - ) - # Input projection for the first layer is produced outside - # of the cell and thus not scoped - prefix = 'layer_{}/'.format(i) if i > 0 else '' - i2h_w_list.append( - workspace.FetchBlob('{}i2h_w'.format(prefix)), - ) - i2h_b_list.append( - workspace.FetchBlob('{}i2h_b'.format(prefix)), - ) - gates_w_list.append( - workspace.FetchBlob('layer_{}/gates_t_w'.format(i)), - ) - gates_b_list.append( - workspace.FetchBlob('layer_{}/gates_t_b'.format(i)), - ) - - workspace.RunNetOnce(model.net) - h_all_calc = workspace.FetchBlob(h_all) - h_last_calc = workspace.FetchBlob(h_last) - c_all_calc = workspace.FetchBlob(c_all) - c_last_calc = workspace.FetchBlob(c_last) - - h_all_ref, h_last_ref, c_all_ref, c_last_ref = multi_lstm_reference( - input_sequence_val, - hidden_input_list, - cell_input_list, - i2h_w_list, - i2h_b_list, - gates_w_list, - gates_b_list, - seq_lengths_val, - forget_bias=0.0, - ) - - h_all_delta = np.abs(h_all_ref - h_all_calc).sum() - h_last_delta = np.abs(h_last_ref - h_last_calc).sum() - c_all_delta = np.abs(c_all_ref - c_all_calc).sum() - c_last_delta = np.abs(c_last_ref - c_last_calc).sum() - - self.assertAlmostEqual(h_all_delta, 0.0, places=5) - self.assertAlmostEqual(h_last_delta, 0.0, places=5) - self.assertAlmostEqual(c_all_delta, 0.0, places=5) - self.assertAlmostEqual(c_last_delta, 0.0, places=5) - - input_values = { - 'input_sequence': input_sequence_val, - 'seq_lengths': seq_lengths_val, - } - for param in model.GetParams(): - value = workspace.FetchBlob(param) - input_values[str(param)] = value - - output_sum = model.net.SumElements( - [h_all], - 'output_sum', - average=True, - ) - fake_loss = model.net.Tanh( - output_sum, - ) - for param in model.GetParams(): - gradient_checker.NetGradientChecker.Check( - model.net, - outputs_with_grad=[fake_loss], - input_values=input_values, - input_to_check=str(param), - print_net=False, - step_size=0.0001, - threshold=0.05, - ) - - -if __name__ == "__main__": - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - ]) - unittest.main() diff --git a/caffe2/python/operator_test/roi_align_rotated_op_test.py b/caffe2/python/operator_test/roi_align_rotated_op_test.py deleted file mode 100644 index fcbcb555440b..000000000000 --- a/caffe2/python/operator_test/roi_align_rotated_op_test.py +++ /dev/null @@ -1,210 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -import copy - - -class RoIAlignRotatedOp(hu.HypothesisTestCase): - def bbox_xywh_to_xyxy(self, boxes): - """ - Convert from [center_x center_y w h] format to [x1 y1 x2 y2].
- """ - w, h = boxes[:, 2], boxes[:, 3] - boxes[:, 0] -= w / 2.0 # x1 = center_x - width/2 - boxes[:, 1] -= h / 2.0 # y1 = center_y - height/2 - boxes[:, 2] = boxes[:, 0] + w # x2 = x1 + width - boxes[:, 3] = boxes[:, 1] + h # y2 = y1 + height - return boxes - - @given( - H=st.integers(min_value=50, max_value=100), - W=st.integers(min_value=50, max_value=100), - C=st.integers(min_value=1, max_value=3), - num_rois=st.integers(min_value=0, max_value=10), - pooled_size=st.sampled_from([7, 14]), - **hu.gcs - ) - def test_horizontal_rois(self, H, W, C, num_rois, pooled_size, gc, dc): - """ - Test that results match with RoIAlign when angle=0. - """ - X = np.random.randn(1, C, H, W).astype(np.float32) - R = np.zeros((num_rois, 6)).astype(np.float32) - angle = 0.0 - for i in range(num_rois): - x = np.random.uniform(1, W - 1) - y = np.random.uniform(1, H - 1) - w = np.random.uniform(1, min(x, W - x)) - h = np.random.uniform(1, min(y, H - y)) - R[i] = [0, x, y, w, h, angle] - - op = core.CreateOperator( - "RoIAlignRotated", - ["X", "R"], - ["Y"], - pooled_h=pooled_size, - pooled_w=pooled_size, - sampling_ratio=0, - ) - - def roialign_ref(X, R): - # Remove angle and convert from [center_x center_y w h] - # to [x1 y1 x2 y2] format. - R_ref = copy.deepcopy(R[:, 0:5]) - R_ref[:, 1:5] = self.bbox_xywh_to_xyxy(R_ref[:, 1:5]) - - ref_op = core.CreateOperator( - "RoIAlign", - ["X_ref", "R_ref"], - ["Y_ref"], - pooled_h=pooled_size, - pooled_w=pooled_size, - sampling_ratio=0, - ) - workspace.FeedBlob("X_ref", X) - workspace.FeedBlob("R_ref", R_ref) - workspace.RunOperatorOnce(ref_op) - return [workspace.FetchBlob("Y_ref")] - - self.assertReferenceChecks( - device_option=gc, op=op, inputs=[X, R], reference=roialign_ref - ) - if core.IsGPUDeviceType(gc.device_type): - self.assertGradientChecks(gc, op, [X, R], 0, [0]) - - @given( - H=st.integers(min_value=50, max_value=100), - W=st.integers(min_value=50, max_value=100), - C=st.integers(min_value=1, max_value=3), - num_rois=st.integers(min_value=0, max_value=10), - pooled_size=st.sampled_from([7, 14]), - angle=st.sampled_from([-270, -180, -90, 90, 180, 270]), - **hu.gcs - ) - def test_simple_rotations( - self, H, W, C, num_rois, pooled_size, angle, gc, dc - ): - """ - Test with right-angled rotations that don't need interpolation. 
- """ - X = np.random.randn(1, C, H, W).astype(np.float32) - R = np.zeros((num_rois, 6)).astype(np.float32) - for i in range(num_rois): - x = np.random.uniform(1, W - 1) - y = np.random.uniform(1, H - 1) - w = np.random.uniform(1, min(x, W - x, y, H - y)) - h = np.random.uniform(1, min(x, W - x, y, H - y)) - R[i] = [0, x, y, w, h, angle] - - op = core.CreateOperator( - "RoIAlignRotated", - ["X", "R"], - ["Y"], - pooled_h=pooled_size, - pooled_w=pooled_size, - sampling_ratio=0, - ) - - def roialign_rot90(m, k=1, axes=(0,1)): - axes = tuple(axes) - if len(axes) != 2: - raise ValueError("len(axes) must be 2.") - - m = np.asanyarray(m) - - if axes[0] == axes[1] or np.absolute(axes[0] - axes[1]) == m.ndim: - raise ValueError("Axes must be different.") - - if (axes[0] >= m.ndim or axes[0] < -m.ndim or - axes[1] >= m.ndim or axes[1] < -m.ndim): - raise ValueError( - "Axes={} out of range for array of ndim={}.".format(axes, m.ndim)) - - k %= 4 - - if k == 0: - return m[:] - if k == 2: - return roialign_flip(roialign_flip(m, axes[0]), axes[1]) - - axes_list = np.arange(0, m.ndim) - (axes_list[axes[0]], axes_list[axes[1]]) = (axes_list[axes[1]], - axes_list[axes[0]]) - - if k == 1: - return np.transpose(roialign_flip(m,axes[1]), axes_list) - else: - # k == 3 - return roialign_flip(np.transpose(m, axes_list), axes[1]) - - def roialign_flip(m, axis): - if not hasattr(m, 'ndim'): - m = np.asarray(m) - indexer = [slice(None)] * m.ndim - try: - indexer[axis] = slice(None, None, -1) - except IndexError as e: - raise ValueError("axis=%i is invalid for the %i-dimensional input array" - % (axis, m.ndim)) from e - return m[tuple(indexer)] - - def roialign_ref(X, R): - # `angle` denotes counter-clockwise rotation. Rotate the input - # feature map in the opposite (clockwise) direction and perform - # standard RoIAlign. We assume all RoIs have the same angle. - # - # Also note that we need to have our own version of np.rot90, - # since axes isn't an argument until 1.12.0 and doesn't exist - # on all tested platforms. - norm_angle = (angle + 360) % 360 - X_ref = roialign_rot90(X, k=-norm_angle / 90, axes=(2, 3)) - - # Rotate RoIs clockwise wrt the center of the input feature - # map to make them horizontal and convert from - # [center_x center_y w h] to [x1 y1 x2 y2] format. 
- roi_x, roi_y = R[:, 1], R[:, 2] - if norm_angle == 90: - new_roi_x = H - roi_y - 1 - new_roi_y = roi_x - elif norm_angle == 180: - new_roi_x = W - roi_x - 1 - new_roi_y = H - roi_y - 1 - elif norm_angle == 270: - new_roi_x = roi_y - new_roi_y = W - roi_x - 1 - else: - raise NotImplementedError - R_ref = copy.deepcopy(R[:, 0:5]) - R_ref[:, 1], R_ref[:, 2] = new_roi_x, new_roi_y - R_ref[:, 1:5] = self.bbox_xywh_to_xyxy(R_ref[:, 1:5]) - - ref_op = core.CreateOperator( - "RoIAlign", - ["X_ref", "R_ref"], - ["Y_ref"], - pooled_h=pooled_size, - pooled_w=pooled_size, - sampling_ratio=0, - ) - workspace.FeedBlob("X_ref", X_ref) - workspace.FeedBlob("R_ref", R_ref) - workspace.RunOperatorOnce(ref_op) - return [workspace.FetchBlob("Y_ref")] - - self.assertReferenceChecks( - device_option=gc, op=op, inputs=[X, R], reference=roialign_ref - ) - if core.IsGPUDeviceType(gc.device_type): - self.assertGradientChecks(gc, op, [X, R], 0, [0]) - - -if __name__ == '__main__': - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/rowwise_counter_test.py b/caffe2/python/operator_test/rowwise_counter_test.py deleted file mode 100644 index a9dacc5a6d86..000000000000 --- a/caffe2/python/operator_test/rowwise_counter_test.py +++ /dev/null @@ -1,69 +0,0 @@ - - -import unittest - -import caffe2.python.hypothesis_test_util as hu -import numpy as np -from caffe2.python import core, workspace - - -def update_counter_ref(prev_iter, update_counter, indices, curr_iter, counter_halflife): - prev_iter_out = prev_iter.copy() - update_counter_out = update_counter.copy() - - counter_neg_log_rho = np.log(2) / counter_halflife - for i in indices: - iter_diff = curr_iter[0] - prev_iter_out[i] - prev_iter_out[i] = curr_iter[0] - update_counter_out[i] = ( - 1.0 + np.exp(-iter_diff * counter_neg_log_rho) * update_counter_out[i] - ) - return prev_iter_out, update_counter_out - - -class TestRowWiseCounter(hu.HypothesisTestCase): - def test_rowwise_counter(self): - h = 8 * 20 - n = 5 - curr_iter = np.array([100], dtype=np.int64) - - update_counter = np.random.randint(99, size=h).astype(np.float64) - prev_iter = np.random.rand(h, 1).astype(np.int64) - indices = np.unique(np.random.randint(0, h, size=n)) - indices.sort(axis=0) - counter_halflife = 1 - - net = core.Net("test_net") - net.Proto().type = "dag" - - workspace.FeedBlob("indices", indices) - workspace.FeedBlob("curr_iter", curr_iter) - workspace.FeedBlob("update_counter", update_counter) - workspace.FeedBlob("prev_iter", prev_iter) - - net.RowWiseCounter( - ["prev_iter", "update_counter", "indices", "curr_iter"], - ["prev_iter", "update_counter"], - counter_halflife=counter_halflife, - ) - - workspace.RunNetOnce(net) - - prev_iter_out = workspace.FetchBlob("prev_iter") - update_counter_out = workspace.FetchBlob("update_counter") - - prev_iter_out_ref, update_counter_out_ref = update_counter_ref( - prev_iter, - update_counter, - indices, - curr_iter, - counter_halflife=counter_halflife, - ) - assert np.allclose(prev_iter_out, prev_iter_out_ref, rtol=1e-3) - assert np.allclose(update_counter_out, update_counter_out_ref, rtol=1e-3) - - -if __name__ == "__main__": - global_options = ["caffe2"] - core.GlobalInit(global_options) - unittest.main() diff --git a/caffe2/python/operator_test/scale_op_test.py b/caffe2/python/operator_test/scale_op_test.py deleted file mode 100644 index b4b53ddcfb88..000000000000 --- a/caffe2/python/operator_test/scale_op_test.py +++ /dev/null @@ -1,66 +0,0 @@ - - - - - -from caffe2.python import core, workspace - -import 
caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestScaleOps(serial.SerializedTestCase): - @serial.given(dim=st.sampled_from([[1, 386, 1], [386, 1, 1], - [1, 256, 1], [256, 1, 1], - [1024, 256, 1], [1, 1024, 1], - [1, 1, 1]]), - scale=st.floats(0.0, 10.0), - num_tensors=st.integers(1, 10), - **hu.gcs) - def test_scale_ops(self, dim, scale, num_tensors, gc, dc): - in_tensors = [] - in_tensor_ps = [] - out_tensors = [] - out_ref_tensors = [] - # initialize tensors - for i in range(num_tensors): - tensor = "X_{}".format(i) - X = np.random.rand(*dim).astype(np.float32) - 0.5 - in_tensors.append(tensor) - in_tensor_ps.append(X) - out_tensor = "O_{}".format(i) - out_tensors.append(out_tensor) - workspace.FeedBlob(tensor, X, device_option=gc) - - # run ScaleBlobs operator - scale_blobs_op = core.CreateOperator( - "ScaleBlobs", - in_tensors, - out_tensors, - scale=scale, - ) - scale_blobs_op.device_option.CopyFrom(gc) - workspace.RunOperatorOnce(scale_blobs_op) - - # run Scale op for each tensor and compare with ScaleBlobs - for i in range(num_tensors): - tensor = "X_{}".format(i) - out_ref_tensor = "O_ref_{}".format(i) - scale_op = core.CreateOperator( - "Scale", - [tensor], - [out_ref_tensor], - scale=scale, - ) - scale_op.device_option.CopyFrom(gc) - workspace.RunOperatorOnce(scale_op) - o_ref = workspace.FetchBlob(out_ref_tensor) - o = workspace.FetchBlob(out_tensors[i]) - np.testing.assert_allclose(o, o_ref) - -if __name__ == '__main__': - import unittest - - unittest.main() diff --git a/caffe2/python/operator_test/segment_ops_test.py b/caffe2/python/operator_test/segment_ops_test.py deleted file mode 100644 index f991a7dde211..000000000000 --- a/caffe2/python/operator_test/segment_ops_test.py +++ /dev/null @@ -1,775 +0,0 @@ - - - - - -from functools import partial -from hypothesis import given, settings - -import numpy as np -import unittest -import hypothesis.strategies as st - -from caffe2.python import core, workspace -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -def sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=False): - R = np.zeros(shape=(L.size,) + D.shape[1:], dtype=np.float32) - line = 0 - for g in range(L.size): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += D[I[line], :] - else: - R[g] += D[I[line]] - line += 1 - - if normalize_by_lengths and L[g] > 1: - if len(D.shape) > 1: - R[g, :] = R[g, :] / L[g] - else: - R[g] = R[g] / L[g] - - return [R] - -def sparse_lengths_mean_ref(D, I, L): - return sparse_lengths_sum_ref(D, I, L, normalize_by_lengths=True) - - -class TesterBase: - def segment_reduce_op(self, data, segment_ids, reducer, indices=None): - segments = self.split(data, segment_ids, indices) - output = np.zeros((len(segments), ) + data.shape[1:]) - for i, segment in enumerate(segments): - if len(segment) > 0: - output[i] = reducer(segment) - else: - output[i] = 0.0 - return output - - def segment_reduce_grad_op( - self, - data, - segment_ids, - reducer_grad, - grad_out, - output, - indices=None - ): - segments = self.split(data, segment_ids, indices) - segment_grads = [ - reducer_grad(grad_out[i], [output[i]], [segment]) - for i, segment in enumerate(segments) - ] - return self.unsplit(data.shape[1:], segment_grads, segment_ids) - - def _test(self, prefix, input_strategy, refs, gpu=False, **kwargs): - tester = self - operator_args = 
kwargs.pop('operator_args', {}) - threshold = kwargs.pop('threshold', 1e-4) - grad_check = kwargs.pop('grad_check', True) - - @given(X=input_strategy, **hu.gcs) - def test_segment_ops(self, X, gc, dc): - if not gpu and gc.device_type > 0: - return - for op_name, ref, grad_ref in refs: - inputs = ['input%d' % i for i in range(0, len(X))] - op = core.CreateOperator( - prefix + op_name, inputs, ['output'], **operator_args - ) - print('Operator %s, ' % op.type, gc.device_type) - - def seg_reduce(data, *args): - indices, segments = ( - args if len(args) == 2 else (None, args[0]) - ) - out = tester.segment_reduce_op( - data=data, - segment_ids=segments, - indices=indices, - reducer=ref - ) - return (out, ) - - def seg_reduce_grad(grad_out, outputs, inputs): - data = inputs[0] - args = inputs[1:] - indices, segments = ( - args if len(args) == 2 else (None, args[0]) - ) - # grad r.t. data - grad_val = tester.segment_reduce_grad_op( - data, segments, grad_ref, grad_out, outputs[0], indices - ) - # if sparse, include indices along with data gradient - data_grad_slice = ( - (grad_val, indices) if indices is not None else grad_val - ) - # other inputs don't have gradient - return (data_grad_slice, ) + (None, ) * (len(inputs) - 1) - - kwargs = {} - if grad_check: - kwargs['output_to_grad'] = 'output' - kwargs['grad_reference'] = seg_reduce_grad - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=X, - reference=seg_reduce, - threshold=threshold, - **kwargs - ) - return test_segment_ops - - -class SegmentsTester(TesterBase): - def split(self, data, segment_ids, indices=None): - """ - Given: - data[M1 x M2 x ... x Md] - the input data - indices[N] the index of each entry of segment_ids into data, - where 0 <= index[i] < M1, - with default indices=[0,1,...N] - segment_ids[N] the segment_id for each entry of indices, - - returns K outputs, each one containing data entries corresponding - to one of the segments present in `segment_ids`. 
- """ - if segment_ids.size == 0: - return [] - K = max(segment_ids) + 1 - outputs = [ - np.zeros( - (np.count_nonzero(segment_ids == seg_id), ) + data.shape[1:], - dtype=data.dtype - ) for seg_id in range(0, K) - ] - counts = np.zeros(K, dtype=int) - for i, seg_id in enumerate(segment_ids): - data_idx = i if indices is None else indices[i] - outputs[seg_id][counts[seg_id]] = data[data_idx] - counts[seg_id] += 1 - return outputs - - def unsplit(self, extra_shape, inputs, segment_ids): - """ Inverse operation to `split`, with indices=None """ - output = np.zeros((len(segment_ids), ) + extra_shape) - if len(segment_ids) == 0: - return output - K = max(segment_ids) + 1 - counts = np.zeros(K, dtype=int) - for i, seg_id in enumerate(segment_ids): - output[i] = inputs[seg_id][counts[seg_id]] - counts[seg_id] += 1 - return output - - -class LengthsTester(TesterBase): - def split(self, data, lengths, indices=None): - K = len(lengths) - outputs = [ - np.zeros((lengths[seg_id], ) + data.shape[1:], - dtype=data.dtype) for seg_id in range(0, K) - ] - start = 0 - for i in range(0, K): - for j in range(0, lengths[i]): - data_index = start + j - if indices is not None: - data_index = indices[data_index] - outputs[i][j] = data[data_index] - start += lengths[i] - return outputs - - def unsplit(self, extra_shape, inputs, lengths): - N = sum(lengths) - output = np.zeros((N, ) + extra_shape) - K = len(lengths) - assert len(inputs) == K - current = 0 - for i in range(0, K): - for j in range(0, lengths[i]): - output[current] = inputs[i][j] - current += 1 - return output - - -def sum_grad(grad_out, outputs, inputs): - return np.repeat( - np.expand_dims(grad_out, axis=0), - inputs[0].shape[0], - axis=0 - ) - - -def logsumexp(x): - return np.log(np.sum(np.exp(x), axis=0)) - - -def logsumexp_grad(grad_out, outputs, inputs): - sum_exps = np.sum(np.exp(inputs[0]), axis=0) - return np.repeat( - np.expand_dims(grad_out / sum_exps, 0), - inputs[0].shape[0], - axis=0 - ) * np.exp(inputs[0]) - - -def logmeanexp(x): - return np.log(np.mean(np.exp(x), axis=0)) - - -def mean(x): - return np.mean(x, axis=0) - - -def mean_grad(grad_out, outputs, inputs): - return np.repeat( - np.expand_dims(grad_out / inputs[0].shape[0], 0), - inputs[0].shape[0], - axis=0 - ) - - -def max_fwd(x): - return np.amax(x, axis=0) - - -def max_grad(grad_out, outputs, inputs): - flat_inputs = inputs[0].flatten() - flat_outputs = np.array(outputs[0]).flatten() - flat_grad_in = np.zeros(flat_inputs.shape) - flat_grad_out = np.array(grad_out).flatten() - blocks = inputs[0].shape[0] - if blocks == 0: - return np.zeros(inputs[0].shape) - block_size = flat_inputs.shape[0] // blocks - - for i in range(block_size): - out_grad = flat_grad_out[i] - out = flat_outputs[i] - for j in range(blocks): - idx = j * block_size + i - # we can produce multiple outputs for max - if out == flat_inputs[idx]: - flat_grad_in[idx] = out_grad - - return np.resize(flat_grad_in, inputs[0].shape) - - -REFERENCES_ALL = [ - ('Sum', partial(np.sum, axis=0), sum_grad), - ('Mean', partial(np.mean, axis=0), mean_grad), -] - -REFERENCES_SORTED = [ - ('RangeSum', partial(np.sum, axis=0), sum_grad), - ('RangeLogSumExp', logsumexp, logsumexp_grad), - # gradient is the same as sum - ('RangeLogMeanExp', logmeanexp, logsumexp_grad), - ('RangeMean', mean, mean_grad), - ('RangeMax', max_fwd, max_grad), -] - -REFERENCES_LENGTHS_ONLY = [ - ('Max', partial(np.amax, axis=0), max_grad), -] - - -def sparse_lengths_weighted_sum_ref(D, W, I, L): - R = np.zeros(shape=(len(L), ) + D.shape[1:], 
dtype=D.dtype) - line = 0 - for g in range(len(L)): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += W[line] * D[I[line], :] - else: - R[g] += W[line] * D[I[line]] - line += 1 - return [R] - - -def sparse_lengths_weighted_sum_grad_ref( - GO, fwd_out, fwd_in, grad_on_weights=False): - D, W, I, L = fwd_in - GI = np.zeros(shape=(len(I), ) + D.shape[1:], dtype=D.dtype) - GW = np.zeros(shape=W.shape, dtype=W.dtype) if grad_on_weights else None - line = 0 - for g in range(len(L)): - for _ in range(L[g]): - if len(GO.shape) > 1: - GI[line, :] = W[line] * GO[g, :] - else: - GI[line] = W[line] * GO[g] - if GW is not None: - if len(GO.shape) > 1: - GW[line] = np.dot(GO[g].flatten(), D[I[line], :].flatten()) - else: - GW[line] = np.dot(GO[g].flatten(), D[I[line]].flatten()) - line += 1 - print(GW) - return [(GI, I), GW, None, None] - - -class TestSegmentOps(hu.HypothesisTestCase): - def test_sorted_segment_ops(self): - SegmentsTester()._test( - 'SortedSegment', - hu.segmented_tensor( - dtype=np.float32, - is_sorted=True, - allow_empty=True - ), - REFERENCES_ALL + REFERENCES_SORTED - )(self) - - def test_unsorted_segment_ops(self): - SegmentsTester()._test( - 'UnsortedSegment', - hu.segmented_tensor( - dtype=np.float32, - is_sorted=False, - allow_empty=True - ), - REFERENCES_ALL, - )(self) - - def test_unsorted_segment_ops_gpu(self): - SegmentsTester()._test( - 'UnsortedSegment', - hu.segmented_tensor( - dtype=np.float32, - is_sorted=False, - allow_empty=True, - ), - REFERENCES_ALL, - gpu=workspace.has_gpu_support, - grad_check=False, - )(self) - - def test_sparse_sorted_segment_ops(self): - SegmentsTester()._test( - 'SparseSortedSegment', - hu.sparse_segmented_tensor( - dtype=np.float32, - is_sorted=True, - allow_empty=True - ), - REFERENCES_ALL - )(self) - - def test_sparse_unsorted_segment_ops(self): - SegmentsTester()._test( - 'SparseUnsortedSegment', - hu.sparse_segmented_tensor( - dtype=np.float32, - is_sorted=False, - allow_empty=True - ), - REFERENCES_ALL - )(self) - - def test_lengths_ops(self): - LengthsTester()._test( - 'Lengths', - hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True - ), - REFERENCES_ALL + REFERENCES_LENGTHS_ONLY, - )(self) - - def test_sparse_lengths_ops(self): - for itype in [np.int32, np.int64]: - LengthsTester()._test( - 'SparseLengths', - hu.sparse_lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - itype=itype, - ), - REFERENCES_ALL, - )(self) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs) - def test_unsorted_sums_large(self, gc, dc): - X = np.random.rand(10000, 32, 12).astype(np.float32) - segments = np.random.randint(0, 10000, size=10000).astype(np.int32) - op = core.CreateOperator("UnsortedSegmentSum", ["X", "segments"], "out") - self.assertDeviceChecks(dc, op, [X, segments], [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs) - def test_sorted_segment_range_mean(self, gc, dc): - X = np.random.rand(6, 32, 12).astype(np.float32) - segments = np.array([0, 0, 1, 1, 2, 3]).astype(np.int32) - op = core.CreateOperator( - "SortedSegmentRangeMean", - ["X", "segments"], - "out" - ) - self.assertDeviceChecks(dc, op, [X, segments], [0]) - self.assertGradientChecks(gc, op, [X, segments], 0, [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs) - def test_sorted_segment_range_log_mean_exp(self, gc, dc): - X = np.random.rand(7, 32, 12).astype(np.float32) - segments = 
np.array([0, 0, 1, 1, 2, 2, 3]).astype(np.int32) - op = core.CreateOperator( - "SortedSegmentRangeLogMeanExp", - ["X", "segments"], - "out" - ) - self.assertDeviceChecks(dc, op, [X, segments], [0]) - self.assertGradientChecks(gc, op, [X, segments], 0, [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs) - def test_unsorted_means_large(self, gc, dc): - X = np.random.rand(10000, 31, 19).astype(np.float32) - segments = np.random.randint(0, 10000, size=10000).astype(np.int32) - op = core.CreateOperator("UnsortedSegmentMean", ["X", "segments"], "out") - self.assertDeviceChecks(dc, op, [X, segments], [0]) - - @serial.given( - inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - ), - **hu.gcs - ) - def test_lengths_sum(self, inputs, gc, dc): - X, Y = inputs - op = core.CreateOperator("LengthsSum", ["X", "Y"], "out") - - def ref(D, L): - R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype) - line = 0 - for g in range(L.size): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += D[line, :] - else: - R[g] += D[line] - line += 1 - return [R] - - self.assertReferenceChecks(gc, op, [X, Y], ref) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - - @serial.given( - inputs=hu.sparse_lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True - ), - **hu.gcs - ) - def test_sparse_lengths_sum(self, inputs, gc, dc): - X, Y, Z = inputs - op = core.CreateOperator("SparseLengthsSum", ["X", "Y", "Z"], "out") - - def ref(D, I, L): - R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype) - line = 0 - for g in range(L.size): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += D[I[line], :] - else: - R[g] += D[I[line]] - line += 1 - return [R] - - self.assertReferenceChecks(gc, op, [X, Y, Z], ref) - self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) - self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0]) - - @serial.given( - inputs=hu.lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True, - ), - **hu.gcs - ) - def test_lengths_mean(self, inputs, gc, dc): - X, Y = inputs - op = core.CreateOperator("LengthsMean", ["X", "Y"], "out") - - def ref(D, L): - R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype) - line = 0 - for g in range(L.size): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += D[line, :] - else: - R[g] += D[line] - line += 1 - if L[g] > 1: - if len(D.shape) > 1: - R[g, :] = R[g, :] / L[g] - else: - R[g] = R[g] / L[g] - - return [R] - - self.assertReferenceChecks(gc, op, [X, Y], ref) - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertGradientChecks(gc, op, [X, Y], 0, [0]) - - @serial.given( - inputs=hu.sparse_lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True - ), - **hu.gcs - ) - def test_sparse_lengths_mean(self, inputs, gc, dc): - X, Y, Z = inputs - op = core.CreateOperator("SparseLengthsMean", ["X", "Y", "Z"], "out") - - def ref(D, I, L): - R = np.zeros(shape=(L.size, ) + D.shape[1:], dtype=D.dtype) - line = 0 - for g in range(L.size): - for _ in range(L[g]): - if len(D.shape) > 1: - R[g, :] += D[I[line], :] - else: - R[g] += D[I[line]] - line += 1 - - if L[g] > 1: - if len(D.shape) > 1: - R[g, :] = R[g, :] / L[g] - else: - R[g] = R[g] / L[g] - - return [R] - - self.assertReferenceChecks(gc, op, [X, Y, Z], ref) - self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) - self.assertGradientChecks(gc, op, [X, Y, Z], 0, [0]) - - @serial.given( 
- grad_on_weights=st.booleans(), - inputs=hu.sparse_lengths_tensor( - dtype=np.float32, - min_value=1, - max_value=5, - allow_empty=True - ), - seed=st.integers(min_value=0, max_value=100), - **hu.gcs - ) - def test_sparse_lengths_weighted_sum( - self, grad_on_weights, inputs, seed, gc, dc): - D, I, L = inputs - - np.random.seed(int(seed)) - - W = np.random.rand(I.size).astype(np.float32) - op = core.CreateOperator( - "SparseLengthsWeightedSum", - ["D", "W", "I", "L"], - "out", - grad_on_weights=grad_on_weights) - self.assertDeviceChecks(dc, op, [D, W, I, L], [0]) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[D, W, I, L], - reference=sparse_lengths_weighted_sum_ref, - threshold=1e-4, - output_to_grad='out', - grad_reference=partial( - sparse_lengths_weighted_sum_grad_ref, - grad_on_weights=grad_on_weights), - ) - self.assertGradientChecks(gc, op, [D, W, I, L], 0, [0]) - if grad_on_weights: - self.assertGradientChecks(gc, op, [D, W, I, L], 1, [0]) - - @given(**hu.gcs) - def test_sparse_lengths_indices_in_gradient_sum_gpu(self, gc, dc): - X = np.random.rand(3, 3, 4, 5).astype(np.float32) - Y = np.asarray([3, 3, 2]).astype(np.int32) - Z = np.random.randint(0, 50, size=8).astype(np.int64) - op = core.CreateOperator( - "SparseLengthsIndicesInGradientSumGradient", ["X", "Y", "Z"], "out" - ) - self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) - - @given(**hu.gcs) - def test_sparse_lengths_indices_in_gradient_mean_gpu(self, gc, dc): - X = np.random.rand(3, 3, 4, 5).astype(np.float32) - Y = np.asarray([3, 3, 2]).astype(np.int32) - Z = np.random.randint(0, 50, size=8).astype(np.int64) - op = core.CreateOperator( - "SparseLengthsIndicesInGradientMeanGradient", ["X", "Y", "Z"], "out" - ) - self.assertDeviceChecks(dc, op, [X, Y, Z], [0]) - - @given(**hu.gcs_cpu_only) - def test_legacy_sparse_and_lengths_sum_gradient(self, gc, dc): - X = np.random.rand(3, 64).astype(np.float32) - Y = np.asarray([20, 20, 10]).astype(np.int32) - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - test_net = core.Net("test_net") - test_net.SparseLengthsSumGradient(["X", "Y"], "out1") - test_net.LengthsSumGradient(["X", "Y"], "out2") - workspace.RunNetOnce(test_net) - out1 = workspace.FetchBlob("out1") - out2 = workspace.FetchBlob("out2") - self.assertTrue((out1 == out2).all()) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_sparse_lengths_sum_invalid_index(self, gc, dc): - D = np.random.rand(50, 3, 4, 5).astype(np.float32) - I = (np.random.randint(0, 10000, size=10) + 10000).astype(np.int64) - L = np.asarray([4, 4, 2]).astype(np.int32) - op = core.CreateOperator( - "SparseLengthsSum", - ["D", "I", "L"], - "out") - workspace.FeedBlob('D', D) - workspace.FeedBlob('I', I) - workspace.FeedBlob('L', L) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - @serial.given(**hu.gcs_cpu_only) - def test_sparse_lengths_positional_weighted_sum( - self, gc, dc): - D = np.random.rand(50, 3, 4, 5).astype(np.float32) - W = np.random.rand(50).astype(np.float32) - indices = np.random.randint(0, 50, size=10).astype(np.int64) - L = np.asarray([4, 4, 2]).astype(np.int32) - op = core.CreateOperator( - "SparseLengthsPositionalWeightedSum", - ["D", "W", "indices", "L"], - "out") - - def ref_sparse(D, W, indices, L): - workspace.FeedBlob("L", L) - lengths_range_fill_op = core.CreateOperator( - "LengthsRangeFill", ["L"], ["L_pos_seq"]) - workspace.RunOperatorOnce(lengths_range_fill_op) - - workspace.FeedBlob("W", W) - gather_op = core.CreateOperator( - "Gather", ["W", "L_pos_seq"], 
["W_gathered"]) - workspace.RunOperatorOnce(gather_op) - - workspace.FeedBlob("D", D) - workspace.FeedBlob("indices", indices) - sparse_op = core.CreateOperator( - "SparseLengthsWeightedSum", - ["D", "W_gathered", "indices", "L"], - "out_ref") - workspace.RunOperatorOnce(sparse_op) - - return (workspace.FetchBlob("out_ref"),) - - self.assertReferenceChecks( - gc, op, [D, W, indices, L], ref_sparse) - - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") - @given( - input=hu.tensor(min_dim=2, max_dim=2, max_value=20, dtype=np.float16), - data_strategy=st.data(), - is_mean=st.booleans(), - **hu.gcs - ) - @settings(deadline=None) - def test_sparse_lengths_fp16(self, input, data_strategy, is_mean, gc, dc): - m = input.shape[0] - - lengths = data_strategy.draw( - hu.tensor( - max_dim=1, - max_value=input.shape[0], - dtype=np.int32, - elements=st.integers(min_value=0, max_value=27), - ) - ) - lengths_sum = int(np.sum(lengths).item()) - - indices = data_strategy.draw( - hu.arrays( - [lengths_sum], dtype=np.int64, elements=st.sampled_from(np.arange(m)) - ) - ) - if is_mean: - op = core.CreateOperator( - "SparseLengthsMean", ["input", "indices", "lengths"], "out" - ) - self.assertReferenceChecks(gc, op, [input, indices, lengths], sparse_lengths_mean_ref) - - else: - op = core.CreateOperator( - "SparseLengthsSum", ["input", "indices", "lengths"], "out" - ) - self.assertReferenceChecks(gc, op, [input, indices, lengths], sparse_lengths_sum_ref) - - # @given( - # inputs=hu.lengths_tensor( - # dtype=np.float32, - # min_value=1, - # max_value=5, - # min_dim=1, - # max_dim=1, - # allow_empty=False, - # ), - # **hu.gcs - # ) - # def test_lengths_max_gpu(self, inputs, gc, dc): - # def lengths_max_ref(I, L): - # R = np.zeros(shape=(len(L)), dtype=I.dtype) - # line = 0 - # for g in range(len(L)): - # for i in range(L[g]): - # if i == 0: - # R[g] = I[line] - # else: - # R[g] = max(R[g], I[line]) - # line += 1 - # return [R] - - # X, lengths = inputs - # op = core.CreateOperator("LengthsMax", ["X", "lengths"], "out") - # self.assertDeviceChecks(dc, op, [X, lengths], [0]) - # self.assertReferenceChecks( - # device_option=gc, - # op=op, - # inputs=[X, lengths], - # reference=lengths_max_ref, - # threshold=1e-4, - # output_to_grad='out', - # ) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/self_binning_histogram_test.py b/caffe2/python/operator_test/self_binning_histogram_test.py deleted file mode 100644 index c4cd849f407f..000000000000 --- a/caffe2/python/operator_test/self_binning_histogram_test.py +++ /dev/null @@ -1,300 +0,0 @@ -import unittest - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core, workspace -from hypothesis import given, settings - - -class TestSelfBinningHistogramBase: - def __init__(self, bin_spacing, dtype, abs=False): - self.bin_spacing = bin_spacing - self.dtype = dtype - self.abs = abs - - def _check_histogram(self, arrays, num_bins, expected_values=None, expected_counts=None): - # Check that sizes match and counts add up. 
- values = workspace.FetchBlob("histogram_values") - counts = workspace.FetchBlob("histogram_counts") - self.assertTrue(np.size(values) == num_bins) - self.assertTrue(np.size(counts) == num_bins) - self.assertTrue(np.sum(counts) == sum([np.size(array) for array in arrays])) - - # Check counts - if expected_counts is None: - # Check that counts are correct for the returned values if expected_counts is not given. - expected_counts = np.zeros(num_bins, dtype='i') - for array in arrays: - for input_val in array: - input_val = abs(input_val) if self.abs else input_val - found = False - for pos in range(np.size(values)): - if values[pos] > input_val: - found = True - break - self.assertTrue(found, f"input value must fit inside values array: " - f"input={input_val}, last_value={values[-1]}") - if self.bin_spacing == "linear": - self.assertTrue(pos > 0, - f"input should not be smaller than the first bin value: " - f"input={input_val}, 1st bin value={values[pos]}") - if pos == 0: - self.assertEqual(self.bin_spacing, "logarithmic") - expected_counts[pos] += 1 - else: - expected_counts[pos - 1] += 1 - self.assertTrue(np.array_equal(expected_counts, counts), f"expected:{expected_counts}\ncounts:{counts}") - # Check values - if expected_values is not None: - self.assertTrue(np.allclose(expected_values, values, rtol=1e-02, atol=1e-05), - f"expected:{expected_values}\nvalues:{values}") - # Ideally, the output values are sorted in a non-decreasing order. - for idx in range(len(values) - 1): - self.assertTrue(values[idx] <= values[idx + 1]) - if self.abs: - self.assertTrue(values[0] >= 0) - - - def _run_single_op_net(self, arrays, num_bins, logspacing_start=None): - for i in range(len(arrays)): - workspace.FeedBlob( - "X{}".format(i), arrays[i] - ) - net = core.Net("test_net") - if logspacing_start is not None: - net.SelfBinningHistogram( - ["X{}".format(i) for i in range(len(arrays))], - ["histogram_values", "histogram_counts"], - num_bins=num_bins, - bin_spacing=self.bin_spacing, - logspacing_start=logspacing_start, - abs=self.abs - ) - else: - net.SelfBinningHistogram( - ["X{}".format(i) for i in range(len(arrays))], - ["histogram_values", "histogram_counts"], - num_bins=num_bins, - bin_spacing=self.bin_spacing, - abs=self.abs - ) - workspace.RunNetOnce(net) - - @given(rows=st.integers(1, 1000), cols=st.integers(1, 1000), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_histogram_device_consistency(self, rows, cols, gc, dc): - X = np.random.rand(rows, cols) - op = core.CreateOperator( - "SelfBinningHistogram", - ["X"], - ["histogram_values", "histogram_counts"], - num_bins=1000, - bin_spacing=self.bin_spacing, - ) - self.assertDeviceChecks(dc, op, [X], [0]) - - def test_histogram_bin_to_fewer(self): - X = np.array([-2.0, -2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 9.0], dtype=self.dtype) - if self.bin_spacing == 'linear': - if not self.abs: - expected_values = [-2., 0.2, 2.4, 4.6, 6.8, 9.] - expected_counts = [5, 2, 2, 1, 1, 0] - else: - expected_values = [0., 1.8, 3.6, 5.4, 7.2, 9.] - expected_counts = [4, 4, 1, 1, 1, 0] - else: - expected_values = [1.e-24, 9.8e-20, 9.6e-15, 9.4e-10, 9.2e-05, 9.] 
-            if not self.abs:
-                expected_counts = [5, 0, 0, 0, 6, 0]
-            else:
-                expected_counts = [3, 0, 0, 0, 8, 0]
-        self._run_single_op_net([X], 5)
-        self._check_histogram(
-            [X],
-            6,
-            expected_values=expected_values,
-            expected_counts=expected_counts
-        )
-
-    def test_histogram_bin_to_more(self):
-        X = np.array([-2.0, -2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 9.0], dtype=self.dtype)
-        self._run_single_op_net([X], 100)
-        self._check_histogram(
-            [X],
-            101,
-        )
-
-    def test_histogram_bin_to_two(self):
-        """This test roughly tests the [min, max + EPSILON] edge range and an [N, 0] count split."""
-        X = np.array([-2.0, -2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 9.0], dtype=self.dtype)
-        if self.bin_spacing == 'linear':
-            if not self.abs:
-                expected_values = [-2., 9.]
-            else:
-                expected_values = [0., 9.]
-        else:
-            expected_values = [1.e-24, 9.]
-        expected_counts = [11, 0]
-        self._run_single_op_net([X], 1)
-        self._check_histogram(
-            [X],
-            2,
-            expected_values=expected_values,
-            expected_counts=expected_counts
-        )
-
-    def test_histogram_min_max_equal(self):
-        """This test uses an exact value match, so it is only relevant for the float type."""
-        X = np.array([0., 0., 0., 0., 0.], dtype='f')
-        logspacing_start = np.float64(1e-24)
-        self._run_single_op_net([X], 3, logspacing_start)
-        if self.bin_spacing == "linear":
-            self._check_histogram(
-                [X],
-                4,
-                expected_values=np.array([0., 0., 0., 0.], dtype='f'),
-                expected_counts=[5, 0, 0, 0]
-            )
-        else:
-            self.assertEqual(self.bin_spacing, "logarithmic")
-            self._check_histogram(
-                [X],
-                4,
-                expected_values=np.array([logspacing_start] * 4, dtype='f'),
-                expected_counts=[5, 0, 0, 0],
-            )
-
-    def test_histogram_min_max_equal_nonzero(self):
-        X = np.array([1., 1., 1., 1., 1.], dtype=self.dtype)
-        logspacing_start = 1e-24
-        self._run_single_op_net([X], 3, logspacing_start)
-        self._check_histogram(
-            [X],
-            4,
-            expected_values=[1., 1., 1., 1.],
-            expected_counts=[5, 0, 0, 0]
-        )
-
-    def test_histogram_empty_input_tensor(self):
-        X = np.array([], dtype=self.dtype)
-        self._run_single_op_net([X], 1)
-        self._check_histogram(
-            [X],
-            2,
-            expected_values=[0., 0.],
-            expected_counts=[0, 0]
-        )
-        self._run_single_op_net([X], 10)
-        self._check_histogram(
-            [X],
-            11,
-            expected_values=[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
-            expected_counts=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-        )
-
-    def test_histogram_multi_input(self):
-        X1 = np.array([-2.0, -2.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 9.0], dtype=self.dtype)
-        X2 = np.array([-5.0, -3.0, 7, 7, 0.0, 1.0, 2.0, -3.0, 4.0, 6.0, 9.0], dtype=self.dtype)
-        if self.bin_spacing == 'linear':
-            if not self.abs:
-                expected_values = [-5., -2.2, 0.6, 3.4, 6.2, 9.]
-                expected_counts = [3, 6, 5, 4, 4, 0]
-            else:
-                expected_values = [0., 1.8, 3.6, 5.4, 7.2, 9.]
-                expected_counts = [6, 7, 3, 4, 2, 0]
-        else:
-            expected_values = [1.e-24, 9.8e-20, 9.6e-15, 9.4e-10, 9.2e-05, 9.]
-            if not self.abs:
-                expected_counts = [9, 0, 0, 0, 13, 0]
-            else:
-                expected_counts = [4, 0, 0, 0, 18, 0]
-        self._run_single_op_net([X1, X2], 5)
-        self._check_histogram(
-            [X1, X2],
-            6,
-            expected_values=expected_values,
-            expected_counts=expected_counts
-        )
-
-    def test_histogram_very_small_range_for_stride_underflow(self):
-        """Tests a large number of bins for a very small range of values.
-
-        This test uses the float type. 1e-302 is very small, and with 1M bins
-        it causes the bin stride to underflow numerically. This test shows
-        that this case is handled.
-
-        Note: this test was flaky due to how the compiler and OS handle
-        floats. Previously, 1e-38 did not induce underflow for some
-        combinations of compiler and OS, which caused test errors. Now
-        1e-302 should be small enough.
-        """
-        X = np.array([0, 1e-302], dtype='f')
-        large_bin_number = 1000000
-        self._run_single_op_net([X], large_bin_number)
-        self._check_histogram(
-            [X],
-            large_bin_number + 1,
-            expected_counts=[2] + [0] * large_bin_number  # [2, 0, 0, ..., 0]
-        )
-
-
-    def test_histogram_insufficient_bins(self):
-        with self.assertRaisesRegex(
-            RuntimeError, "Number of bins must be greater than or equal to 1."
-        ):
-            self._run_single_op_net([np.random.rand(111)], 0)
-
-
-class TestSelfBinningHistogramLinear(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='d')
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLogarithmic(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="logarithmic", dtype='d')
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLinearFloat(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='f')
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLogarithmicFloat(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="logarithmic", dtype='f')
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLinearWithAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='d', abs=True)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLogarithmicWithAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="logarithmic", dtype='d', abs=True)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLinearFloatWithAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='f', abs=True)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLogarithmicFloatWithAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="logarithmic", dtype='f', abs=True)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLinearWithNoneAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='d', abs=None)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-class TestSelfBinningHistogramLinearFloatWithNoneAbs(TestSelfBinningHistogramBase, hu.HypothesisTestCase):
-    def __init__(self, *args, **kwargs):
-        TestSelfBinningHistogramBase.__init__(self, bin_spacing="linear", dtype='f', abs=None)
-        hu.HypothesisTestCase.__init__(self, *args, **kwargs)
-
-if __name__ == "__main__":
-    global_options = ["caffe2"]
-    core.GlobalInit(global_options)
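For reference, the bin-edge layout these deleted tests assert on can be sketched in a few lines of NumPy. This is an illustrative approximation of SelfBinningHistogram's behavior, not the operator's actual C++ implementation; the helper name and the epsilon padding of the top edge are assumptions made for the sketch.

    import numpy as np

    def self_binning_edges(x, num_bins, bin_spacing="linear",
                           logspacing_start=1e-24, use_abs=False):
        """Approximate the num_bins + 1 edge values the operator emits."""
        eps = 1e-6  # assumed: the top edge is padded so max(x) lands in the last non-empty bin
        if use_abs:
            x = np.abs(x)
        lo, hi = float(np.min(x)), float(np.max(x))
        if bin_spacing == "linear":
            return np.linspace(lo, hi + eps, num_bins + 1)
        # Logarithmic spacing always starts at logspacing_start (default 1e-24),
        # which is why the logarithmic expected_values above begin at 1e-24
        # rather than at min(x).
        return np.geomspace(logspacing_start, max(hi, logspacing_start) + eps,
                            num_bins + 1)

    # For X = [-2, ..., 9] and num_bins=5 this yields edges close to
    # [-2, 0.2, 2.4, 4.6, 6.8, 9] (linear) and
    # [1e-24, 9.8e-20, 9.6e-15, 9.4e-10, 9.2e-05, 9] (logarithmic),
    # matching the expected_values used in the tests. Inputs are counted
    # into the half-open bins [edge_i, edge_i+1), so the final count of the
    # num_bins + 1 reported counts is always 0.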
- unittest.main() diff --git a/caffe2/python/operator_test/selu_op_test.py b/caffe2/python/operator_test/selu_op_test.py deleted file mode 100644 index 73cb0736dcee..000000000000 --- a/caffe2/python/operator_test/selu_op_test.py +++ /dev/null @@ -1,100 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestSelu(serial.SerializedTestCase): - - @serial.given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_selu_1(self, X, gc, dc, engine): - alpha = 1.0 - scale = 2.0 - op = core.CreateOperator("Selu", ["X"], ["Y"], - alpha=alpha, scale=scale, engine=engine) - X = TestSelu.fix0(X) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - self.assertReferenceChecks( - gc, op, [X], lambda x: TestSelu.selu_ref(x, alpha=alpha, scale=scale) - ) - - @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_selu_2(self, X, gc, dc, engine): - alpha = 1.6732 - scale = 1.0507 - op = core.CreateOperator("Selu", ["X"], ["Y"], - alpha=alpha, scale=scale, engine=engine) - - X = TestSelu.fix0(X) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=1e-2, threshold=1e-2) - self.assertReferenceChecks( - gc, op, [X], lambda x: TestSelu.selu_ref(x, alpha=alpha, scale=scale) - ) - - @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_selu_3(self, X, gc, dc, engine): - alpha = 1.3 - scale = 1.1 - op = core.CreateOperator("Selu", ["X"], ["Y"], - alpha=alpha, scale=scale, engine=engine) - - X = TestSelu.fix0(X) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - self.assertReferenceChecks( - gc, op, [X], lambda x: TestSelu.selu_ref(x, alpha=alpha, scale=scale) - ) - - @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_selu_inplace(self, X, gc, dc, engine): - alpha = 1.3 - scale = 1.1 - op = core.CreateOperator("Selu", ["X"], ["X"], - alpha=alpha, scale=scale, engine=engine) - - X = TestSelu.fix0(X) - self.assertDeviceChecks(dc, op, [X], [0]) - - # inplace gradient - Y = TestSelu.selu_ref(X, alpha=alpha, scale=scale) - dX = np.ones_like(X) - op2 = core.CreateOperator("SeluGradient", ["Y", "dX"], ["dX"], - alpha=alpha, scale=scale, engine=engine) - self.assertDeviceChecks(dc, op2, [Y, dX], [0]) - - @staticmethod - def fix0(X): - # go away from the origin point to avoid kink problems - X += 0.02 * np.sign(X) - X[X == 0.0] += 0.02 - return X - - @staticmethod - def selu_ref(x, scale, alpha): - ret = scale * ((x > 0) * x + (x <= 0) * (alpha * (np.exp(x) - 1))) - return [ret] - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/sequence_ops_test.py b/caffe2/python/operator_test/sequence_ops_test.py deleted file mode 100644 index cb07a96fa0f7..000000000000 --- a/caffe2/python/operator_test/sequence_ops_test.py +++ /dev/null @@ -1,442 +0,0 @@ - - - - - -from caffe2.python import core -from functools import partial -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import unittest 
-from caffe2.python import workspace - - -def _gen_test_add_padding(with_pad_data=True, - is_remove=False): - def gen_with_size(args): - lengths, inner_shape = args - data_dim = [sum(lengths)] + inner_shape - lengths = np.array(lengths, dtype=np.int32) - if with_pad_data: - return st.tuples( - st.just(lengths), - hu.arrays(data_dim), - hu.arrays(inner_shape), - hu.arrays(inner_shape)) - else: - return st.tuples(st.just(lengths), hu.arrays(data_dim)) - - min_len = 4 if is_remove else 0 - lengths = st.lists( - st.integers(min_value=min_len, max_value=10), - min_size=0, - max_size=5) - inner_shape = st.lists( - st.integers(min_value=1, max_value=3), - min_size=0, - max_size=2) - return st.tuples(lengths, inner_shape).flatmap(gen_with_size) - - -def _add_padding_ref( - start_pad_width, end_pad_width, ret_lengths, - data, lengths, start_padding=None, end_padding=None): - if start_padding is None: - start_padding = np.zeros(data.shape[1:], dtype=data.dtype) - end_padding = ( - end_padding if end_padding is not None else start_padding) - out_size = data.shape[0] + ( - start_pad_width + end_pad_width) * len(lengths) - out = np.ndarray((out_size,) + data.shape[1:]) - in_ptr = 0 - out_ptr = 0 - for length in lengths: - out[out_ptr:(out_ptr + start_pad_width)] = start_padding - out_ptr += start_pad_width - out[out_ptr:(out_ptr + length)] = data[in_ptr:(in_ptr + length)] - in_ptr += length - out_ptr += length - out[out_ptr:(out_ptr + end_pad_width)] = end_padding - out_ptr += end_pad_width - lengths_out = lengths + (start_pad_width + end_pad_width) - if ret_lengths: - return (out, lengths_out) - else: - return (out, ) - - -def _remove_padding_ref(start_pad_width, end_pad_width, data, lengths): - pad_width = start_pad_width + end_pad_width - out_size = data.shape[0] - ( - start_pad_width + end_pad_width) * len(lengths) - out = np.ndarray((out_size,) + data.shape[1:]) - in_ptr = 0 - out_ptr = 0 - for length in lengths: - out_length = length - pad_width - out[out_ptr:(out_ptr + out_length)] = data[ - (in_ptr + start_pad_width):(in_ptr + length - end_pad_width)] - in_ptr += length - out_ptr += out_length - lengths_out = lengths - (start_pad_width + end_pad_width) - return (out, lengths_out) - - -def _gather_padding_ref(start_pad_width, end_pad_width, data, lengths): - start_padding = np.zeros(data.shape[1:], dtype=data.dtype) - end_padding = np.zeros(data.shape[1:], dtype=data.dtype) - pad_width = start_pad_width + end_pad_width - ptr = 0 - for length in lengths: - for _ in range(start_pad_width): - start_padding += data[ptr] - ptr += 1 - ptr += length - pad_width - for _ in range(end_pad_width): - end_padding += data[ptr] - ptr += 1 - return (start_padding, end_padding) - - -class TestSequenceOps(serial.SerializedTestCase): - @given(start_pad_width=st.integers(min_value=1, max_value=2), - end_pad_width=st.integers(min_value=0, max_value=2), - args=_gen_test_add_padding(with_pad_data=True), - ret_lengths=st.booleans(), - **hu.gcs) - @settings(deadline=10000) - def test_add_padding( - self, start_pad_width, end_pad_width, args, ret_lengths, gc, dc - ): - lengths, data, start_padding, end_padding = args - start_padding = np.array(start_padding, dtype=np.float32) - end_padding = np.array(end_padding, dtype=np.float32) - outputs = ['output', 'lengths_out'] if ret_lengths else ['output'] - op = core.CreateOperator( - 'AddPadding', ['data', 'lengths', 'start_padding', 'end_padding'], - outputs, - padding_width=start_pad_width, - end_padding_width=end_pad_width - ) - self.assertReferenceChecks( - 
device_option=gc, - op=op, - inputs=[data, lengths, start_padding, end_padding], - reference=partial( - _add_padding_ref, start_pad_width, end_pad_width, ret_lengths - ) - ) - - def _local_test_add_padding_shape_and_type( - self, - data, - start_pad_width, - end_pad_width, - ret_lengths, - lengths=None, - ): - if ret_lengths and lengths is None: - return - - workspace.ResetWorkspace() - workspace.FeedBlob("data", data) - if lengths is not None: - workspace.FeedBlob("lengths", np.array(lengths).astype(np.int32)) - - op = core.CreateOperator( - 'AddPadding', - ['data'] if lengths is None else ['data', 'lengths'], - ['output', 'lengths_out'] if ret_lengths else ['output'], - padding_width=start_pad_width, - end_padding_width=end_pad_width - ) - - add_padding_net = core.Net("add_padding_net") - add_padding_net.Proto().op.extend([op]) - assert workspace.RunNetOnce( - add_padding_net - ), "Failed to run the add_padding_net" - - shapes, types = workspace.InferShapesAndTypes( - [add_padding_net], - ) - - expected_shape = list(data.shape) - expected_shape[0] += (1 if lengths is None else len(lengths)) * (start_pad_width + end_pad_width) - self.assertEqual(shapes["output"], expected_shape) - self.assertEqual(types["output"], core.DataType.FLOAT) - if ret_lengths: - if lengths is None: - self.assertEqual(shapes["lengths_out"], [1]) - else: - self.assertEqual(shapes["lengths_out"], [len(lengths)]) - self.assertEqual(types["lengths_out"], core.DataType.INT32) - - - def test_add_padding_shape_and_type_3( - self - ): - for start_pad_width in range(3): - for end_pad_width in range(3): - for ret_lengths in [True, False]: - self._local_test_add_padding_shape_and_type( - data=np.random.rand(1, 2).astype(np.float32), - lengths=None, - start_pad_width=start_pad_width, - end_pad_width=end_pad_width, - ret_lengths=ret_lengths, - ) - - def test_add_padding_shape_and_type_4( - self - ): - for start_pad_width in range(3): - for end_pad_width in range(3): - for ret_lengths in [True, False]: - self._local_test_add_padding_shape_and_type( - data=np.random.rand(3, 1, 2).astype(np.float32), - lengths=[1, 1, 1], - start_pad_width=start_pad_width, - end_pad_width=end_pad_width, - ret_lengths=ret_lengths, - ) - - def test_add_padding_shape_and_type_5( - self - ): - for start_pad_width in range(3): - for end_pad_width in range(3): - for ret_lengths in [True, False]: - self._local_test_add_padding_shape_and_type( - data=np.random.rand(3, 2, 1).astype(np.float32), - lengths=None, - start_pad_width=start_pad_width, - end_pad_width=end_pad_width, - ret_lengths=ret_lengths, - ) - - @given(start_pad_width=st.integers(min_value=0, max_value=3), - end_pad_width=st.integers(min_value=0, max_value=3), - num_dims=st.integers(min_value=1, max_value=4), - num_groups=st.integers(min_value=0, max_value=4), - ret_lengths=st.booleans(), - **hu.gcs) - @settings(deadline=1000) - def test_add_padding_shape_and_type( - self, start_pad_width, end_pad_width, num_dims, num_groups, ret_lengths, gc, dc - ): - np.random.seed(666) - lengths = [] - for _ in range(num_groups): - lengths.append(np.random.randint(0, 3)) - if sum(lengths) == 0: - lengths = [] - - data_shape = [] - for _ in range(num_dims): - data_shape.append(np.random.randint(1, 4)) - if sum(lengths) > 0: - data_shape[0] = sum(lengths) - - data = np.random.randn(*data_shape).astype(np.float32) - - self._local_test_add_padding_shape_and_type( - data=data, - lengths=lengths if len(lengths) else None, - start_pad_width=start_pad_width, - end_pad_width=end_pad_width, - 
ret_lengths=ret_lengths, - ) - - @given(start_pad_width=st.integers(min_value=1, max_value=2), - end_pad_width=st.integers(min_value=0, max_value=2), - args=_gen_test_add_padding(with_pad_data=False), - **hu.gcs) - def test_add_zero_padding(self, start_pad_width, end_pad_width, args, gc, dc): - lengths, data = args - op = core.CreateOperator( - 'AddPadding', - ['data', 'lengths'], - ['output', 'lengths_out'], - padding_width=start_pad_width, - end_padding_width=end_pad_width) - self.assertReferenceChecks( - gc, - op, - [data, lengths], - partial(_add_padding_ref, start_pad_width, end_pad_width, True)) - - @given(start_pad_width=st.integers(min_value=1, max_value=2), - end_pad_width=st.integers(min_value=0, max_value=2), - data=hu.tensor(min_dim=1, max_dim=3), - **hu.gcs) - def test_add_padding_no_length(self, start_pad_width, end_pad_width, data, gc, dc): - op = core.CreateOperator( - 'AddPadding', - ['data'], - ['output', 'output_lens'], - padding_width=start_pad_width, - end_padding_width=end_pad_width) - self.assertReferenceChecks( - gc, - op, - [data], - partial( - _add_padding_ref, start_pad_width, end_pad_width, True, - lengths=np.array([data.shape[0]]))) - - # Uncomment the following seed to make this fail. - # @seed(302934307671667531413257853548643485645) - # See https://github.com/caffe2/caffe2/issues/1547 - @unittest.skip("flaky test") - @given(start_pad_width=st.integers(min_value=1, max_value=2), - end_pad_width=st.integers(min_value=0, max_value=2), - args=_gen_test_add_padding(with_pad_data=False, is_remove=True), - **hu.gcs) - def test_remove_padding(self, start_pad_width, end_pad_width, args, gc, dc): - lengths, data = args - op = core.CreateOperator( - 'RemovePadding', - ['data', 'lengths'], - ['output', 'lengths_out'], - padding_width=start_pad_width, - end_padding_width=end_pad_width) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data, lengths], - reference=partial(_remove_padding_ref, start_pad_width, end_pad_width)) - - @given(start_pad_width=st.integers(min_value=0, max_value=2), - end_pad_width=st.integers(min_value=0, max_value=2), - args=_gen_test_add_padding(with_pad_data=True), - **hu.gcs) - @settings(deadline=10000) - def test_gather_padding(self, start_pad_width, end_pad_width, args, gc, dc): - lengths, data, start_padding, end_padding = args - padded_data, padded_lengths = _add_padding_ref( - start_pad_width, end_pad_width, True, data, - lengths, start_padding, end_padding) - op = core.CreateOperator( - 'GatherPadding', - ['data', 'lengths'], - ['start_padding', 'end_padding'], - padding_width=start_pad_width, - end_padding_width=end_pad_width) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[padded_data, padded_lengths], - reference=partial(_gather_padding_ref, start_pad_width, end_pad_width)) - - @given(data=hu.tensor(min_dim=3, max_dim=3, dtype=np.float32, - elements=hu.floats(min_value=-np.inf, - max_value=np.inf), - min_value=1, max_value=10), - **hu.gcs) - @settings(deadline=10000) - def test_reverse_packed_segs(self, data, gc, dc): - max_length = data.shape[0] - batch_size = data.shape[1] - lengths = np.random.randint(max_length + 1, size=batch_size) - - op = core.CreateOperator( - "ReversePackedSegs", - ["data", "lengths"], - ["reversed_data"]) - - def op_ref(data, lengths): - rev_data = np.array(data, copy=True) - for i in range(batch_size): - seg_length = lengths[i] - for j in range(seg_length): - rev_data[j][i] = data[seg_length - 1 - j][i] - return (rev_data,) - - def op_grad_ref(grad_out, outputs, 
inputs): - return op_ref(grad_out, inputs[1]) + (None,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data, lengths], - reference=op_ref, - output_to_grad='reversed_data', - grad_reference=op_grad_ref) - - @given(data=hu.tensor(min_dim=1, max_dim=3, dtype=np.float32, - elements=hu.floats(min_value=-np.inf, - max_value=np.inf), - min_value=10, max_value=10), - indices=st.lists(st.integers(min_value=0, max_value=9), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_remove_data_blocks(self, data, indices, gc, dc): - indices = np.array(indices) - - op = core.CreateOperator( - "RemoveDataBlocks", - ["data", "indices"], - ["shrunk_data"]) - - def op_ref(data, indices): - unique_indices = np.unique(indices) if len(indices)>0 else np.array([],dtype=np.int64) - sorted_indices = np.sort(unique_indices) - shrunk_data = np.delete(data, sorted_indices, axis=0) - return (shrunk_data,) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data, indices], - reference=op_ref) - - @given(elements=st.lists(st.integers(min_value=0, max_value=9), - min_size=0, - max_size=10), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_find_duplicate_elements(self, elements, gc, dc): - mapping = { - 0: "a", - 1: "b", - 2: "c", - 3: "d", - 4: "e", - 5: "f", - 6: "g", - 7: "h", - 8: "i", - 9: "j"} - data = np.array([mapping[e] for e in elements], dtype='|S') - - op = core.CreateOperator( - "FindDuplicateElements", - ["data"], - ["indices"]) - - def op_ref(data): - unique_data = [] - indices = [] - for i, e in enumerate(data): - if e in unique_data: - indices.append(i) - else: - unique_data.append(e) - return (np.array(indices, dtype=np.int64),) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[data], - reference=op_ref) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/shape_inference_test.py b/caffe2/python/operator_test/shape_inference_test.py deleted file mode 100644 index 4be2cdb22d47..000000000000 --- a/caffe2/python/operator_test/shape_inference_test.py +++ /dev/null @@ -1,672 +0,0 @@ - - - - - -import numpy as np -import unittest - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace, test_util, model_helper, brew, build - - -@unittest.skipIf(build.CAFFE2_NO_OPERATOR_SCHEMA, - 'Built with CAFFE2_NO_OPERATOR_SCHEMA') -class TestShapeInference(test_util.TestCase): - - def testShapeInferenceSimpleFC(self): - m = model_helper.ModelHelper(name="test_model") - - brew.fc(m, "data", "fc1", dim_in=96, dim_out=32) - brew.fc(m, "fc1", "fc2", dim_in=32, dim_out=55) - - for b in [0, 64]: - (shapes, types) = workspace.InferShapesAndTypes( - [m.param_init_net, m.net], - {'data': [b, 96]} - ) - - self.assertEqual(shapes['data'], [b, 96]) - self.assertEqual(shapes['fc1_w'], [32, 96]) - self.assertEqual(shapes['fc1_b'], [32]) - self.assertEqual(shapes['fc1'], [b, 32]) - self.assertEqual(shapes['fc2_w'], [55, 32]) - self.assertEqual(shapes['fc2_b'], [55]) - self.assertEqual(shapes['fc2'], [b, 55]) - - def testFCAxis2(self): - model = model_helper.ModelHelper(name="test_model") - model.net.FC(["x", "w", "b"], ["y"], axis=2) - workspace.FeedBlob("x", np.random.rand(4, 20, 36).astype(np.float32)) - workspace.FeedBlob("w", np.random.rand(36, 36).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(36,).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testFCTransposed(self): - model = 
model_helper.ModelHelper(name="test_model") - model.net.FCTransposed(["x", "wt", "b"], ["y"]) - workspace.FeedBlob("x", np.random.rand(20, 36).astype(np.float32)) - workspace.FeedBlob("wt", np.random.rand(36, 48).astype(np.float32)) - workspace.FeedBlob("b", np.random.rand(48,).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testShapeInferenceSlice(self): - model = model_helper.ModelHelper(name="test_model") - model.net.Slice(["x"], ["y"], starts=[0, 0, 0, 0], ends=[-1, -1, -3, -1]) - workspace.FeedBlob("x", np.random.rand(64, 1, 255, 384).astype(np.float32)) - - slice_starts = np.array([0, 0, 0, 0]).astype(np.int32) - slice_ends = np.array([-1, -1, -3, -1]).astype(np.int32) - slice_starts = model.net.GivenTensorIntFill( - [], shape=[4], values=slice_starts) - slice_ends = model.net.GivenTensorIntFill( - [], shape=[4], values=slice_ends) - model.net.Slice(["x2", slice_starts, slice_ends], ["y2"]) - workspace.FeedBlob("x2", np.random.rand(64, 1, 255, 384).astype(np.float32)) - - self.InferTensorRunAndCompare(model, ["y2"]) - - def testShapeInferenceDistances(self): - model = model_helper.ModelHelper(name="test_model") - model.net.L1Distance(["x1", "y1"], "dl1_D1") - model.net.SquaredL2Distance(["x1", "y1"], "dl2_D1") - model.net.CosineSimilarity(["x1", "y1"], "dcos_D1") - model.net.DotProduct(["x1", "y1"], "ddot_D1") - model.net.DotProductWithPadding(["x1", "y1"], "ddotpad_D1") - - model.net.L1Distance(["x2", "y2"], "dl1_D2") - model.net.SquaredL2Distance(["x2", "y2"], "dl2_D2") - model.net.CosineSimilarity(["x2", "y2"], "dcos_D2") - model.net.DotProduct(["x2", "y2"], "ddot_D2") - model.net.DotProductWithPadding(["x2", "z2"], "ddotpad_D2") - - workspace.FeedBlob("x1", np.random.rand(10).astype(np.float32)) - workspace.FeedBlob("y1", np.random.rand(10).astype(np.float32)) - - workspace.FeedBlob("x2", np.random.rand(10, 5).astype(np.float32)) - workspace.FeedBlob("y2", np.random.rand(10, 5).astype(np.float32)) - workspace.FeedBlob("z2", np.random.rand(10, 4).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testShapeInferenceReduceBackFrontX(self): - model = model_helper.ModelHelper(name="test_model") - model.net.ReduceBackSum(["x"], ["x_back_sum"]) - model.net.ReduceBackMean(["x"], ["x_back_mean"]) - model.net.ReduceBackMax(["x"], ["x_back_max"]) - model.net.ReduceFrontSum(["x"], ["x_front_sum"]) - model.net.ReduceFrontMean(["x"], ["x_front_mean"]) - model.net.ReduceFrontMax(["x"], ["x_front_max"]) - - workspace.FeedBlob("x", np.random.rand(10, 12, 18).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testGather(self): - model = model_helper.ModelHelper(name="test_model") - model.net.Gather(["X", "idx"], "Y") - workspace.FeedBlob("X", np.random.rand(100, 4, 5).astype(np.float32)) - workspace.FeedBlob("idx", np.array([[3, 18], [99, 4], [2, 5]]).astype(np.int32)) - self.InferTensorRunAndCompare(model) - - def testShapeInferenceConvNet(self): - model = model_helper.ModelHelper(name="convtest") - model.NHWC2NCHW("data", "data_nchw") - brew.conv(model, "data_nchw", 'conv1', 3, 64, - weight_init=("MSRAFill", {}), kernel=7, - stride=2, pad=3, no_bias=0) - brew.spatial_bn(model, 'conv1', 'conv1_spatbn_relu', 64, epsilon=1e-3, is_test=False) - brew.relu(model, 'conv1_spatbn_relu', 'conv1_spatbn_relu') - brew.max_pool(model, 'conv1_spatbn_relu', 'pool1', kernel=3, stride=2) - brew.fc(model, 'pool1', 'fc', dim_in=(64 * 56 * 56), dim_out=100) - brew.dropout(model, 'fc', 'fc_drop', is_test=False) - model.Sigmoid('fc_drop', 'fc_sigm') - 
brew.softmax(model, 'fc_sigm', 'softmax')
-        model.LabelCrossEntropy(['softmax', 'label'], 'xent')
-        loss = model.AveragedLoss('xent', 'loss')
-
-        model.AddGradientOperators([loss])
-
-        LR = model.param_init_net.ConstantFill(
-            [], 'LR', shape=[1], value=0.1
-        )
-
-        for param in model.GetParams():
-            param_grad = model.param_to_grad[param]
-            param_momentum = model.param_init_net.ConstantFill(
-                [param], param + '_momentum', value=0.0
-            )
-            model.net.MomentumSGDUpdate(
-                [param_grad, param_momentum, LR, param],
-                [param_grad, param_momentum, param],
-            )
-
-        workspace.FeedBlob(
-            "data",
-            np.random.rand(16, 227, 227, 3).astype(np.float32),
-        )
-        workspace.FeedBlob(
-            "label",
-            (100 * np.random.rand(16)).astype(np.int32),
-        )
-        # Then do the automatic comparison test: run the net once to
-        # initialize everything
-        self.InferTensorRunAndCompare(model)
-
-    def testShapeInferenceTranspose(self):
-        model = model_helper.ModelHelper(name="test_model")
-
-        workspace.FeedBlob(
-            "tensor",
-            np.random.rand(4, 2, 3, 3, 5).astype(np.float32)
-        )
-
-        # Testing with axes undefined
-        brew.transpose(
-            model,
-            ["tensor"],
-            "transpose",
-        )
-        self.InferTensorRunAndCompare(model)
-
-        # Testing with axes defined
-        brew.transpose(
-            model,
-            ["tensor"],
-            "transpose",
-            axes=np.random.permutation(5)
-        )
-
-        return self.InferTensorRunAndCompare(model)
-
-    def testShapeInferencePad(self):
-        model = model_helper.ModelHelper(name="padtest")
-        model.PadImage("data", 'padded', pad_t=100, pad_l=37, pad_b=28,
-                       pad_r=20, mode="constant", order="NCHW")
-
-        workspace.FeedBlob(
-            "data",
-            np.random.rand(16, 3, 228, 228).astype(np.float32),
-        )
-
-        self.InferTensorRunAndCompare(model)
-
-    def testShapeInferenceTwoClass(self):
-        model = model_helper.ModelHelper(name="twoclass")
-        model.MakeTwoClass("v", "v2")
-        workspace.FeedBlob("v", np.random.rand(32).astype(np.float32))
-        self.InferTensorRunAndCompare(model)
-
-    def testShapeInferencePadZero(self):
-        model = model_helper.ModelHelper(name="padtest")
-        model.PadImage("data", 'padded', pad=0, mode="constant",
-                       order="NCHW")
-
-        workspace.FeedBlob(
-            "data",
-            np.random.rand(16, 3, 228, 228).astype(np.float32),
-        )
-
-        self.InferTensorRunAndCompare(model)
-
-    def testShapeInferenceMatMul(self):
-        model = model_helper.ModelHelper(name="test_model")
-
-        model.MatMul(["x", "y"], "MatMul")
-
-        workspace.FeedBlob("x", np.random.rand(10, 5).astype(np.float32))
-        workspace.FeedBlob("y", np.random.rand(5, 10).astype(np.float32))
-
-        self.InferTensorRunAndCompare(model)
-
-    def testShapeInferenceSoftmaxWithLoss(self):
-        model = model_helper.ModelHelper(name="test_model")
-
-        model.SoftmaxWithLoss(
-            ["logits", "labels"],
-            ["softmax", "loss"],
-        )
-
-        # 2D shape of [batch_size, num_classes]
-        workspace.FeedBlob(
-            "logits",
-            np.random.rand(4, 3).astype(np.float32),
-        )
-
-        # Shape of size batch_size with all values in [0, num_classes)
-        workspace.FeedBlob(
-            "labels",
-            np.random.randint(low=0, high=3, size=(4, 1)).astype(np.int32),
-        )
-        self.InferTensorRunAndCompare(model)
-
-        # Testing with 1D labels arg
-        workspace.FeedBlob(
-            "logits",
-            np.random.rand(4, 3).astype(np.float32),
-        )
-
-        workspace.FeedBlob(
-            "labels",
-            np.random.randint(low=0, high=3, size=4).astype(np.int32),
-        )
-        self.InferTensorRunAndCompare(model)
-
-        # Testing with weight_tensor
-        model.SoftmaxWithLoss(
-            ["logits", "labels", "weight_tensor"],
-            ["softmax", "loss"],
-        )
-
-        workspace.FeedBlob(
-            "logits",
np.random.rand(4, 3).astype(np.float32), - ) - - workspace.FeedBlob( - "labels", - np.random.randint(low=0, high=3, size=4).astype(np.int32), - ) - - workspace.FeedBlob( - "weight_tensor", - np.random.rand(4).astype(np.float32), - ) - self.InferTensorRunAndCompare(model) - - # Test spatial model - model = model_helper.ModelHelper(name="test_model") - workspace.FeedBlob( - "img", - np.random.rand(32, 19, 33, 28).astype(np.float32) - ) - workspace.FeedBlob( - "img_labels", - (np.random.rand(32, 33, 28) * 19).astype(np.int32) - ) - model.SpatialSoftmaxWithLoss( - ["img", "img_labels"], - ["softmax_img", "loss"], - ) - self.InferTensorRunAndCompare(model) - - def testShapeInferenceIm2Col(self): - # Test with NCHW - model = model_helper.ModelHelper(name="test_model") - model.Im2Col("X", "Y", pad=1, kernel=4, dilation=2, stride=2, - order="NCHW") - - workspace.FeedBlob( - "X", - np.random.rand(16, 3, 228, 228).astype(np.float32), - ) - - self.InferTensorRunAndCompare(model) - - # Test with NHWC - model = model_helper.ModelHelper(name="test_model") - model.Im2Col("X", "Y", pad=1, kernel=4, dilation=2, stride=2, - order="NHWC") - - workspace.FeedBlob( - "X", - np.random.rand(16, 228, 228, 3).astype(np.float32), - ) - - self.InferTensorRunAndCompare(model) - - # Test with different width and height - model = model_helper.ModelHelper(name="test_model") - model.Im2Col("X", "Y", pad=1, kernel_h=8, kernel_w=4, - dilation=2, stride=2) - - workspace.FeedBlob( - "X", - np.random.rand(16, 3, 228, 114).astype(np.float32), - ) - - self.InferTensorRunAndCompare(model) - - def testShapeInferenceTile(self): - m = model_helper.ModelHelper(name="test_model") - - workspace.FeedBlob( - "tensor", - np.random.rand(4, 2, 3, 3, 5).astype(np.float32) - ) - - # Testing with axes undefined - for i in range(0, 4): - m.net.Tile( - "tensor", "tiled_tensor_{}".format(i), tiles=5, axis=i) - self.InferTensorRunAndCompare(m) - - def testShapeInferenceFlatten(self): - model = model_helper.ModelHelper(name="test_model") - model.FlattenToVec("X", "FlatVec") - model.FlattenToVec("empty", "EmptyFlatVec") - workspace.FeedBlob("X", np.random.rand(17, 5, 13).astype(np.float32)) - workspace.FeedBlob("empty", np.random.rand(0, 2, 3).astype(np.float32)) - - self.InferTensorRunAndCompare(model) - - # test Flatten with default axis (=1) - model = model_helper.ModelHelper(name="test_model") - model.Flatten("X", "Flat") - model.Flatten("empty", "EmptyFlat") - workspace.FeedBlob("X", np.random.rand(17, 5, 13).astype(np.float32)) - workspace.FeedBlob("empty", np.random.rand(0, 2, 3).astype(np.float32)) - - self.InferTensorRunAndCompare(model) - - # test Flatten with axis - model = model_helper.ModelHelper(name="test_model") - x = np.random.randn(17, 5, 13) - for axis in range(x.ndim + 1): - model.Flatten("x", "Flat", axis=axis) - workspace.FeedBlob("x", x) - self.InferTensorRunAndCompare(model) - - empty = np.random.randn(0, 5, 13) - for axis in range(empty.ndim + 1): - model.Flatten("empty", "Flat", axis=axis) - workspace.FeedBlob("empty", empty) - self.InferTensorRunAndCompare(model) - - def testShapeInferenceReshape(self): - model = model_helper.ModelHelper(name="test_model") - model.Reshape("X", ["Reshaped", "Old_Shape"], shape=[8, 0, -1, 2]) - workspace.FeedBlob("X", np.random.rand(4, 26, 32).astype(np.float32)) - - self.InferTensorRunAndCompare(model) - - def testShapeInferenceUnique(self): - for n in [0, 1]: - model = model_helper.ModelHelper(name="test_model") - model.Unique("X", ["Y"]) - model.Unique("X", ["Z", "remap"]) - 
workspace.FeedBlob("X", np.random.rand(n).astype(np.int64)) - self.InferTensorRunAndCompare(model) - - def testLengthsSum(self): - model = model_helper.ModelHelper(name="test_model") - model.LengthsSum(["X", "length"], ["sum"]) - workspace.FeedBlob("X", np.random.rand(6, 32).astype(np.float32)) - workspace.FeedBlob("length", np.array([1, 2, 3], dtype=np.int32)) - - self.InferTensorRunAndCompare(model) - - def testLengthsPad(self): - model = model_helper.ModelHelper(name="test_model") - model.LengthsPad( - ["X", "length"], - ["X_padded"], - target_length=10, - padding_value=-1.0, - ) - workspace.FeedBlob("X", np.random.rand(6, 32).astype(np.float32)) - workspace.FeedBlob("length", np.array([1, 2, 3], dtype=np.int32)) - - self.InferTensorRunAndCompare(model) - - def testConcat(self): - net = core.Net("concat") - - net.Concat(["A", "B"], ["C", "splits"], axis=1) - net.Concat(["C", "D"], ["E", "splitsE"], order="NCHW") - net.Concat(["E", "F"], ["G", "splitsG"], add_axis=1, order="NHWC") - (shapes, types) = workspace.InferShapesAndTypes( - [net], - { - 'A': [10, 12, 9, 10], - 'B': [10, 9, 9, 10], - 'D': [10, 2, 9, 10], - 'F': [10, 23, 9, 10] - } - ) - self.assertEqual(shapes['C'], [10, 21, 9, 10]) - self.assertEqual(shapes['splits'], [2]) - self.assertEqual(shapes['E'], [10, 23, 9, 10]) - self.assertEqual(shapes['G'], [10, 23, 9, 2, 10]) - - def testConcatInt32(self): - net = core.Net("concat") - - net.Concat(["A", "B"], ["C", "splits"], axis=1) - net.Concat(["C", "D"], ["E", "splitsE"], order="NCHW") - net.Concat(["E", "F"], ["G", "splitsG"], add_axis=1, order="NHWC") - (shapes, types) = workspace.InferShapesAndTypes( - [net], - blob_dimensions={ - 'A': [10, 12, 9, 10], - 'B': [10, 9, 9, 10], - 'D': [10, 2, 9, 10], - 'F': [10, 23, 9, 10] - }, - blob_types={ - 'A': core.DataType.INT32, - 'B': core.DataType.INT32, - 'D': core.DataType.INT32, - 'F': core.DataType.INT32, - } - ) - self.assertEqual(shapes['C'], [10, 21, 9, 10]) - self.assertEqual(shapes['splits'], [2]) - self.assertEqual(shapes['E'], [10, 23, 9, 10]) - self.assertEqual(shapes['G'], [10, 23, 9, 2, 10]) - self.assertEqual(types['C'], core.DataType.INT32) - self.assertEqual(types['splits'], core.DataType.INT32) - self.assertEqual(types['E'], core.DataType.INT32) - self.assertEqual(types['G'], core.DataType.INT32) - - def testSqueeze(self): - net = core.Net("sq") - net.Squeeze(["data"], ["data_squeezed"], dims=[3, 1]) - (shapes, types) = workspace.InferShapesAndTypes( - [net], - {'data': [64, 1, 96, 1, 4]} - ) - self.assertEqual(shapes['data_squeezed'], [64, 96, 4]) - - def testCast(self): - model = model_helper.ModelHelper(name="test_model") - - types = [ - ('bool', bool, caffe2_pb2.TensorProto.BOOL), - #('byte', None, caffe2_pb2.TensorProto.BYTE), - ('int8', np.int8, caffe2_pb2.TensorProto.INT8), - ('uint8', np.uint8, caffe2_pb2.TensorProto.UINT8), - ('int16', np.int16, caffe2_pb2.TensorProto.INT16), - ('uint16', np.uint16, caffe2_pb2.TensorProto.UINT16), - #('float16', np.float16, caffe2_pb2.TensorProto.FLOAT16), - ('int32', np.int32, caffe2_pb2.TensorProto.INT32), - ('float', np.float32, caffe2_pb2.TensorProto.FLOAT), - ('int64', np.int64, caffe2_pb2.TensorProto.INT64), - ('double', np.float64, caffe2_pb2.TensorProto.DOUBLE), - #('string', None, caffe2_pb2.TensorProto.STRING), - ] - - for (xstr, xnp, _) in types: - xname = 'X%s' % xstr - workspace.FeedBlob(xname, np.random.rand(1).astype(xnp)) - for (ystr, _, yc2) in types: - yname = 'Y%s_to_%s' % (xstr, ystr) - model.Cast(xname, yname, to=yc2) - - 
self.InferTensorRunAndCompare(model) - - def testShapeInferenceRoiPool(self): - for is_test in [True, False]: - model = model_helper.ModelHelper(name="test_model") - outputs = ['Y'] if is_test else ['Y', 'argmaxes'] - model.net.RoIPool( - ['X', 'R'], outputs, pooled_h=4, pooled_w=5, is_test=is_test) - workspace.FeedBlob( - "X", - np.random.rand(100, 3, 4, 5).astype(np.float32)) - workspace.FeedBlob( - "R", - np.random.rand(2, 5).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testShapeInferencePow(self): - model = model_helper.ModelHelper(name="powtest") - model.Pow("x", 'y', exponent=-1.0) - workspace.FeedBlob('x', np.random.rand(1, 2, 3, 4).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def testInt8Conversion(self): - model = model_helper.ModelHelper(name="fp32_int8_conversion_test") - model.FloatToFused8BitRowwiseQuantized('x', 'x_8bit') - model.Fused8BitRowwiseQuantizedToFloat('x_8bit', 'x_recovered') - workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) - self.InferTensorRunAndCompare(model) - x = workspace.FetchBlob('x') - x_recovered = workspace.FetchBlob('x_recovered') - # TODO: find a tighter bound - assert(np.allclose(x, x_recovered, atol=1e-2)) - - model = model_helper.ModelHelper(name="fp32_int8_conversion_test") - model.FloatToFused8BitRowwiseQuantizedHalfScaleBias('x', 'x_8bit') - model.Fused8BitRowwiseQuantizedHalfScaleBiasToFloat('x_8bit', 'x_recovered') - workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) - self.InferTensorRunAndCompare(model) - x = workspace.FetchBlob('x') - x_recovered = workspace.FetchBlob('x_recovered') - # TODO: find a tighter bound - assert(np.allclose(x, x_recovered, atol=1e-2)) - - - def testHalfInt8Conversion(self): - model = model_helper.ModelHelper(name="fp16_int8_conversion_test") - model.HalfFloatToFused8BitRowwiseQuantized('x', 'x_8bit') - model.Fused8BitRowwiseQuantizedToHalfFloat('x_8bit', 'x_recovered') - workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float16)) - self.InferTensorRunAndCompare(model) - x = workspace.FetchBlob('x') - x_recovered = workspace.FetchBlob('x_recovered') - # TODO: find a tighter bound - assert(np.allclose(x, x_recovered, atol=1e-2)) - - model = model_helper.ModelHelper(name="fp16_int8_conversion_test") - model.HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias('x', 'x_8bit') - model.Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat('x_8bit', 'x_recovered') - workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float16)) - self.InferTensorRunAndCompare(model) - x = workspace.FetchBlob('x') - x_recovered = workspace.FetchBlob('x_recovered') - # TODO: find a tighter bound - assert(np.allclose(x, x_recovered, atol=1e-2)) - - - def testLearningRateOp(self): - net = core.Net("lr_test") - iteration = net.ConstantFill( - [], - "iteration", - shape=[1], - value=0, - dtype=core.DataType.INT64, - ) - lr = net.LearningRate( - [iteration], - net.NextScopedBlob("weight_decay"), - base_lr=0.5, - policy="constantWarmup", - multiplier=0.0, - num_iter=0, - ) - (shapes, types) = workspace.InferShapesAndTypes( - [net], - ) - self.assertEqual(shapes['weight_decay'], [1]) - - def testShapeOp(self): - model = model_helper.ModelHelper(name="shape_op_test") - model.Shape('x', 'y') - workspace.FeedBlob('x', np.random.rand(100, 150).astype(np.float32)) - self.InferTensorRunAndCompare(model) - - def InferTensorRunAndCompare(self, model, expected_uninferred_blobs=None): - ''' - Runs shape inference, and then the model to check - that the inferred shapes 
agree with the actual ones - - 'expected_uninferred_blobs' is the list of blobs for which type and - shape cannot be inferred. - ''' - (shapes, types) = workspace.InferShapesAndTypes( - [model.param_init_net, model.net], - ) - - # .. Create net - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - workspace.RunNet(model.Proto().name) - - # ... and then check the shapes mismatch - correct_shapes = {} - correct_types = {} - for b in workspace.Blobs(): - arr = workspace.FetchBlob(b) - correct_shapes[b] = arr.shape - if type(arr) is np.ndarray: - if arr.dtype == np.dtype('float32'): - correct_types[b] = caffe2_pb2.TensorProto.FLOAT - elif arr.dtype == np.dtype('int32'): - correct_types[b] = caffe2_pb2.TensorProto.INT32 - # BYTE - # STRING - elif arr.dtype == np.dtype('bool'): - correct_types[b] = caffe2_pb2.TensorProto.BOOL - elif arr.dtype == np.dtype('uint8'): - correct_types[b] = caffe2_pb2.TensorProto.UINT8 - elif arr.dtype == np.dtype('int8'): - correct_types[b] = caffe2_pb2.TensorProto.INT8 - elif arr.dtype == np.dtype('uint16'): - correct_types[b] = caffe2_pb2.TensorProto.UINT16 - elif arr.dtype == np.dtype('int16'): - correct_types[b] = caffe2_pb2.TensorProto.INT16 - elif arr.dtype == np.dtype('int64'): - correct_types[b] = caffe2_pb2.TensorProto.INT64 - elif arr.dtype == np.dtype('float16'): - correct_types[b] = caffe2_pb2.TensorProto.FLOAT16 - elif arr.dtype == np.dtype('float64'): - correct_types[b] = caffe2_pb2.TensorProto.DOUBLE - else: - correct_types[b] = "unknown {}".format(arr.dtype) - else: - correct_types[b] = str(type(arr)) - - if expected_uninferred_blobs is None: - expected_uninferred_blobs = [] - for b in correct_shapes: - # skip blobs for which shape couldn't be inferred - if b in expected_uninferred_blobs: - continue - self.assertTrue( - np.array_equal( - np.array(shapes[b]).astype(np.int32), - np.array(correct_shapes[b]).astype(np.int32) - ), - "Shape {} mismatch: {} vs. correct {}".format( - b, shapes[b], correct_shapes[b] - ) - ) - self.assertFalse( - b not in types and b in correct_types, - "Type for {} not defined".format(b), - ) - self.assertEqual( - types[b], - correct_types[b], - "Type {} mismatch: {} vs. 
{}".format( - b, types[b], correct_types[b], - ) - ) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py b/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py deleted file mode 100644 index 03b50bfc952d..000000000000 --- a/caffe2/python/operator_test/sinusoid_position_encoding_op_test.py +++ /dev/null @@ -1,72 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import math - -MAX_TEST_EMBEDDING_SIZE = 20 -MAX_TEST_SEQUENCE_LENGTH = 10 -MAX_TEST_BATCH_SIZE = 5 -MIN_TEST_ALPHA = 5000.0 -MAX_TEST_ALPHA = 20000.0 -MIN_TEST_AMPLITUDE = 0.1 -MAX_TEST_AMPLITUDE = 10.0 - - -class TestSinusoidPositionEncodingOp(serial.SerializedTestCase): - @given( - positions_vec=hu.arrays( - dims=[MAX_TEST_SEQUENCE_LENGTH], - dtype=np.int32, - elements=st.integers(1, MAX_TEST_SEQUENCE_LENGTH) - ), - embedding_size=st.integers(1, MAX_TEST_EMBEDDING_SIZE), - batch_size=st.integers(1, MAX_TEST_BATCH_SIZE), - alpha=st.floats(MIN_TEST_ALPHA, MAX_TEST_ALPHA), - amplitude=st.floats(MIN_TEST_AMPLITUDE, MAX_TEST_AMPLITUDE), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_sinusoid_embedding( - self, positions_vec, embedding_size, batch_size, alpha, amplitude, gc, dc - ): - positions = np.tile(positions_vec, [batch_size, 1]).transpose() - - op = core.CreateOperator( - "SinusoidPositionEncoding", - ["positions"], - ["output"], - embedding_size=embedding_size, - alpha=alpha, - amplitude=amplitude, - ) - - def sinusoid_encoding(dim, position): - x = 1. * position / math.pow(alpha, 1. 
* dim / embedding_size) - if dim % 2 == 0: - return amplitude * math.sin(x) - else: - return amplitude * math.cos(x) - - def sinusoid_embedding_op(positions): - output_shape = (len(positions), len(positions[0]), embedding_size) - ar = np.zeros(output_shape) - for i, position_vector in enumerate(positions): - for j, position in enumerate(position_vector): - for k in range(embedding_size): - ar[i, j, k] = sinusoid_encoding(k, position) - return [ar] - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[positions], - reference=sinusoid_embedding_op, - ) diff --git a/caffe2/python/operator_test/softmax_ops_test.py b/caffe2/python/operator_test/softmax_ops_test.py deleted file mode 100644 index 8ec92ae1af9e..000000000000 --- a/caffe2/python/operator_test/softmax_ops_test.py +++ /dev/null @@ -1,684 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestSoftmaxOps(serial.SerializedTestCase): - - @serial.given(n=st.sampled_from([0, 2, 4, 71, 103]), - D=st.sampled_from([0, 4, 8, 64, 79, 256, 333]), - engine=st.sampled_from([None, 'CUDNN']), - **hu.gcs) - def test_softmax(self, n, D, engine, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Reference implementation of cross entropy with soft labels - def label_softmax(X): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - - if D == 0: - return [probs] - - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - return [probs] - - op = core.CreateOperator( - "Softmax", - ["X"], - ["probs"], - engine=engine - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=label_softmax, - ) - - @given(n=st.sampled_from([0, 2, 4, 71, 103, 555, 751, 1201]), - D=st.sampled_from([0, 4, 8, 64, 79, 256, 333, 1000]), - engine=st.sampled_from([None, 'CUDNN']), - **hu.gcs) - @settings(deadline=10000) - def test_softmax_grad(self, n, D, engine, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - Y = np.random.rand(n, D).astype(np.float32) - dY = np.random.rand(n, D).astype(np.float32) - Y = Y + 1e-2 - - # Reference implementation of cross entropy with soft labels - def label_softmax_grad(X, dY): - dX = Y * 0.0 - for i in range(n): - d = np.dot(Y[i, :], dY[i, :]) - dX[i, :] = Y[i, :] * (dY[i, :] - d) - return [dX] - - op = core.CreateOperator( - "SoftmaxGradient", - ["Y", "dY"], - ["dX"], - engine=engine - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[Y, dY], - reference=label_softmax_grad, - ) - - @given(axis=st.integers(min_value=1, max_value=4), - engine=st.sampled_from([None, 'CUDNN']), - **hu.gcs) - def test_softmax_axis(self, axis, engine, gc, dc): - np.random.seed(1) - X = np.random.randn(1, 2, 3, 2, 1).astype(np.float32) - X = X + 1e-2 - - def prod(xs): - p = 1 - for x in xs: - p *= x - return p - - N = prod(list(X.shape)[:axis]) - D = prod(list(X.shape)[axis:]) - - # Reference implementation of cross entropy with soft labels - def label_softmax(X): - X_ = X.reshape(N, D) - probs = np.zeros((N, D)) - rowmax = 
np.zeros(N) - for i in range(N): - rowmax[i] = max(X_[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X_[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - return [probs.reshape(*X.shape)] - - op = core.CreateOperator( - "Softmax", - ["X"], - ["probs"], - axis=axis, - engine=engine, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=label_softmax, - ) - - self.assertGradientChecks( - gc, op, [X], 0, [0], stepsize=1e-4, threshold=1e-2) - - @given(n=st.integers(2, 10), D=st.integers(4, 16), - only_loss=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_softmax_with_loss(self, n, D, gc, only_loss, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - np.random.seed(2603) - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Initialize label - label = (np.random.rand(n) * D).astype(np.int32) - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent(X, label): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - label_xent = [-np.log(max(probs[i][label[i]], 1e-20)) - for i in range(n)] - avgloss = np.sum(label_xent) / float(n) - return (probs, avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label"], - ["probs", "avgloss"], - only_loss=only_loss, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label], - reference=label_softmax_crossent, - ) - - self.assertGradientChecks( - gc, op, [X, label], 0, [1], stepsize=1e-4, threshold=1e-2) - - @given( - n=st.integers(2, 5), - D=st.integers(4, 16), - only_loss=st.booleans(), - label_prob=st.booleans(), - **hu.gcs - ) - @settings(deadline=10000) - def test_softmax_with_loss_axis_2( - self, n, D, only_loss, label_prob, - gc, dc - ): - np.random.seed(2603) - X = np.random.rand(n, n, D).astype(np.float32) - X = X + 1e-2 - - if label_prob: - label = np.random.rand(n, n, D).astype(np.float32) - label /= label.sum(axis=2, keepdims=True) - else: - label = (np.random.rand(n, n) * D).astype(np.int32) - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent(X, label): - probs = np.zeros((n, n, D)) - rowmax = np.zeros((n, n)) - for i in range(n): - for j in range(n): - rowmax[i, j] = max(X[i, j, ]) - # We need to subtract the max to avoid numerical issues - probs[i, j] = X[i, j] - rowmax[i, j] - exps = np.exp(probs[i, j, ]) - norm = sum(exps) - probs[i, j, ] = exps / norm - label_xent = 0 - for i in range(n): - for j in range(n): - if label_prob: - for k in range(D): - label_xent += ( - -np.log(max(probs[i, j, k], 1e-20)) * - label[i, j, k] - ) - else: - label_xent += -np.log(max(probs[i, j, label[i, j]], 1e-20)) - - avgloss = label_xent / float(n * n) - return (probs, avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label"], - ["probs", "avgloss"], - only_loss=only_loss, - label_prob=label_prob, - axis=2, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label], - reference=label_softmax_crossent, - ) - - self.assertGradientChecks( - gc, op, [X, label], 0, [1], stepsize=1e-4, threshold=1e-2) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(**hu.gcs_gpu_only) - def 
test_softmax_with_loss_large(self, gc, dc): - np.random.seed(2603) - for n in [32]: - for D in [1000, 2000, 20000]: - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Initialize label - label = (np.random.rand(n) * D).astype(np.int32) - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent(X, label): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - label_xent = [-np.log(max(probs[i][label[i]], 1e-20)) - for i in range(n)] - avgloss = np.sum(label_xent) / float(n) - return (probs, avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label"], - ["probs", "avgloss"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label], - reference=label_softmax_crossent, - ) - - @given(n=st.integers(2, 10), D=st.integers(4, 16), **hu.gcs) - @settings(deadline=None) - def test_softmax_with_loss_label_prob(self, n, D, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - np.random.seed(2603) - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Initialize label - label = np.random.rand(D, n).astype(np.float32) - - # normalize labels to sum to 1 - label /= np.sum(label, axis=0) - label = label.transpose() - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent(X, label): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - label_xent = np.zeros(X.shape) - for i in range(n): - for j in range(D): - label_xent[i][j] = -np.log( - max(probs[i, j], 1e-20)) * label[i, j] - avgloss = np.sum(label_xent) / float(n) - return (probs, avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label"], - ["probs", "avgloss"], - label_prob=1 - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label], - reference=label_softmax_crossent, - ) - - self.assertGradientChecks( - gc, op, [X, label], 0, [1], stepsize=1e-4, threshold=1e-2) - - @given( - n=st.integers(2, 10), - D=st.integers(4, 16), - only_loss=st.booleans(), - **hu.gcs) - @settings(deadline=None) - def test_softmax_with_loss_weighted(self, n, D, only_loss, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - np.random.seed(2603) - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Initialize label - label = (np.random.rand(n) * D).astype(np.int32) - - # Init weights (weight by sample) - weights = np.random.rand(n).astype(np.float32) - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent_weighted(X, label, weights): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - label_xent = [-weights[i] * np.log(max(probs[i][label[i]], 1e-20)) - for i in range(n)] - avgloss = np.sum(label_xent) / sum(weights) - return (probs, 
avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label", "weights"], - ["probs", "avgloss"], - only_loss=only_loss, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label, weights], - reference=label_softmax_crossent_weighted, - ) - - self.assertGradientChecks( - gc, op, [X, label, weights], 0, [1], stepsize=1e-4, threshold=1e-2) - - @given(n=st.integers(2, 10), D=st.integers(4, 16), **hu.gcs) - @settings(deadline=None) - def test_softmax_with_loss_label_prob_weighted(self, n, D, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - # Initialize label - label = np.random.rand(D, n).astype(np.float32) - - # normalize labels to sum to 1 - label /= np.sum(label, axis=0) - label = label.transpose() - - # Init weights (weight by sample) - weights = np.random.rand(n).astype(np.float32) - - # Reference implementation of cross entropy with soft labels - def label_softmax_crossent_weighted(X, label, weights): - probs = np.zeros((n, D)) - rowmax = np.zeros(n) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - - label_xent = np.zeros(X.shape) - for i in range(n): - for j in range(D): - label_xent[i][j] = -np.log( - max(probs[i, j], 1e-20)) * label[i, j] * weights[i] - avgloss = np.sum(label_xent) / sum(weights) - return (probs, avgloss) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label", "weights"], - ["probs", "avgloss"], - label_prob=1, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, label, weights], - reference=label_softmax_crossent_weighted, - ) - - self.assertGradientChecks( - gc, op, [X, label, weights], 0, [1], stepsize=1e-4, threshold=1e-2) - - @given(n=st.integers(2, 5), D=st.integers(2, 4), - weighted=st.booleans(), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_spatial_softmax_with_loss(self, n, D, weighted, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - W = 18 - H = 12 - np.random.seed(2603) - X = np.random.rand(n, D, H, W).astype(np.float32) - X = X + 1e-2 - - weighted = True - weights = None - if weighted: - weights = np.random.rand(n, H, W).astype(np.float32) - - # Initialize label. 
Some of the labels are (-1), i.e "DONT CARE" - label = (np.random.rand(n, H, W) * (D + 1)).astype(np.int32) - 1 - - def label_softmax_crossent_spatial(X, label, weights=None): - probs = np.zeros((n, D, H, W)) - rowmax = np.zeros((n, H, W)) - label_xent = np.zeros((n, H, W)) - for i in range(n): - for x in range(W): - for y in range(H): - rowmax[i, y, x] = max(X[i, :, y, x]) - # We need to subtract the max to avoid numerical issues - probs[i, :, y, x] = X[i, :, y, x] - rowmax[i, y, x] - exps = np.exp(probs[i, :, y, x]) - probs[i, :, y, x] = exps / sum(exps) - - label_xent[:, y, x] = \ - [-np.log(max(probs[j, label[i, y, x], y, x], 1e-20)) - for j in range(n)] - - total_xent = 0.0 - total_weight = 0.0 - for y in range(H): - for x in range(W): - for i in range(n): - l = label[i, y, x] - if (l != (-1)): - w = 1.0 if weights is None else weights[i, y, x] - total_xent += \ - -np.log(max(probs[i, l, y, x], 1e-20)) * w - total_weight += w - print("Total weight {}".format(total_weight)) - - return (probs, total_xent / total_weight) - - op = core.CreateOperator( - "SpatialSoftmaxWithLoss", - ["X", "label"] + ([] if weights is None else ["weights"]), - ["probs", "avgloss"], - ) - - inputs = [X, label] + ([] if weights is None else [weights]) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=label_softmax_crossent_spatial, - ) - - self.assertGradientChecks( - gc, op, inputs, 0, [1], stepsize=1e-4, threshold=1e-2) - - @given(n=st.integers(4, 5), D=st.integers(3, 4), - weighted=st.booleans(), **hu.gcs) - def test_spatial_softmax_with_loss_allignore(self, n, D, weighted, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - W = 18 - H = 12 - np.random.seed(2603) - X = np.random.rand(n, D, H, W).astype(np.float32) - X = X + 1e-2 - - weighted = True - weights = None - if weighted: - weights = np.random.rand(n, H, W).astype(np.float32) - - # Initialize label. 
All labels as "DONT CARE" - label = np.zeros((n, H, W)).astype(np.int32) - 1 - print(label) - - def label_softmax_crossent_spatial(X, label, weights=None): - probs = np.zeros((n, D, H, W)) - rowmax = np.zeros((n, H, W)) - label_xent = np.zeros((n, H, W)) - for i in range(n): - for x in range(W): - for y in range(H): - rowmax[i, y, x] = max(X[i, :, y, x]) - # We need to subtract the max to avoid numerical issues - probs[i, :, y, x] = X[i, :, y, x] - rowmax[i, y, x] - exps = np.exp(probs[i, :, y, x]) - probs[i, :, y, x] = exps / sum(exps) - - label_xent[:, y, x] = \ - [-np.log(max(probs[j, label[i, y, x], y, x], 1e-20)) - for j in range(n)] - - return (probs, 0.0) - - op = core.CreateOperator( - "SpatialSoftmaxWithLoss", - ["X", "label"] + ([] if weights is None else ["weights"]), - ["probs", "avgloss"], - ) - - inputs = [X, label] + ([] if weights is None else [weights]) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=label_softmax_crossent_spatial, - ) - - @given(n=st.integers(4, 5), D=st.integers(3, 4), - weighted=st.booleans(), **hu.gcs) - def test_softmax_with_loss_zero_weight(self, n, D, weighted, gc, dc): - # n = number of examples, D = |labels| - # Initialize X and add 1e-2 for numerical stability - np.random.seed(2603) - X = np.random.rand(n, D).astype(np.float32) - X = X + 1e-2 - - weights = np.zeros(n).astype(np.float32) - - # Initialize label - label = (np.random.rand(n) * D).astype(np.int32) - - def label_softmax_crossent(X, label, weights=None): - probs = np.zeros((n, D)) - rowmax = np.zeros((n)) - for i in range(n): - rowmax[i] = max(X[i, ]) - # We need to subtract the max to avoid numerical issues - probs[i] = X[i] - rowmax[i] - exps = np.exp(probs[i, ]) - norm = sum(exps) - probs[i, ] = exps / norm - return (probs, 0.0) - - op = core.CreateOperator( - "SoftmaxWithLoss", - ["X", "label", "weights"], - ["probs", "avgloss"] - ) - - inputs = [X, label] + ([] if weights is None else [weights]) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=label_softmax_crossent, - ) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - def test_compare_cpugpu(self): - ''' - Additional test that checks CPU and GPU returns same values - with larger examples. This is mainly to test the more complex - GPU implementation is correct. - ''' - from caffe2.proto import caffe2_pb2 - - for _j in range(3): - gpuop = core.CreateOperator( - "SpatialSoftmaxWithLoss", - ["X_gpu", "label_gpu"], - ["probs_gpu", "avgloss_gpu"], - device_option=core.DeviceOption(workspace.GpuDeviceType, 0) - ) - - cpuop = core.CreateOperator( - "SpatialSoftmaxWithLoss", - ["X_cpu", "label_cpu"], - ["probs_cpu", "avgloss_cpu"], - device_option=core.DeviceOption(caffe2_pb2.CPU) - ) - - n = 8 - D = 4 - W = 64 + int(np.random.rand(1) * 1024) - H = 64 + int(np.random.rand(1) * 1024) - - print("W: {} H: {}".format(W, H)) - - X = np.random.rand(n, D, H, W).astype(np.float32) - X = X + 1e-2 - - # Initialize label. 
Some of the labels are (-1), i.e "DONT CARE" - label = (np.random.rand(n, H, W) * (D + 1)).astype(np.int32) - 1 - - gpu0 = core.DeviceOption(workspace.GpuDeviceType, 0) - workspace.FeedBlob("X_cpu", X) - workspace.FeedBlob("label_cpu", label) - workspace.FeedBlob("X_gpu", X, device_option=gpu0) - workspace.FeedBlob("label_gpu", label, device_option=gpu0) - - workspace.RunOperatorOnce(gpuop) - workspace.RunOperatorOnce(cpuop) - - probs_gpu = workspace.FetchBlob("probs_gpu") - probs_cpu = workspace.FetchBlob("probs_cpu") - loss_gpu = workspace.FetchBlob("avgloss_gpu") - loss_cpu = workspace.FetchBlob("avgloss_cpu") - - np.testing.assert_allclose(probs_gpu, probs_cpu, rtol=1e-4) - np.testing.assert_allclose(loss_gpu, loss_cpu, rtol=1e-1) - -if __name__ == "__main__": - import unittest - import random - random.seed(2603) - unittest.main() diff --git a/caffe2/python/operator_test/softplus_op_test.py b/caffe2/python/operator_test/softplus_op_test.py deleted file mode 100644 index f8ca1817176e..000000000000 --- a/caffe2/python/operator_test/softplus_op_test.py +++ /dev/null @@ -1,25 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu - -import unittest - - -class TestSoftplus(hu.HypothesisTestCase): - - @given(X=hu.tensor(), - **hu.gcs) - @settings(deadline=10000) - def test_softplus(self, X, gc, dc): - op = core.CreateOperator("Softplus", ["X"], ["Y"]) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py deleted file mode 100644 index 2ba21bb6d44f..000000000000 --- a/caffe2/python/operator_test/sparse_dropout_with_replacement_op_test.py +++ /dev/null @@ -1,68 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import numpy as np - - -class SparseDropoutWithReplacementTest(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) - def test_no_dropout(self, gc, dc): - X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64) - Lengths = np.array([2, 2, 2, 2, 2]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=0.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - Y = self.ws.blobs["Y"].fetch() - OutputLengths = self.ws.blobs["LY"].fetch() - self.assertListEqual(X.tolist(), Y.tolist(), - "Values should stay unchanged") - self.assertListEqual(Lengths.tolist(), OutputLengths.tolist(), - "Lengths should stay unchanged.") - - @given(**hu.gcs_cpu_only) - def test_all_dropout(self, gc, dc): - X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64) - Lengths = np.array([2, 2, 2, 2, 2]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=1.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - y = self.ws.blobs["Y"].fetch() - lengths = self.ws.blobs["LY"].fetch() - for elem in y: - self.assertEqual(elem, replacement_value, "Expected all \ - negative elements when dropout ratio is 
1.") - for length in lengths: - self.assertEqual(length, 1) - self.assertEqual(sum(lengths), len(y)) - - @given(**hu.gcs_cpu_only) - def test_all_dropout_empty_input(self, gc, dc): - X = np.array([]).astype(np.int64) - Lengths = np.array([0]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=1.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - y = self.ws.blobs["Y"].fetch() - lengths = self.ws.blobs["LY"].fetch() - self.assertEqual(len(y), 1, "Expected single dropout value") - self.assertEqual(len(lengths), 1, "Expected single element \ - in lengths array") - self.assertEqual(lengths[0], 1, "Expected 1 as sole length") - self.assertEqual(sum(lengths), len(y)) diff --git a/caffe2/python/operator_test/sparse_gradient_checker_test.py b/caffe2/python/operator_test/sparse_gradient_checker_test.py deleted file mode 100644 index f1f85b1f9bec..000000000000 --- a/caffe2/python/operator_test/sparse_gradient_checker_test.py +++ /dev/null @@ -1,47 +0,0 @@ - - - - - -import numpy as np -from scipy.sparse import coo_matrix - -from hypothesis import given, settings -import hypothesis.strategies as st - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - - -class TestSparseGradient(hu.HypothesisTestCase): - @given(M=st.integers(min_value=5, max_value=20), - N=st.integers(min_value=5, max_value=20), - K=st.integers(min_value=5, max_value=15), - sparsity=st.floats(min_value=0.1, max_value=1.0), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_gradient(self, M, N, K, sparsity, gc, dc): - X = np.random.randn(M, K).astype(np.float32) - X[X > sparsity] = 0 - X_coo = coo_matrix(X) - val, key, seg = X_coo.data, X_coo.col, X_coo.row - - val = val.astype(np.float32) - key = key.astype(np.int64) - seg = seg.astype(np.int32) - - Y = np.random.randn(K, N).astype(np.float32) - - op = core.CreateOperator( - 'SparseUnsortedSegmentWeightedSum', - ['Y', 'val', 'key', 'seg'], - ['out'], - num_segments=M) - - # Gradient check wrt Y - self.assertGradientChecks( - gc, op, [Y, val, key, seg], 0, [0]) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/sparse_itemwise_dropout_with_replacement_op_test.py b/caffe2/python/operator_test/sparse_itemwise_dropout_with_replacement_op_test.py deleted file mode 100644 index 6d837aa6d98e..000000000000 --- a/caffe2/python/operator_test/sparse_itemwise_dropout_with_replacement_op_test.py +++ /dev/null @@ -1,68 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import numpy as np - - -class SparseItemwiseDropoutWithReplacementTest(hu.HypothesisTestCase): - @given(**hu.gcs_cpu_only) - def test_no_dropout(self, gc, dc): - X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64) - Lengths = np.array([2, 2, 2, 2, 2]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseItemwiseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=0.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - Y = self.ws.blobs["Y"].fetch() - OutputLengths = self.ws.blobs["LY"].fetch() - self.assertListEqual(X.tolist(), Y.tolist(), - "Values should stay unchanged") - 
self.assertListEqual(Lengths.tolist(), OutputLengths.tolist(), - "Lengths should stay unchanged.") - - @given(**hu.gcs_cpu_only) - def test_all_dropout(self, gc, dc): - X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.int64) - Lengths = np.array([2, 2, 2, 2, 2]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseItemwiseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=1.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - y = self.ws.blobs["Y"].fetch() - lengths = self.ws.blobs["LY"].fetch() - for elem in y: - self.assertEqual(elem, replacement_value, "Expected all \ - negative elements when dropout ratio is 1.") - for length in lengths: - self.assertEqual(length, 2) - self.assertEqual(sum(lengths), len(y)) - - @given(**hu.gcs_cpu_only) - def test_all_dropout_empty_input(self, gc, dc): - X = np.array([]).astype(np.int64) - Lengths = np.array([0]).astype(np.int32) - replacement_value = -1 - self.ws.create_blob("X").feed(X) - self.ws.create_blob("Lengths").feed(Lengths) - sparse_dropout_op = core.CreateOperator( - "SparseItemwiseDropoutWithReplacement", ["X", "Lengths"], ["Y", "LY"], - ratio=1.0, replacement_value=replacement_value) - self.ws.run(sparse_dropout_op) - y = self.ws.blobs["Y"].fetch() - lengths = self.ws.blobs["LY"].fetch() - self.assertEqual(len(y), 0, "Expected no dropout value") - self.assertEqual(len(lengths), 1, "Expected single element \ - in lengths array") - self.assertEqual(lengths[0], 0, "Expected 0 as sole length") - self.assertEqual(sum(lengths), len(y)) diff --git a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py b/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py deleted file mode 100644 index fb958492cfa9..000000000000 --- a/caffe2/python/operator_test/sparse_lengths_sum_benchmark.py +++ /dev/null @@ -1,134 +0,0 @@ - - -import argparse -import datetime - -import numpy as np -from caffe2.python import core, workspace - - -DTYPES = { - "uint8": np.uint8, - "uint8_fused": np.uint8, - "float": np.float32, - "float16": np.float16, -} - - -def benchmark_sparse_lengths_sum( - dtype_str, - categorical_limit, - embedding_size, - average_len, - batch_size, - iterations, - flush_cache, -): - print("Preparing lookup table. " + str(datetime.datetime.now())) - - # We will use a constant, but non-trivial value so we save initialization - # time. - data = np.ones([categorical_limit, embedding_size], dtype=np.float32) - data *= 17.01 - - if dtype_str == "uint8": - scale_bias = np.random.rand(categorical_limit, 2).astype(np.float32) - workspace.FeedBlob("scale_bias", scale_bias.astype(np.float32)) - elif dtype_str == "uint8_fused": - scale_bias = np.random.randint(255, size=(categorical_limit, 8)) - data = np.concatenate([data, scale_bias], axis=1) - - print("Data has shape {} {}".format(data.shape, datetime.datetime.now())) - workspace.FeedBlob("X", data.astype(DTYPES[dtype_str])) - - # In order to produce truly random lengths and indices, we will embed a - # Python operator in the net to generate them. 
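One contract difference worth flagging before the index-generation helper below: unlike SparseDropoutWithReplacement, the itemwise variant tested above replaces individual ids rather than whole segments, so segment lengths pass through unchanged (its all-dropout test asserts every length is still 2, and the empty-input test expects length 0, not 1). A hedged NumPy sketch of that per-element contract (helper name is mine, not Caffe2 API):

import numpy as np

def sparse_itemwise_dropout_ref(values, lengths, ratio,
                                replacement_value, rng=None):
    # Each id is replaced independently with probability `ratio`;
    # `lengths` are passed through untouched.
    rng = np.random.default_rng() if rng is None else rng
    drop = rng.random(values.shape[0]) < ratio
    out = np.where(drop, replacement_value, values)
    return out.astype(values.dtype), lengths.copy()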
- def f(_, outputs): - lengths = np.random.randint( - int(np.round(average_len * 0.75)), - int(np.round(average_len * 1.25)) + 1, - batch_size, - ).astype(np.int32) - indices = np.random.randint(0, categorical_limit, np.sum(lengths)).astype( - np.int64 - ) - outputs[0].feed(indices) - outputs[1].feed(lengths) - - init_net = core.Net("init_net") - init_net.Python(f)([], ["indices", "lengths"]) - workspace.RunNetOnce(init_net) - - net = core.Net("mynet") - if flush_cache: - l3_cache_size = 30 * 2 ** 20 // 4 - workspace.FeedBlob( - "huge_blob", np.random.randn(l3_cache_size).astype(np.float32) - ) - net.Scale("huge_blob", "huge_blob_2x", value=2.0) - if dtype_str == "uint8": - net.SparseLengthsSum8BitsRowwise(["X", "indices", "lengths", "scale_bias"], "Y") - elif dtype_str == "uint8_fused": - net.SparseLengthsSumFused8BitRowwise(["X", "indices", "lengths"], "Y") - else: - net.SparseLengthsSum(["X", "indices", "lengths"], "Y") - workspace.CreateNet(net) - - # Set random seed, so that repeated runs will keep the same sequence of - # random indices. - np.random.seed(1701) - - print("Preparation finished. " + str(datetime.datetime.now())) - - runtimes = workspace.BenchmarkNet(net.Name(), 1, iterations, True) - print( - "{} billion sums per cycle".format( - embedding_size - * workspace.FetchBlob("indices").size - / runtimes[2 if flush_cache else 1] - / 1e6 - ) - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="minimal benchmark for sparse lengths sum." - ) - parser.add_argument( - "-d", - "--dtype", - choices=list(DTYPES.keys()), - default="float", - help="The data type for the input lookup table.", - ) - parser.add_argument( - "-e", "--embedding-size", type=int, default=6000000, help="Lookup table size." - ) - parser.add_argument( - "--embedding-dim", type=int, default=128, help="Embedding dimension." - ) - parser.add_argument( - "--average-len", - type=int, - default=27, - help="Sparse feature average lengths, default is 27", - ) - parser.add_argument("--batch-size", type=int, default=100, help="The batch size.") - parser.add_argument( - "-i", "--iteration", type=int, default=100000, help="The number of iterations." - ) - parser.add_argument( - "--flush-cache", action="store_true", help="If true, flush cache" - ) - args, extra_args = parser.parse_known_args() - core.GlobalInit(["python"] + extra_args) - benchmark_sparse_lengths_sum( - args.dtype, - args.embedding_size, - args.embedding_dim, - args.average_len, - args.batch_size, - args.iteration, - args.flush_cache, - ) diff --git a/caffe2/python/operator_test/sparse_lp_regularizer_test.py b/caffe2/python/operator_test/sparse_lp_regularizer_test.py deleted file mode 100644 index 7ea32bd69a29..000000000000 --- a/caffe2/python/operator_test/sparse_lp_regularizer_test.py +++ /dev/null @@ -1,76 +0,0 @@ - - - - - -import hypothesis -from hypothesis import given, settings, HealthCheck -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu - - -class TestSparseLpNorm(hu.HypothesisTestCase): - - @staticmethod - def ref_lpnorm(param_in, p, reg_lambda): - """Reference function that should be matched by the Caffe2 operator.""" - if p == 2.0: - return param_in * (1 - reg_lambda) - if p == 1.0: - reg_term = np.ones_like(param_in) * reg_lambda * np.sign(param_in) - param_out = param_in - reg_term - param_out[np.abs(param_in) <= reg_lambda] = 0. - return param_out - raise ValueError - - # Suppress filter_too_much health check. 
- # Likely caused by `assume` call falling through too often. - @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given(inputs=hu.tensors(n=1, min_dim=2, max_dim=2), - p=st.integers(min_value=1, max_value=2), - reg_lambda=st.floats(min_value=1e-4, max_value=1e-1), - data_strategy=st.data(), - **hu.gcs_cpu_only) - def test_sparse_lpnorm(self, inputs, p, reg_lambda, data_strategy, gc, dc): - - param, = inputs - param += 0.02 * np.sign(param) - param[param == 0.0] += 0.02 - - # Create an indexing array containing values that are lists of indices, - # which index into param - indices = data_strategy.draw( - hu.tensor(dtype=np.int64, min_dim=1, max_dim=1, - elements=st.sampled_from(np.arange(param.shape[0]))), - ) - hypothesis.note('indices.shape: %s' % str(indices.shape)) - - # For now, the indices must be unique - hypothesis.assume(np.array_equal(np.unique(indices.flatten()), - np.sort(indices.flatten()))) - - op = core.CreateOperator( - "SparseLpRegularizer", - ["param", "indices"], - ["param"], - p=float(p), - reg_lambda=reg_lambda, - ) - - def ref_sparse_lp_regularizer(param, indices, grad=None): - param_out = np.copy(param) - for _, index in enumerate(indices): - param_out[index] = self.ref_lpnorm( - param[index], - p=p, - reg_lambda=reg_lambda, - ) - return (param_out,) - - self.assertReferenceChecks( - gc, op, [param, indices], - ref_sparse_lp_regularizer - ) diff --git a/caffe2/python/operator_test/sparse_normalize_test.py b/caffe2/python/operator_test/sparse_normalize_test.py deleted file mode 100644 index 30beda3e464c..000000000000 --- a/caffe2/python/operator_test/sparse_normalize_test.py +++ /dev/null @@ -1,96 +0,0 @@ -import caffe2.python.hypothesis_test_util as hu -import hypothesis -import hypothesis.strategies as st -import numpy as np -from caffe2.python import core -from hypothesis import HealthCheck, given, settings - - -class TestSparseNormalize(hu.HypothesisTestCase): - @staticmethod - def ref_normalize(param_in, use_max_norm, norm): - param_norm = np.linalg.norm(param_in) + 1e-12 - if (use_max_norm and param_norm > norm) or not use_max_norm: - param_in = param_in * norm / param_norm - return param_in - - # Suppress filter_too_much health check. - # Likely caused by `assume` call falling through too often. 
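The ref_normalize staticmethod above is a standard max-norm projection: with use_max_norm a row is rescaled only when its L2 norm already exceeds the target `norm`; without it the row is always rescaled to exactly `norm`. A small standalone demonstration of that rule (an illustration mirroring the reference, not the SparseNormalize kernel; the function name here is mine):

import numpy as np

def project_row(row, norm, use_max_norm=True, eps=1e-12):
    # Rescale `row` to L2 norm `norm`; under max-norm, only shrink rows
    # whose norm exceeds the target (matches ref_normalize above).
    row_norm = np.linalg.norm(row) + eps
    if not use_max_norm or row_norm > norm:
        return row * norm / row_norm
    return row

row = np.array([3.0, 4.0], dtype=np.float32)   # L2 norm is 5
print(project_row(row, norm=1.0))              # shrunk to unit norm
print(project_row(row, norm=10.0))             # unchanged under max-norm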
- @settings(suppress_health_check=[HealthCheck.filter_too_much]) - @given( - inputs=hu.tensors(n=2, min_dim=2, max_dim=2), - use_max_norm=st.booleans(), - norm=st.floats(min_value=1.0, max_value=4.0), - data_strategy=st.data(), - use_fp16=st.booleans(), - **hu.gcs_cpu_only - ) - def test_sparse_normalize( - self, inputs, use_max_norm, norm, data_strategy, use_fp16, gc, dc - ): - param, grad = inputs - param += 0.02 * np.sign(param) - param[param == 0.0] += 0.02 - - if use_fp16: - param = param.astype(np.float16) - grad = grad.astype(np.float16) - - # Create an indexing array containing values that are lists of indices, - # which index into param - indices = data_strategy.draw( - hu.tensor( - dtype=np.int64, - min_dim=1, - max_dim=1, - elements=st.sampled_from(np.arange(param.shape[0])), - ) - ) - hypothesis.note("indices.shape: %s" % str(indices.shape)) - - # For now, the indices must be unique - hypothesis.assume( - np.array_equal(np.unique(indices.flatten()), np.sort(indices.flatten())) - ) - - op1 = core.CreateOperator( - "Float16SparseNormalize" if use_fp16 else "SparseNormalize", - ["param", "indices"], - ["param"], - use_max_norm=use_max_norm, - norm=norm, - ) - - # Sparsify grad - grad = grad[indices] - - op2 = core.CreateOperator( - "Float16SparseNormalize" if use_fp16 else "SparseNormalize", - ["param", "indices", "grad"], - ["param"], - use_max_norm=use_max_norm, - norm=norm, - ) - - def ref_sparse_normalize(param, indices, grad=None): - param_out = np.copy(param) - for _, index in enumerate(indices): - param_out[index] = self.ref_normalize(param[index], use_max_norm, norm) - return (param_out,) - - # self.assertDeviceChecks(dc, op, [param, indices], [0]) - self.assertReferenceChecks( - gc, - op1, - [param, indices], - ref_sparse_normalize, - threshold=1e-2 if use_fp16 else 1e-4, - ) - - self.assertReferenceChecks( - gc, - op2, - [param, indices, grad], - ref_sparse_normalize, - threshold=1e-2 if use_fp16 else 1e-4, - ) diff --git a/caffe2/python/operator_test/sparse_ops_test.py b/caffe2/python/operator_test/sparse_ops_test.py deleted file mode 100644 index 155c00e8dc44..000000000000 --- a/caffe2/python/operator_test/sparse_ops_test.py +++ /dev/null @@ -1,89 +0,0 @@ - - - - - -from caffe2.python import core -from caffe2.python.test_util import rand_array -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np - -class TestScatterOps(serial.SerializedTestCase): - # TODO(dzhulgakov): add test cases for failure scenarios - @given(num_args=st.integers(1, 5), - first_dim=st.integers(1, 20), - index_dim=st.integers(1, 10), - extra_dims=st.lists(st.integers(1, 4), min_size=0, max_size=3), - ind_type=st.sampled_from([np.int32, np.int64]), - data_type=st.sampled_from([np.float32, np.float64]), - **hu.gcs) - @settings(deadline=10000) - def testScatterWeightedSum( - self, num_args, first_dim, index_dim, extra_dims, ind_type, data_type, gc, dc): - ins = ['data', 'w0', 'indices'] - for i in range(1, num_args + 1): - ins.extend(['x' + str(i), 'w' + str(i)]) - op = core.CreateOperator( - 'ScatterWeightedSum', - ins, - ['data'], - device_option=gc) - def ref(d, w0, ind, *args): - r = d.copy() - for i in ind: - r[i] *= w0 - for i in range(0, len(args), 2): - x = args[i] - w = args[i+1] - for i, j in enumerate(ind): - r[j] += w * x[i] - return [r] - - d = rand_array(first_dim, *extra_dims) - ind = np.random.randint(0, first_dim, 
index_dim).astype(ind_type) - # ScatterWeightedSumOp only supports w0=1.0 in CUDAContext - # And it only support float32 data in CUDAContext - if(gc == hu.gpu_do or gc == hu.hip_do): - w0 = np.array(1.0).astype(np.float32) - data_type = np.float32 - else: - w0 = rand_array() - d = d.astype(data_type) - inputs = [d, w0, ind] - for _ in range(1, num_args + 1): - x = rand_array(index_dim, *extra_dims).astype(data_type) - w = rand_array() - inputs.extend([x,w]) - self.assertReferenceChecks(gc, op, inputs, ref, threshold=1e-3) - - @given(first_dim=st.integers(1, 20), - index_dim=st.integers(1, 10), - extra_dims=st.lists(st.integers(1, 4), min_size=0, max_size=3), - data_type=st.sampled_from([np.float16, np.float32, np.int32, np.int64]), - ind_type=st.sampled_from([np.int32, np.int64]), - **hu.gcs) - @settings(deadline=10000) - def testScatterAssign( - self, first_dim, index_dim, extra_dims, data_type, ind_type, gc, dc): - op = core.CreateOperator('ScatterAssign', - ['data', 'indices', 'slices'], ['data']) - def ref(d, ind, x): - r = d.copy() - r[ind] = x - return [r] - - # let's have indices unique - if first_dim < index_dim: - first_dim, index_dim = index_dim, first_dim - d = (rand_array(first_dim, *extra_dims) * 10).astype(data_type) - ind = np.random.choice(first_dim, index_dim, - replace=False).astype(ind_type) - x = (rand_array(index_dim, *extra_dims) * 10).astype(data_type) - self.assertReferenceChecks(gc, op, [d, ind, x], ref, threshold=1e-3, ensure_outputs_are_inferred=True) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py b/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py deleted file mode 100644 index 267babf2145f..000000000000 --- a/caffe2/python/operator_test/sparse_to_dense_mask_op_test.py +++ /dev/null @@ -1,112 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - - -class TestFcOperator(hu.HypothesisTestCase): - - @given(n=st.integers(1, 10), k=st.integers(1, 5), - use_length=st.booleans(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_to_dense_mask(self, n, k, use_length, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - N = sum(lengths) - indices = np.random.randint(5, size=N) - values = np.random.rand(N, 2).astype(np.float32) - default = np.random.rand(2).astype(np.float32) - mask = np.arange(3) - np.random.shuffle(mask) - - input_str = ['indices', 'values', 'default'] - input_data = [indices, values, default] - if use_length and n > 1: - input_str.append('lengths') - input_data.append(lengths) - output_str = ['output'] - - op = core.CreateOperator( - 'SparseToDenseMask', - input_str, - output_str, - mask=mask, - ) - - # Check over multiple devices - self.assertDeviceChecks( - dc, op, input_data, [0]) - # Gradient check for values - self.assertGradientChecks( - gc, op, input_data, 1, [0]) - - @given(n=st.integers(1, 10), k=st.integers(1, 5), - use_length=st.booleans(), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_to_dense_mask_with_int64(self, n, k, use_length, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - N = sum(lengths) - int64_mask = 10000000000 - indices = np.random.randint(5, size=N) + int64_mask - values = np.random.rand(N, 2).astype(np.float32) - default = np.random.rand(2).astype(np.float32) - mask = np.arange(3) + int64_mask - 
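Stepping back to testScatterWeightedSum above: its inline reference makes the update rule explicit, and it is worth seeing as a standalone function. Rows of `data` selected by `indices` are first scaled by w0, then each (x, w) pair accumulates w * x[k] into data[indices[k]]. The loops are kept deliberately so that duplicate indices compound, exactly as in the in-diff reference (this restates that reference; the name is mine):

import numpy as np

def scatter_weighted_sum_ref(data, w0, indices, *args):
    # args is a flat sequence x1, w1, x2, w2, ... as in the operator's inputs.
    out = data.copy()
    for i in indices:          # duplicates scale repeatedly, as in the ref
        out[i] *= w0
    for j in range(0, len(args), 2):
        x, w = args[j], args[j + 1]
        for k, i in enumerate(indices):
            out[i] += w * x[k]
    return out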
np.random.shuffle(mask) - - input_str = ['indices', 'values', 'default'] - input_data = [indices, values, default] - if use_length and n > 1: - input_str.append('lengths') - input_data.append(lengths) - output_str = ['output'] - - op = core.CreateOperator( - 'SparseToDenseMask', - input_str, - output_str, - mask=mask, - ) - - # Check over multiple devices - self.assertDeviceChecks( - dc, op, input_data, [0]) - # Gradient check for values - self.assertGradientChecks( - gc, op, input_data, 1, [0]) - - @given(n=st.integers(1, 10), k=st.integers(1, 5), - dim=st.integers(1, 3), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_to_dense_mask_high_dim(self, n, k, dim, gc, dc): - lengths = np.random.randint(k, size=n).astype(np.int32) + 1 - N = sum(lengths) - indices = np.random.randint(5, size=N) - shape = np.random.randint(5, size=dim).astype(np.int32) + 1 - values = np.random.rand(*((N,) + tuple(shape))).astype(np.float32) - default = np.random.rand(*shape).astype(np.float32) - mask = np.arange(3) - np.random.shuffle(mask) - - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output'], - mask=mask, - ) - - # Check over multiple devices - self.assertDeviceChecks( - dc, op, [indices, values, default, lengths], [0]) - # Gradient check for values - self.assertGradientChecks( - gc, op, [indices, values, default, lengths], 1, [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/spatial_bn_op_test.py b/caffe2/python/operator_test/spatial_bn_op_test.py deleted file mode 100644 index 88a0ebbd066c..000000000000 --- a/caffe2/python/operator_test/spatial_bn_op_test.py +++ /dev/null @@ -1,498 +0,0 @@ - - - - - -from caffe2.python import brew, core, utils, workspace -import caffe2.python.hip_test_util as hiputl -import caffe2.python.hypothesis_test_util as hu -from caffe2.python.model_helper import ModelHelper -import caffe2.python.serialized_test.serialized_test_util as serial - -from hypothesis import given, assume, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestSpatialBN(serial.SerializedTestCase): - - @serial.given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - inplace=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_spatialbn_test_mode_3d( - self, size, input_channels, batch_size, seed, order, epsilon, - inplace, engine, gc, dc): - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["X" if inplace else "Y"], - order=order, - is_test=True, - epsilon=epsilon, - engine=engine, - ) - - def reference_spatialbn_test(X, scale, bias, mean, var): - if order == "NCHW": - scale = scale[np.newaxis, :, - np.newaxis, np.newaxis, np.newaxis] - bias = bias[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis] - mean = mean[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis] - var = var[np.newaxis, :, np.newaxis, np.newaxis, np.newaxis] - - return ((X - mean) / np.sqrt(var + epsilon) * scale + bias,) - - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand(batch_size, 
input_channels, size, size, size)\ - .astype(np.float32) - 0.5 - - if order == "NHWC": - X = utils.NCHW2NHWC(X) - self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var], - reference_spatialbn_test) - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0]) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - inplace=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_spatialbn_test_mode_1d( - self, size, input_channels, batch_size, seed, order, epsilon, - inplace, engine, gc, dc): - # Currently MIOPEN SpatialBN only supports 2D - if hiputl.run_in_hip(gc, dc): - assume(engine != "CUDNN") - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["X" if inplace else "Y"], - order=order, - is_test=True, - epsilon=epsilon, - engine=engine, - ) - - def reference_spatialbn_test(X, scale, bias, mean, var): - if order == "NCHW": - scale = scale[np.newaxis, :, np.newaxis] - bias = bias[np.newaxis, :, np.newaxis] - mean = mean[np.newaxis, :, np.newaxis] - var = var[np.newaxis, :, np.newaxis] - return ((X - mean) / np.sqrt(var + epsilon) * scale + bias,) - - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size).astype(np.float32) - 0.5 - - if order == "NHWC": - X = X.swapaxes(1, 2) - self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var], - reference_spatialbn_test) - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - engine=st.sampled_from(["", "CUDNN"]), - inplace=st.booleans(), - **hu.gcs) - def test_spatialbn_test_mode( - self, size, input_channels, batch_size, seed, order, epsilon, - inplace, engine, gc, dc): - # Currently HIP SpatialBN only supports NCHW - if hiputl.run_in_hip(gc, dc): - assume(order == "NCHW") - - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["X" if inplace else "Y"], - order=order, - is_test=True, - epsilon=epsilon, - engine=engine - ) - - def reference_spatialbn_test(X, scale, bias, mean, var): - if order == "NCHW": - scale = scale[np.newaxis, :, np.newaxis, np.newaxis] - bias = bias[np.newaxis, :, np.newaxis, np.newaxis] - mean = mean[np.newaxis, :, np.newaxis, np.newaxis] - var = var[np.newaxis, :, np.newaxis, np.newaxis] - return ((X - mean) / np.sqrt(var + epsilon) * scale + bias,) - - np.random.seed(1701) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var], - 
reference_spatialbn_test) - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], [0]) - - @given(size=st.integers(1, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(1e-5, 1e-2), - momentum=st.floats(0.5, 0.9), - engine=st.sampled_from(["", "CUDNN"]), - inplace=st.sampled_from([True, False]), - **hu.gcs) - def test_spatialbn_train_mode( - self, size, input_channels, batch_size, seed, order, epsilon, - momentum, inplace, engine, gc, dc): - # Currently HIP SpatialBN only supports NCHW - if hiputl.run_in_hip(gc, dc): - assume(order == "NCHW") - - assume(batch_size == 0 or batch_size * size * size > 1) - - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "running_mean", "running_var"], - ["X" if inplace else "Y", - "running_mean", "running_var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - momentum=momentum, - engine=engine, - ) - np.random.seed(1701) - scale = np.random.randn(input_channels).astype(np.float32) - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.randn( - batch_size, input_channels, size, size).astype(np.float32) - - if order == "NHWC": - X = np.transpose(X, (0, 2, 3, 1)) - - def batch_norm_ref(X, scale, bias, running_mean, running_var): - if batch_size == 0: - Y = np.zeros(X.shape) - saved_mean = np.zeros(running_mean.shape) - saved_var = np.zeros(running_var.shape) - return (Y, running_mean, running_var, saved_mean, saved_var) - - if order == "NHWC": - X = np.transpose(X, (0, 3, 1, 2)) - - C = X.shape[1] - reduce_size = batch_size * size * size - saved_mean = np.mean(X, (0, 2, 3)) - saved_var = np.var(X, (0, 2, 3)) - if reduce_size == 1: - unbias_scale = float('inf') - else: - unbias_scale = reduce_size / (reduce_size - 1) - running_mean = momentum * running_mean + ( - 1.0 - momentum) * saved_mean - running_var = momentum * running_var + ( - 1.0 - momentum) * unbias_scale * saved_var - std = np.sqrt(saved_var + epsilon) - broadcast_shape = (1, C, 1, 1) - Y = (X - np.reshape(saved_mean, broadcast_shape)) / np.reshape( - std, broadcast_shape) * np.reshape( - scale, broadcast_shape) + np.reshape(bias, broadcast_shape) - - if order == "NHWC": - Y = np.transpose(Y, (0, 2, 3, 1)) - - return (Y, running_mean, running_var, saved_mean, 1.0 / std) - - - self.assertReferenceChecks(gc, op, [X, scale, bias, mean, var], - batch_norm_ref) - self.assertDeviceChecks(dc, op, [X, scale, bias, mean, var], - [0, 1, 2, 3, 4]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - momentum=st.floats(0.5, 0.9), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_spatialbn_train_mode_gradient_check( - self, size, input_channels, batch_size, seed, order, epsilon, - momentum, engine, gc, dc): - # Currently HIP SpatialBN only supports NCHW - if hiputl.run_in_hip(gc, dc): - assume(order == "NCHW") - - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["Y", "mean", "var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - momentum=momentum, - engine=engine - ) - np.random.seed(seed) - 
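The batch_norm_ref above packs the whole train-mode contract into one place: batch statistics reduce over everything except channels, the running variance update applies an unbiased correction of m/(m-1), and the op's saved_var output is actually the inverse standard deviation. A compact NumPy restatement of that reference for NCHW input (a sketch assuming a non-empty batch, not the CUDNN path; the function name is mine):

import numpy as np

def spatial_bn_train_ref(X, scale, bias, run_mean, run_var, momentum, eps):
    # X is NCHW; statistics reduce over (N, H, W).
    m = X.shape[0] * X.shape[2] * X.shape[3]
    saved_mean = X.mean(axis=(0, 2, 3))
    saved_var = X.var(axis=(0, 2, 3))                  # biased estimate
    unbias = m / (m - 1) if m > 1 else float("inf")
    new_mean = momentum * run_mean + (1.0 - momentum) * saved_mean
    new_var = momentum * run_var + (1.0 - momentum) * unbias * saved_var
    rstd = 1.0 / np.sqrt(saved_var + eps)
    bc = lambda v: v.reshape(1, -1, 1, 1)              # broadcast per channel
    Y = (X - bc(saved_mean)) * bc(rstd) * bc(scale) + bc(bias)
    return Y, new_mean, new_var, saved_mean, rstd      # rstd, not variance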
scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - 0.5 - if order == "NHWC": - X = X.swapaxes(1, 2).swapaxes(2, 3) - - for input_to_check in [0, 1, 2]: # dX, dScale, dBias - self.assertGradientChecks(gc, op, [X, scale, bias, mean, var], - input_to_check, [0]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - momentum=st.floats(min_value=0.5, max_value=0.9), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=10000) - def test_spatialbn_train_mode_gradient_check_1d( - self, size, input_channels, batch_size, seed, order, epsilon, - momentum, engine, gc, dc): - # Currently MIOPEN SpatialBN only supports 2D - if hiputl.run_in_hip(gc, dc): - assume(engine != "CUDNN") - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var"], - ["Y", "mean", "var", "saved_mean", "saved_var"], - order=order, - is_test=False, - epsilon=epsilon, - momentum=momentum, - engine=engine, - ) - np.random.seed(seed) - scale = np.random.rand(input_channels).astype(np.float32) + 0.5 - bias = np.random.rand(input_channels).astype(np.float32) - 0.5 - mean = np.random.randn(input_channels).astype(np.float32) - var = np.random.rand(input_channels).astype(np.float32) + 0.5 - X = np.random.rand( - batch_size, input_channels, size).astype(np.float32) - 0.5 - if order == "NHWC": - X = X.swapaxes(1, 2) - - for input_to_check in [0, 1, 2]: # dX, dScale, dBias - self.assertGradientChecks(gc, op, [X, scale, bias, mean, var], - input_to_check, [0], stepsize=0.01) - - @given(N=st.integers(0, 5), - C=st.integers(1, 10), - H=st.integers(1, 5), - W=st.integers(1, 5), - epsilon=st.floats(1e-5, 1e-2), - momentum=st.floats(0.5, 0.9), - order=st.sampled_from(["NCHW", "NHWC"]), - num_batches=st.integers(2, 5), - in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_spatial_bn_multi_batch( - self, N, C, H, W, epsilon, momentum, order, num_batches, in_place, - engine, gc, dc): - if in_place: - outputs = ["Y", "mean", "var", "batch_mean", "batch_var"] - else: - outputs = ["Y", "mean", "var", "saved_mean", "saved_var"] - op = core.CreateOperator( - "SpatialBN", - ["X", "scale", "bias", "mean", "var", "batch_mean", "batch_var"], - outputs, - order=order, - is_test=False, - epsilon=epsilon, - momentum=momentum, - num_batches=num_batches, - engine=engine, - ) - if order == "NCHW": - X = np.random.randn(N, C, H, W).astype(np.float32) - else: - X = np.random.randn(N, H, W, C).astype(np.float32) - scale = np.random.randn(C).astype(np.float32) - bias = np.random.randn(C).astype(np.float32) - mean = np.random.randn(C).astype(np.float32) - var = np.random.rand(C).astype(np.float32) - batch_mean = np.random.rand(C).astype(np.float32) - 0.5 - batch_var = np.random.rand(C).astype(np.float32) + 1.0 - inputs = [X, scale, bias, mean, var, batch_mean, batch_var] - - def spatial_bn_multi_batch_ref( - X, scale, bias, mean, var, batch_mean, batch_var): - if N == 0: - batch_mean = np.zeros(C).astype(np.float32) - batch_var = np.zeros(C).astype(np.float32) - else: - size = num_batches * N * H * W - batch_mean /= size - batch_var 
= batch_var / size - np.square(batch_mean) - mean = momentum * mean + (1.0 - momentum) * batch_mean - var = momentum * var + (1.0 - momentum) * ( - size / (size - 1)) * batch_var - batch_var = 1.0 / np.sqrt(batch_var + epsilon) - if order == "NCHW": - scale = np.reshape(scale, (C, 1, 1)) - bias = np.reshape(bias, (C, 1, 1)) - batch_mean = np.reshape(batch_mean, (C, 1, 1)) - batch_var = np.reshape(batch_var, (C, 1, 1)) - Y = (X - batch_mean) * batch_var * scale + bias - if order == "NCHW": - batch_mean = np.reshape(batch_mean, (C)) - batch_var = np.reshape(batch_var, (C)) - return (Y, mean, var, batch_mean, batch_var) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=spatial_bn_multi_batch_ref, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2, 3, 4]) - - @given(N=st.integers(0, 5), - C=st.integers(1, 10), - H=st.integers(1, 5), - W=st.integers(1, 5), - epsilon=st.floats(1e-5, 1e-2), - order=st.sampled_from(["NCHW", "NHWC"]), - num_batches=st.integers(2, 5), - in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - @settings(deadline=None) - def test_spatial_bn_multi_batch_grad( - self, N, C, H, W, epsilon, order, num_batches, in_place, engine, - gc, dc): - if in_place: - outputs = ["dX", "dscale_sum", "dbias_sum"] - else: - outputs = ["dX", "dscale", "dbias"] - op = core.CreateOperator( - "SpatialBNGradient", - ["X", "scale", "dY", "mean", "rstd", "dscale_sum", "dbias_sum"], - outputs, - order=order, - epsilon=epsilon, - num_batches=num_batches, - engine=engine, - ) - if order == "NCHW": - dY = np.random.randn(N, C, H, W).astype(np.float32) - X = np.random.randn(N, C, H, W).astype(np.float32) - else: - dY = np.random.randn(N, H, W, C).astype(np.float32) - X = np.random.randn(N, H, W, C).astype(np.float32) - scale = np.random.randn(C).astype(np.float32) - mean = np.random.randn(C).astype(np.float32) - rstd = np.random.rand(C).astype(np.float32) - dscale_sum = np.random.randn(C).astype(np.float32) - dbias_sum = np.random.randn(C).astype(np.float32) - inputs = [X, scale, dY, mean, rstd, dscale_sum, dbias_sum] - - def spatial_bn_multi_batch_grad_ref( - X, scale, dY, mean, rstd, dscale_sum, dbias_sum): - if N == 0: - dscale = np.zeros(C).astype(np.float32) - dbias = np.zeros(C).astype(np.float32) - alpha = np.zeros(C).astype(np.float32) - beta = np.zeros(C).astype(np.float32) - gamma = np.zeros(C).astype(np.float32) - else: - dscale = dscale_sum / num_batches - dbias = dbias_sum / num_batches - alpha = scale * rstd - beta = -alpha * dscale * rstd / (N * H * W) - gamma = alpha * (mean * dscale * rstd - dbias) / (N * H * W) - if order == "NCHW": - alpha = np.reshape(alpha, (C, 1, 1)) - beta = np.reshape(beta, (C, 1, 1)) - gamma = np.reshape(gamma, (C, 1, 1)) - dX = alpha * dY + beta * X + gamma - return (dX, dscale, dbias) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=spatial_bn_multi_batch_grad_ref, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - - @given(size=st.integers(7, 10), - input_channels=st.integers(1, 10), - batch_size=st.integers(0, 3), - seed=st.integers(0, 65535), - epsilon=st.floats(1e-5, 1e-2), - engine=st.sampled_from(["", "CUDNN"]), - **hu.gcs) - def test_spatialbn_brew_wrapper( - self, size, input_channels, batch_size, seed, epsilon, - engine, gc, dc): - np.random.seed(seed) - X = np.random.rand( - batch_size, input_channels, size, size).astype(np.float32) - - workspace.FeedBlob('X', X) - - model = ModelHelper(name='test_spatialbn_brew_wrapper') - - 
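A note on spatial_bn_multi_batch_grad_ref above: it relies on the fact that the batch-norm backward pass for X is affine in (dY, X), i.e. dX = alpha*dY + beta*X + gamma, with per-channel coefficients built from scale, rstd, and the accumulated dscale/dbias. Restating just those coefficients for NCHW (this mirrors the reference above, it is not a new derivation; the helper name is mine):

import numpy as np

def bn_backward_affine(X, dY, scale, mean, rstd, dscale, dbias):
    # Coefficients of dX = alpha*dY + beta*X + gamma, per channel (NCHW).
    n = X.shape[0] * X.shape[2] * X.shape[3]
    alpha = scale * rstd
    beta = -alpha * dscale * rstd / n
    gamma = alpha * (mean * dscale * rstd - dbias) / n
    bc = lambda v: v.reshape(1, -1, 1, 1)
    return bc(alpha) * dY + bc(beta) * X + bc(gamma)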
brew.spatial_bn( - model, - 'X', - 'Y', - input_channels, - epsilon=epsilon, - is_test=False, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/specialized_segment_ops_test.py b/caffe2/python/operator_test/specialized_segment_ops_test.py deleted file mode 100644 index 4f1842ac4664..000000000000 --- a/caffe2/python/operator_test/specialized_segment_ops_test.py +++ /dev/null @@ -1,331 +0,0 @@ - - -import unittest - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core -import caffe2.python.hip_test_util as hiputl -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -from hypothesis import given, assume, settings - - -class TestSpecializedSegmentOps(hu.HypothesisTestCase): - @given( - batchsize=st.integers(1, 20), - fptype=st.sampled_from([np.float16, np.float32]), - fp16asint=st.booleans(), - blocksize=st.sampled_from([8, 16, 32, 64, 85, 96, 128, 163]), - normalize_by_lengths=st.booleans(), - empty_indices=st.booleans(), - **hu.gcs - ) - def test_sparse_lengths_sum_cpu( - self, - batchsize, - fptype, - fp16asint, - blocksize, - normalize_by_lengths, - empty_indices, - gc, - dc, - ): - if fptype != np.float32: - assume(gc.device_type == caffe2_pb2.CPU) - assume(not hiputl.run_in_hip(gc, dc)) - assume(caffe2_pb2.CUDA not in {d.device_type for d in dc}) - - if normalize_by_lengths: - print("") - else: - print("") - - tblsize = 300 - if fptype == np.float32: - Tbl = np.random.rand(tblsize, blocksize).astype(np.float32) - atol = 1e-5 - else: - if fp16asint: - Tbl = ( - (10.0 * np.random.rand(tblsize, blocksize)) - .round() - .astype(np.float16) - ) - atol = 1e-3 - else: - Tbl = np.random.rand(tblsize, blocksize).astype(np.float16) - atol = 1e-1 - - # array of each row length - if empty_indices: - Lengths = np.zeros(batchsize, dtype=np.int32) - else: - Lengths = np.random.randint(1, 30, size=batchsize, dtype=np.int32) - # flat indices - Indices = np.random.randint(0, tblsize, size=sum(Lengths), dtype=np.int64) - - op = core.CreateOperator( - "SparseLengths" + ("Mean" if normalize_by_lengths else "Sum"), - ["Tbl", "Indices", "Lengths"], - "out", - ) - - def sparse_lengths_sum_ref(Tbl, Indices, Lengths): - rptr = np.cumsum(np.insert(Lengths, [0], [0])) - out = np.zeros((len(Lengths), blocksize)) - if normalize_by_lengths: - for i in range(0, len(rptr[0:-1])): - if Lengths[i] != 0: - out[i] = ( - Tbl[Indices[rptr[i] : rptr[i + 1]]].sum(axis=0) - * 1.0 - / float(Lengths[i]) - ) - else: - for i in range(0, len(rptr[0:-1])): - out[i] = Tbl[Indices[rptr[i] : rptr[i + 1]]].sum(axis=0) - - return [out.astype(np.float32)] - - self.assertReferenceChecks( - gc, - op, - [Tbl, Indices, Lengths], - sparse_lengths_sum_ref, - threshold=1e-3, - atol=atol, - ) - - @given( - batchsize=st.integers(1, 20), - fptype=st.sampled_from([np.float16, np.float32]), - fp16asint=st.booleans(), - blocksize=st.sampled_from([8, 16, 32, 64, 85, 96, 128, 163]), - empty_indices=st.booleans(), - **hu.gcs - ) - def test_sparse_lengths_weightedsum_cpu( - self, batchsize, fptype, fp16asint, blocksize, empty_indices, gc, dc - ): - if fptype != np.float32: - assume(gc.device_type == caffe2_pb2.CPU) - assume(not hiputl.run_in_hip(gc, dc)) - assume(caffe2_pb2.CUDA not in {d.device_type for d in dc}) - - print("") - - tblsize = 300 - if fptype == np.float32: - Tbl = np.random.rand(tblsize, blocksize).astype(np.float32) - atol = 1e-5 - else: - if 
fp16asint: - Tbl = ( - (10.0 * np.random.rand(tblsize, blocksize)) - .round() - .astype(np.float16) - ) - atol = 1e-3 - else: - Tbl = np.random.rand(tblsize, blocksize).astype(np.float16) - atol = 1e-1 - - # array of each row length - if empty_indices: - Lengths = np.zeros(batchsize, dtype=np.int32) - else: - Lengths = np.random.randint(1, 30, size=batchsize, dtype=np.int32) - # flat indices - Indices = np.random.randint(0, tblsize, size=sum(Lengths), dtype=np.int64) - Weights = np.random.rand(sum(Lengths)).astype(np.float32) - - op = core.CreateOperator( - "SparseLengthsWeightedSum", ["Tbl", "Weights", "Indices", "Lengths"], "out" - ) - - def sparse_lengths_weightedsum_ref(Tbl, Weights, Indices, Lengths): - rptr = np.cumsum(np.insert(Lengths, [0], [0])) - out = np.zeros((len(Lengths), blocksize)) - for i in range(0, len(rptr[0:-1])): - w = Weights[rptr[i] : rptr[i + 1]] - out[i] = (Tbl[Indices[rptr[i] : rptr[i + 1]]] * w[:, np.newaxis]).sum( - axis=0 - ) - return [out.astype(np.float32)] - - self.assertReferenceChecks( - gc, - op, - [Tbl, Weights, Indices, Lengths], - sparse_lengths_weightedsum_ref, - threshold=1e-3, - atol=atol, - ) - - @given( - batchsize=st.integers(1, 20), - blocksize=st.sampled_from([8, 16, 17, 26, 32, 64, 85, 96, 128, 148, 163]), - normalize_by_lengths=st.booleans(), - empty_indices=st.booleans(), - **hu.gcs_cpu_only - ) - def test_sparse_lengths_weightedsum_8BitsRowwiseOp_cpu( - self, batchsize, blocksize, normalize_by_lengths, empty_indices, gc, dc - ): - if normalize_by_lengths: - print( - "" - ) - else: - print( - "" - ) - - tblsize = 300 - Tbl = np.random.randint(7, size=(tblsize, blocksize), dtype=np.uint8) - atol = 1e-5 - - # array of each row length - if empty_indices: - Lengths = np.zeros(batchsize, dtype=np.int32) - else: - Lengths = np.random.randint(1, 30, size=batchsize, dtype=np.int32) - # flat indices - Indices = np.random.randint(0, tblsize, size=sum(Lengths), dtype=np.int64) - Weights = np.random.rand(sum(Lengths)).astype(np.float32) - Scale_Bias = np.random.rand(tblsize, 2).astype(np.float32) - - op = core.CreateOperator( - "SparseLengthsWeighted" - + ("Mean" if normalize_by_lengths else "Sum") - + "8BitsRowwise", - ["Tbl", "Weights", "Indices", "Lengths", "Scale_Bias"], - "out", - ) - - def sparse_lengths_weightedsum_8BitsRowwiseOp_cpu_ref( - Tbl, Weights, Indices, Lengths, Scale_Bias - ): - rptr = np.cumsum(np.insert(Lengths, [0], [0])) - out = np.zeros((len(Lengths), blocksize)) - for i in range(0, len(rptr[0:-1])): - w = Weights[rptr[i] : rptr[i + 1]] - s = Scale_Bias[Indices[rptr[i] : rptr[i + 1]], 0][:, np.newaxis] - b = Scale_Bias[Indices[rptr[i] : rptr[i + 1]], 1][:, np.newaxis] - f = 1.0 - if normalize_by_lengths and Lengths[i] != 0: - f = 1.0 / float(Lengths[i]) - out[i] = ( - w[:, np.newaxis] * (s * Tbl[Indices[rptr[i] : rptr[i + 1]]] + b) - ).sum(axis=0) * f - return [out.astype(np.float32)] - - self.assertReferenceChecks( - gc, - op, - [Tbl, Weights, Indices, Lengths, Scale_Bias], - sparse_lengths_weightedsum_8BitsRowwiseOp_cpu_ref, - threshold=1e-3, - atol=atol, - ) - - @given( - batchsize=st.integers(1, 20), - blocksize=st.sampled_from([8, 16, 17, 26, 32, 64, 85, 96, 128, 148, 163]), - normalize_by_lengths=st.booleans(), - empty_indices=st.booleans(), - **hu.gcs_cpu_only - ) - def test_sparse_lengths_sum_8BitsRowwiseOp_cpu( - self, batchsize, blocksize, normalize_by_lengths, empty_indices, gc, dc - ): - if normalize_by_lengths: - print("") - else: - print("") - - tblsize = 300 - Tbl = np.random.randint(7, size=(tblsize, blocksize), 
dtype=np.uint8) - atol = 1e-5 - - # array of each row length - if empty_indices: - Lengths = np.zeros(batchsize, dtype=np.int32) - else: - Lengths = np.random.randint(1, 30, size=batchsize, dtype=np.int32) - # flat indices - Indices = np.random.randint(0, tblsize, size=sum(Lengths), dtype=np.int64) - Scale_Bias = np.random.rand(tblsize, 2).astype(np.float32) - - op = core.CreateOperator( - "SparseLengths" - + ("Mean" if normalize_by_lengths else "Sum") - + "8BitsRowwise", - ["Tbl", "Indices", "Lengths", "Scale_Bias"], - "out", - ) - - def sparse_lengths_sum_8BitsRowwiseOp_cpu_reg( - Tbl, Indices, Lengths, Scale_Bias - ): - rptr = np.cumsum(np.insert(Lengths, [0], [0])) - out = np.zeros((len(Lengths), blocksize)) - for i in range(0, len(rptr[0:-1])): - s = Scale_Bias[Indices[rptr[i] : rptr[i + 1]], 0][:, np.newaxis] - b = Scale_Bias[Indices[rptr[i] : rptr[i + 1]], 1][:, np.newaxis] - f = 1.0 - if normalize_by_lengths and Lengths[i] != 0: - f = 1.0 / float(Lengths[i]) - out[i] = (s * Tbl[Indices[rptr[i] : rptr[i + 1]]] + b).sum(axis=0) * f - return [out.astype(np.float32)] - - self.assertReferenceChecks( - gc, - op, - [Tbl, Indices, Lengths, Scale_Bias], - sparse_lengths_sum_8BitsRowwiseOp_cpu_reg, - threshold=1e-3, - atol=atol, - ) - - @given( - batchsize=st.integers(1, 20), - blocksize=st.sampled_from([8, 16, 17, 26, 32, 64, 85, 96, 128, 148, 163]), - normalize_by_lengths=st.booleans(), - **hu.gcs_cpu_only - ) - @settings(deadline=10000) - def test_sparse_lengths_sum_8BitsRowwiseOp_cpu_invalid_index( - self, batchsize, blocksize, normalize_by_lengths, gc, dc - ): - - tblsize = 300 - Tbl = np.random.randint(7, size=(tblsize, blocksize), dtype=np.uint8) - - # array of each row length - Lengths = np.random.randint(1, 30, size=batchsize, dtype=np.int32) - # flat indices - Indices = np.random.randint(0, tblsize, size=sum(Lengths), dtype=np.int64) - Indices[0] += 1000 - Scale_Bias = np.random.rand(tblsize, 2).astype(np.float32) - - op = core.CreateOperator( - "SparseLengths" - + ("Mean" if normalize_by_lengths else "Sum") - + "8BitsRowwise", - ["Tbl", "Indices", "Lengths", "Scale_Bias"], - "out", - ) - - self.ws.create_blob("Tbl").feed(Tbl) - self.ws.create_blob("Indices").feed(Indices) - self.ws.create_blob("Lengths").feed(Lengths) - self.ws.create_blob("Scale_Bias").feed(Scale_Bias) - with self.assertRaises(RuntimeError): - self.ws.run(op) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/split_op_cost_test.py b/caffe2/python/operator_test/split_op_cost_test.py deleted file mode 100644 index 97df350d20d2..000000000000 --- a/caffe2/python/operator_test/split_op_cost_test.py +++ /dev/null @@ -1,246 +0,0 @@ -import numpy as np -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - - -class TestSplitOpCost(TestCase): - def _verify_cost(self, workspace, split_op): - flops, bytes_written, bytes_read = workspace.GetOperatorCost( - split_op, split_op.input - ) - self.assertEqual(flops, 0) - self.assertEqual( - bytes_read, - sum(workspace.FetchBlob(b).nbytes for b in split_op.input), - ) - self.assertEqual( - bytes_written, - sum(workspace.FetchBlob(b).nbytes for b in split_op.output), - ) - - def test_columnwise_equal_outputSplit(self): - workspace.ResetWorkspace() - workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)) - split_op = core.CreateOperator( - "Split", - ["input"], - ["output_1", "output_2", "output_3"], - ) - workspace.RunOperatorOnce(split_op) - - output_1 = 
diff --git a/caffe2/python/operator_test/split_op_cost_test.py b/caffe2/python/operator_test/split_op_cost_test.py
deleted file mode 100644
index 97df350d20d2..000000000000
--- a/caffe2/python/operator_test/split_op_cost_test.py
+++ /dev/null
@@ -1,246 +0,0 @@
-import numpy as np
-from caffe2.python import core, workspace
-from caffe2.python.test_util import TestCase
-
-
-class TestSplitOpCost(TestCase):
-    def _verify_cost(self, workspace, split_op):
-        flops, bytes_written, bytes_read = workspace.GetOperatorCost(
-            split_op, split_op.input
-        )
-        self.assertEqual(flops, 0)
-        self.assertEqual(
-            bytes_read,
-            sum(workspace.FetchBlob(b).nbytes for b in split_op.input),
-        )
-        self.assertEqual(
-            bytes_written,
-            sum(workspace.FetchBlob(b).nbytes for b in split_op.output),
-        )
-
-    def test_columnwise_equal_outputSplit(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2", "output_3"],
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (2, 1))
-        np.testing.assert_array_equal(output_1, [[1], [4]])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [[2], [5]])
-
-        output_3 = workspace.FetchBlob("output_3")
-        np.testing.assert_array_equal(output_3, [[3], [6]])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_rowwise_equal_outputSplit(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2"],
-            axis=0,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (1, 3))
-        np.testing.assert_array_equal(output_1, [[1, 2, 3]])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [[4, 5, 6]])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_columnwise_equal_outputSplit_columnRemoved(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        # To be able to use 'add_axis' (which should have been called 'remove_axis') on 'axis',
-        # the dimensions of split tensors must match on 'axis'
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2", "output_3"],
-            axis=1,
-            add_axis=1,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (2,))
-        np.testing.assert_array_equal(output_1, [1, 4])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [2, 5])
-
-        output_3 = workspace.FetchBlob("output_3")
-        np.testing.assert_array_equal(output_3, [3, 6])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_rowwise_equal_outputSplit_rowRemoved(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2"],
-            axis=0,
-            add_axis=1,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (3,))
-        np.testing.assert_array_equal(output_1, [1, 2, 3])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [4, 5, 6])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_rowwise_unequal_argSplit(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob(
-            "input", np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
-        )
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2"],
-            axis=0,
-            split=[1, 2],
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (1, 3))
-        np.testing.assert_array_equal(output_1, [[1, 2, 3]])
-
-        output_2 = workspace.FetchBlob("output_2")
-        self.assertTupleEqual(output_2.shape, (2, 3))
-        np.testing.assert_array_equal(output_2, [[4, 5, 6], [7, 8, 9]])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_rowwise_unequal_argSplit_rowRemoved(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob(
-            "input", np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
-        )
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2", "output_3"],
-            axis=0,
-            split=[1, 1, 1],
-            add_axis=1,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (3,))
-        np.testing.assert_array_equal(output_1, [1, 2, 3])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [4, 5, 6])
-
-        output_3 = workspace.FetchBlob("output_3")
-        np.testing.assert_array_equal(output_3, [7, 8, 9])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_rowwise_unequal_blobSplit(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob(
-            "input", np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=np.int32)
-        )
-        workspace.FeedBlob("split", np.array([1, 2], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input", "split"],
-            ["output_1", "output_2"],
-            axis=0,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (1, 3))
-        np.testing.assert_array_equal(output_1, [[1, 2, 3]])
-
-        output_2 = workspace.FetchBlob("output_2")
-        self.assertTupleEqual(output_2.shape, (2, 3))
-        np.testing.assert_array_equal(output_2, [[4, 5, 6], [7, 8, 9]])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_columnwise_unequal_argSplit(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2"],
-            axis=1,
-            split=[1, 2],
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (2, 1))
-        np.testing.assert_array_equal(output_1, [[1], [4]])
-
-        output_2 = workspace.FetchBlob("output_2")
-        self.assertTupleEqual(output_2.shape, (2, 2))
-        np.testing.assert_array_equal(output_2, [[2, 3], [5, 6]])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_columnWise_unequal_blobSplit_columnRemoved(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32))
-        workspace.FeedBlob("split", np.array([1, 1, 1], dtype=np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input", "split"],
-            ["output_1", "output_2", "output_3"],
-            axis=1,
-            add_axis=1,
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        output_1 = workspace.FetchBlob("output_1")
-        self.assertTupleEqual(output_1.shape, (2,))
-        np.testing.assert_array_equal(output_1, [1, 4])
-
-        output_2 = workspace.FetchBlob("output_2")
-        np.testing.assert_array_equal(output_2, [2, 5])
-
-        output_3 = workspace.FetchBlob("output_3")
-        np.testing.assert_array_equal(output_3, [3, 6])
-
-        self._verify_cost(workspace, split_op)
-
-    def test_equal_outputSplit_NHWC(self):
-        workspace.ResetWorkspace()
-        workspace.FeedBlob("input", np.random.rand(2, 5, 7, 9).astype(np.int32))
-        split_op = core.CreateOperator(
-            "Split",
-            ["input"],
-            ["output_1", "output_2", "output_3"],
-            order="NHWC",
-        )
-        workspace.RunOperatorOnce(split_op)
-
-        for b in split_op.output:
-            self.assertTupleEqual(workspace.FetchBlob(b).shape, (2, 5, 7, 3))
-
-        self._verify_cost(workspace, split_op)
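The `_verify_cost` helper above pins down the Split cost model: zero FLOPs, with bytes read equal to bytes written. A small sketch of the even-split-plus-`add_axis` semantics and that accounting, using plain NumPy as a stand-in (not the caffe2 API):

```python
import numpy as np

# Sketch of Split with axis=1, add_axis=1: an even split into k pieces along
# `axis`, with the now-size-1 axis squeezed away.
x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
pieces = [p.squeeze(axis=1) for p in np.split(x, 3, axis=1)]
assert [p.tolist() for p in pieces] == [[1, 4], [2, 5], [3, 6]]

# The deleted _verify_cost encodes that splitting is pure data movement:
# flops == 0 and bytes in == bytes out.
bytes_read = x.nbytes
bytes_written = sum(p.nbytes for p in pieces)
assert bytes_read == bytes_written == 6 * 4  # six int32 elements
```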
diff --git a/caffe2/python/operator_test/square_root_divide_op_test.py b/caffe2/python/operator_test/square_root_divide_op_test.py
deleted file mode 100644
index 51f328c95f5f..000000000000
--- a/caffe2/python/operator_test/square_root_divide_op_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-
-
-
-
-
-from caffe2.python import core
-from functools import partial
-from hypothesis import strategies as st
-
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import math
-import numpy as np
-
-
-def _data_and_scale(
-        data_min_size=4, data_max_size=10,
-        examples_min_number=1, examples_max_number=4,
-        dtype=np.float32, elements=None):
-    params_ = st.tuples(
-        st.integers(min_value=examples_min_number,
-                    max_value=examples_max_number),
-        st.integers(min_value=data_min_size,
-                    max_value=data_max_size),
-        st.sampled_from([np.float32, np.int32, np.int64])
-    )
-    return params_.flatmap(
-        lambda param_: st.tuples(
-            hu.arrays([param_[0], param_[1]], dtype=dtype),
-            hu.arrays(
-                [param_[0]], dtype=param_[2],
-                elements=(hu.floats(0.0, 10000.0) if param_[2] in [np.float32]
-                          else st.integers(0, 10000)),
-            ),
-        )
-    )
-
-
-def divide_by_square_root(data, scale):
-    output = np.copy(data)
-    num_examples = len(scale)
-
-    assert num_examples == data.shape[0]
-    assert len(data.shape) == 2
-
-    for i in range(0, num_examples):
-        if scale[i] > 0:
-            output[i] = np.multiply(data[i], 1 / math.sqrt(scale[i]))
-
-    return (output, )
-
-
-def grad(output_grad, ref_outputs, inputs):
-    return (divide_by_square_root(output_grad, inputs[1])[0],
-            None)
-
-
-class TestSquareRootDivide(serial.SerializedTestCase):
-    @serial.given(data_and_scale=_data_and_scale(),
-                  **hu.gcs_cpu_only)
-    def test_square_root_divide(self, data_and_scale, gc, dc):
-        self.assertReferenceChecks(
-            device_option=gc,
-            op=core.CreateOperator("SquareRootDivide",
-                                   ["data", "scale"],
-                                   ["output"]),
-            inputs=list(data_and_scale),
-            reference=partial(divide_by_square_root),
-            output_to_grad="output",
-            grad_reference=grad,
-        )
-
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
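The deleted SquareRootDivide reference is linear in `data`, which is why its `grad_reference` simply reuses the forward transform on the upstream gradient. A minimal sketch with hypothetical inputs (not taken from the test):

```python
import math
import numpy as np

# Minimal sketch of the SquareRootDivide reference: row i of `data` is scaled
# by 1/sqrt(scale[i]) whenever scale[i] > 0, otherwise left untouched.
def divide_by_square_root(data, scale):
    out = np.copy(data)
    for i in range(len(scale)):
        if scale[i] > 0:
            out[i] = data[i] / math.sqrt(scale[i])
    return out

data = np.ones((2, 3), dtype=np.float32)
scale = np.array([4.0, 0.0])
print(divide_by_square_root(data, scale))  # first row halved, second unchanged

# Because d/dx (x / sqrt(s)) = 1 / sqrt(s), the gradient w.r.t. `data` is the
# same transform applied to output_grad -- exactly what the deleted grad() did.
```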
diff --git a/caffe2/python/operator_test/stats_ops_test.py b/caffe2/python/operator_test/stats_ops_test.py
deleted file mode 100644
index 6114dfed3b10..000000000000
--- a/caffe2/python/operator_test/stats_ops_test.py
+++ /dev/null
@@ -1,51 +0,0 @@
-
-
-
-
-
-from caffe2.python import core, workspace
-from caffe2.python.test_util import TestCase
-import numpy as np
-
-
-class TestCounterOps(TestCase):
-
-    def test_stats_ops(self):
-        # The global StatRegistry isn't reset when the workspace is reset,
-        # so there may be existing stats from a previous test
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['prev_k', 'prev_v', 'prev_ts']))
-        previous_keys = workspace.FetchBlob('prev_k')
-        existing = len(previous_keys)
-
-        prefix = '/'.join([__name__, 'TestCounterOps', 'test_stats_ops'])
-        keys = [
-            (prefix + '/key1').encode('ascii'),
-            (prefix + '/key2').encode('ascii')
-        ]
-        values = [34, 45]
-        workspace.FeedBlob('k', np.array(keys, dtype=str))
-        workspace.FeedBlob('v', np.array(values, dtype=np.int64))
-        for _ in range(2):
-            workspace.RunOperatorOnce(core.CreateOperator(
-                'StatRegistryUpdate', ['k', 'v'], []))
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k2', 'v2', 't2']))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryCreate', [], ['reg']))
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryUpdate', ['k2', 'v2', 'reg'], []))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', ['reg'], ['k3', 'v3', 't3']))
-
-        k3 = workspace.FetchBlob('k3')
-        v3 = workspace.FetchBlob('v3')
-        t3 = workspace.FetchBlob('t3')
-
-        self.assertEqual(len(k3) - existing, 2)
-        self.assertEqual(len(v3), len(k3))
-        self.assertEqual(len(t3), len(k3))
-        for key in keys:
-            self.assertIn(key, k3)
diff --git a/caffe2/python/operator_test/stats_put_ops_test.py b/caffe2/python/operator_test/stats_put_ops_test.py
deleted file mode 100644
index d145b39da60a..000000000000
--- a/caffe2/python/operator_test/stats_put_ops_test.py
+++ /dev/null
@@ -1,195 +0,0 @@
-
-
-
-
-
-from caffe2.python import core, workspace
-from caffe2.python.test_util import TestCase
-import numpy as np
-
-
-class TestPutOps(TestCase):
-    def test_default_value(self):
-        magnitude_expand = int(1e12)
-        stat_name = "stat".encode('ascii')
-        sum_postfix = "/stat_value/sum".encode("ascii")
-        count_postfix = "/stat_value/count".encode("ascii")
-        default_value = 16.0
-
-        workspace.FeedBlob("value", np.array([], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "AveragePut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand,
-            bound=True,
-            default_value=default_value))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + sum_postfix, stat_dict)
-        self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + sum_postfix],
-                         default_value * magnitude_expand)
-        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
-
-    def test_clamp(self):
-        put_value = 10
-        magnitude_expand = int(1e18)
-        stat_name = "stat".encode('ascii')
-        sum_postfix = "/stat_value/sum".encode("ascii")
-        count_postfix = "/stat_value/count".encode("ascii")
-
-        workspace.FeedBlob("value", np.array([put_value], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "AveragePut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand,
-            bound=True))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + sum_postfix, stat_dict)
-        self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + sum_postfix],
-                         9223372036854775807)
-        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
-
-    def test_clamp_with_out_of_bounds(self):
-        put_value = float(1e20)
-        magnitude_expand = 1000000000000
-        stat_name = "stat".encode('ascii')
-        sum_postfix = "/stat_value/sum".encode("ascii")
-        count_postfix = "/stat_value/count".encode("ascii")
-
-        workspace.FeedBlob("value", np.array([put_value], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "AveragePut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand,
-            bound=True))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + sum_postfix, stat_dict)
-        self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + sum_postfix],
-                         9223372036854775807)
-        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
-
-    def test_avg_put_ops(self):
-        put_value = 15.1111
-        magnitude_expand = 10000
-        stat_name = "a1".encode('ascii')
-        sum_postfix = "/stat_value/sum".encode("ascii")
-        count_postfix = "/stat_value/count".encode("ascii")
-
-        workspace.FeedBlob("value", np.array([put_value], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "AveragePut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + sum_postfix, stat_dict)
-        self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + sum_postfix],
-                         put_value * magnitude_expand)
-        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
-
-    def test_increment_put_ops(self):
-        put_value = 15.1111
-        magnitude_expand = 10000
-        stat_name = "i1".encode('ascii')
-        member_postfix = "/stat_value".encode("ascii")
-
-        workspace.FeedBlob("value", np.array([put_value], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "IncrementPut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + member_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + member_postfix],
-                         put_value * magnitude_expand)
-
-    def test_stddev_put_ops(self):
-        put_value = 15.1111
-        magnitude_expand = 10000
-        stat_name = "s1".encode('ascii')
-        sum_postfix = "/stat_value/sum".encode("ascii")
-        count_postfix = "/stat_value/count".encode("ascii")
-        sumoffset_postfix = "/stat_value/sumoffset".encode("ascii")
-        sumsqoffset_postfix = "/stat_value/sumsqoffset".encode("ascii")
-
-        workspace.FeedBlob("value", np.array([put_value], dtype=np.float64))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            "StdDevPut",
-            "value",
-            [],
-            stat_name=stat_name,
-            magnitude_expand=magnitude_expand))
-
-        workspace.RunOperatorOnce(core.CreateOperator(
-            'StatRegistryExport', [], ['k', 'v', 't']))
-
-        k = workspace.FetchBlob('k')
-        v = workspace.FetchBlob('v')
-
-        stat_dict = dict(zip(k, v))
-
-        self.assertIn(stat_name + sum_postfix, stat_dict)
-        self.assertIn(stat_name + count_postfix, stat_dict)
-        self.assertIn(stat_name + sumoffset_postfix, stat_dict)
-        self.assertIn(stat_name + sumsqoffset_postfix, stat_dict)
-        self.assertEqual(stat_dict[stat_name + sum_postfix],
-                         put_value * magnitude_expand)
-        self.assertEqual(stat_dict[stat_name + count_postfix], 1)
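The `*Put` tests above all hinge on one piece of fixed-point arithmetic: a float sample is stored as `round(value * magnitude_expand)` in an int64 accumulator, and `bound=True` clamps at int64 max rather than overflowing. A sketch of just that bookkeeping (a simplification, not the operator's actual C++):

```python
import numpy as np

INT64_MAX = np.iinfo(np.int64).max  # 9223372036854775807, as asserted above

def put(value, magnitude_expand, bound=False):
    # Expand the float sample into fixed-point; optionally clamp into int64 range.
    expanded = value * magnitude_expand
    if bound:
        expanded = min(expanded, INT64_MAX)
    return int(round(expanded))

assert put(15.1111, 10000) == 151111
assert put(10, int(1e18), bound=True) == INT64_MAX   # 1e19 > int64 max, clamped
assert put(1e20, int(1e12), bound=True) == INT64_MAX
```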
diff --git a/caffe2/python/operator_test/storm_test.py b/caffe2/python/operator_test/storm_test.py
deleted file mode 100644
index c97f631d2160..000000000000
--- a/caffe2/python/operator_test/storm_test.py
+++ /dev/null
@@ -1,157 +0,0 @@
-
-
-
-
-
-import functools
-
-from hypothesis import given, settings, HealthCheck
-import hypothesis.strategies as st
-import numpy as np
-
-from caffe2.python import core
-import caffe2.python.hypothesis_test_util as hu
-
-
-class TestStorm(hu.HypothesisTestCase):
-    @given(inputs=hu.tensors(n=3),
-           grad_sq_sum=st.floats(min_value=0.01, max_value=0.99,
-                                 allow_nan=False, allow_infinity=False),
-           lr=st.floats(min_value=0.01, max_value=1.0,
-                        allow_nan=False, allow_infinity=False),
-           momentum=st.floats(min_value=0.1, max_value=100.0,
-                              allow_nan=False, allow_infinity=False),
-           beta=st.floats(min_value=0.1, max_value=10.0,
-                          allow_nan=False, allow_infinity=False),
-           **hu.gcs_cpu_only)
-    def test_storm_dense(self, inputs, grad_sq_sum, lr, momentum, beta, gc, dc):
-        param, moment, grad = inputs
-        grad_sq_sum = np.array([grad_sq_sum], dtype=np.float32)
-        lr = np.array([lr], dtype=np.float32)
-
-        op = core.CreateOperator(
-            "Storm",
-            ["param", "moment", "grad_sq_sum", "grad", "lr"],
-            ["param", "moment", "grad_sq_sum"],
-            momentum=momentum,
-            beta=beta,
-            device_option=gc
-        )
-
-        def ref_dense(param, moment, grad_sq_sum, grad, lr, momentum, beta):
-            grad_sq_sum_out = grad_sq_sum + np.sum(grad * grad)
-            nlr = lr * np.power(beta + grad_sq_sum_out, -1.0 / 3.0)
-            alpha = momentum * np.square(nlr)
-            moment_out = grad + (1 - alpha) * (moment - grad)
-            param_out = param + nlr * moment_out
-
-            return (param_out.astype(np.float32), moment_out.astype(np.float32),
-                    grad_sq_sum_out.astype(np.float32))
-
-        self.assertReferenceChecks(
-            gc, op,
-            [param, moment, grad_sq_sum, grad, lr],
-            functools.partial(ref_dense, momentum=momentum, beta=beta)
-        )
-
-    # Suppress filter_too_much health check.
-    # Likely caused by `assume` call falling through too often.
-    @settings(suppress_health_check=[HealthCheck.filter_too_much])
-    @given(inputs=hu.tensors(n=3),
-           grad_sq_sum=st.floats(min_value=0.01, max_value=0.99,
-                                 allow_nan=False, allow_infinity=False),
-           lr=st.floats(min_value=0.01, max_value=1.0,
-                        allow_nan=False, allow_infinity=False),
-           momentum=st.floats(min_value=0.1, max_value=100.0,
-                              allow_nan=False, allow_infinity=False),
-           beta=st.floats(min_value=0.1, max_value=10.0,
-                          allow_nan=False, allow_infinity=False),
-           **hu.gcs_cpu_only)
-    def test_storm_sparse(self, inputs, grad_sq_sum, lr,
-                          momentum, beta, gc, dc):
-        param, moment, grad = inputs
-        grad_sq_sum = np.array([grad_sq_sum], dtype=np.float32)
-        lr = np.array([lr], dtype=np.float32)
-
-        # Create an indexing array containing values that are lists of indices,
-        # which index into grad
-        indices = np.random.choice(np.arange(grad.shape[0]),
-                                   size=np.random.randint(grad.shape[0]),
-                                   replace=False)
-
-        # Sparsify grad
-        grad = grad[indices]
-
-        op = core.CreateOperator(
-            "SparseStorm",
-            ["param", "moment", "grad_sq_sum", "grad", "indices", "lr"],
-            ["param", "moment", "grad_sq_sum"],
-            momentum=momentum,
-            beta=beta,
-            device_option=gc)
-
-        def ref_sparse(param, moment, grad_sq_sum, grad, indices,
-                       lr, momentum, beta):
-            param_out = np.copy(param)
-            moment_out = np.copy(moment)
-            grad_sq_sum_out = np.copy(grad_sq_sum)
-
-            grad_sq_sum_out = grad_sq_sum + np.sum(grad * grad)
-            nlr = lr * np.power(beta + grad_sq_sum_out, -1.0 / 3.0)
-            alpha = momentum * np.square(nlr)
-            for i, index in enumerate(indices):
-                gi = grad[i]
-                moment_out[index] = gi + (1 - alpha) * (moment[index] - gi)
-                param_out[index] = param[index] + nlr * moment_out[index]
-
-            return (param_out.astype(np.float32), moment_out.astype(np.float32),
-                    grad_sq_sum_out.astype(np.float32))
-
-        self.assertReferenceChecks(
-            gc, op,
-            [param, moment, grad_sq_sum, grad, indices, lr],
-            functools.partial(ref_sparse, momentum=momentum, beta=beta)
-        )
-
-    @given(inputs=hu.tensors(n=2),
-           grad_sq_sum=st.floats(min_value=0.01, max_value=0.99,
-                                 allow_nan=False, allow_infinity=False),
-           lr=st.floats(min_value=0.01, max_value=1.0,
-                        allow_nan=False, allow_infinity=False),
-           momentum=st.floats(min_value=0.1, max_value=100.0,
-                              allow_nan=False, allow_infinity=False),
-           beta=st.floats(min_value=0.1, max_value=10.0,
-                          allow_nan=False, allow_infinity=False),
-           data_strategy=st.data(),
-           **hu.gcs_cpu_only)
-    def test_storm_sparse_empty(self, inputs, grad_sq_sum, lr, momentum,
-                                beta, data_strategy, gc, dc):
-        param, moment = inputs
-        grad_sq_sum = np.array([grad_sq_sum], dtype=np.float32)
-        lr = np.array([lr], dtype=np.float32)
-
-        grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32)
-        indices = np.empty(shape=(0,), dtype=np.int64)
-
-        op = core.CreateOperator(
-            "SparseStorm",
-            ["param", "moment", "grad_sq_sum", "grad", "indices", "lr"],
-            ["param", "moment", "grad_sq_sum"],
-            momentum=momentum,
-            beta=beta,
-            device_option=gc)
-
-        def ref_sparse_empty(param, moment, grad_sq_sum, grad, indices,
-                             lr, momentum, beta):
-            param_out = np.copy(param)
-            moment_out = np.copy(moment)
-            grad_sq_sum_out = np.copy(grad_sq_sum)
-
-            return (param_out.astype(np.float32), moment_out.astype(np.float32),
-                    grad_sq_sum_out.astype(np.float32))
-
-        self.assertReferenceChecks(
-            gc, op,
-            [param, moment, grad_sq_sum, grad, indices, lr],
-            functools.partial(ref_sparse_empty, momentum=momentum, beta=beta)
-        )
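For reference, the update rule the deleted `ref_dense`/`ref_sparse` functions encode is STORM's recursive momentum with a `(beta + S)^(-1/3)` adaptive step. A standalone sketch of one dense step, mirroring the reference's sign convention (illustrative hyperparameters):

```python
import numpy as np

# One STORM step, as in the deleted reference:
#   S     <- S + sum(g * g)             (running sum of squared gradients)
#   nlr   <- lr * (beta + S) ** (-1/3)  (adaptive step size)
#   alpha <- momentum * nlr ** 2
#   m     <- g + (1 - alpha) * (m - g)  (recursive momentum)
#   theta <- theta + nlr * m
def storm_step(theta, m, S, g, lr=0.1, momentum=10.0, beta=1.0):
    S = S + np.sum(g * g)
    nlr = lr * (beta + S) ** (-1.0 / 3.0)
    alpha = momentum * nlr ** 2
    m = g + (1 - alpha) * (m - g)
    return theta + nlr * m, m, S

theta, m, S = np.zeros(3), np.zeros(3), 0.0
for _ in range(5):
    g = np.random.randn(3)  # stand-in for a stochastic gradient
    theta, m, S = storm_step(theta, m, S, g)
```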
diff --git a/caffe2/python/operator_test/string_ops_test.py b/caffe2/python/operator_test/string_ops_test.py
deleted file mode 100644
index c98c8c5b6c01..000000000000
--- a/caffe2/python/operator_test/string_ops_test.py
+++ /dev/null
@@ -1,151 +0,0 @@
-
-
-
-
-
-from caffe2.python import core
-from hypothesis import given, settings
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import hypothesis.strategies as st
-import numpy as np
-
-
-def _string_lists(alphabet=None):
-    return st.lists(
-        elements=st.text(alphabet=alphabet) if alphabet else st.text(),
-        min_size=0,
-        max_size=3)
-
-
-class TestStringOps(serial.SerializedTestCase):
-    @given(strings=_string_lists())
-    @settings(deadline=10000)
-    def test_string_prefix(self, strings):
-        length = 3
-        # although we are utf-8 encoding below to avoid python exceptions,
-        # StringPrefix op deals with byte-length prefixes, which may produce
-        # an invalid utf-8 string. The goal here is just to avoid python
-        # complaining about the unicode -> str conversion.
-        strings = np.array(
-            [a.encode('utf-8') for a in strings], dtype=object
-        )
-
-        def string_prefix_ref(strings):
-            return (
-                np.array([a[:length] for a in strings], dtype=object),
-            )
-
-        op = core.CreateOperator(
-            'StringPrefix',
-            ['strings'],
-            ['stripped'],
-            length=length)
-        self.assertReferenceChecks(
-            hu.cpu_do,
-            op,
-            [strings],
-            string_prefix_ref)
-
-    @given(strings=_string_lists())
-    @settings(deadline=10000)
-    def test_string_suffix(self, strings):
-        length = 3
-        strings = np.array(
-            [a.encode('utf-8') for a in strings], dtype=object
-        )
-
-        def string_suffix_ref(strings):
-            return (
-                np.array([a[-length:] for a in strings], dtype=object),
-            )
-
-        op = core.CreateOperator(
-            'StringSuffix',
-            ['strings'],
-            ['stripped'],
-            length=length)
-        self.assertReferenceChecks(
-            hu.cpu_do,
-            op,
-            [strings],
-            string_suffix_ref)
-
-    @given(strings=st.text(alphabet=['a', 'b']))
-    @settings(deadline=10000)
-    def test_string_starts_with(self, strings):
-        prefix = 'a'
-        strings = np.array(
-            [str(a) for a in strings], dtype=object
-        )
-
-        def string_starts_with_ref(strings):
-            return (
-                np.array([a.startswith(prefix) for a in strings], dtype=bool),
-            )
-
-        op = core.CreateOperator(
-            'StringStartsWith',
-            ['strings'],
-            ['bools'],
-            prefix=prefix)
-        self.assertReferenceChecks(
-            hu.cpu_do,
-            op,
-            [strings],
-            string_starts_with_ref)
-
-    @given(strings=st.text(alphabet=['a', 'b']))
-    @settings(deadline=10000)
-    def test_string_ends_with(self, strings):
-        suffix = 'a'
-        strings = np.array(
-            [str(a) for a in strings], dtype=object
-        )
-
-        def string_ends_with_ref(strings):
-            return (
-                np.array([a.endswith(suffix) for a in strings], dtype=bool),
-            )
-
-        op = core.CreateOperator(
-            'StringEndsWith',
-            ['strings'],
-            ['bools'],
-            suffix=suffix)
-        self.assertReferenceChecks(
-            hu.cpu_do,
-            op,
-            [strings],
-            string_ends_with_ref)
-
-    @given(strings=st.text(alphabet=['a', 'b']))
-    @settings(deadline=10000)
-    def test_string_equals(self, strings):
-        text = ""
-        if strings:
-            text = strings[0]
-
-        strings = np.array(
-            [str(a) for a in strings], dtype=object
-        )
-
-        def string_equals_ref(strings):
-            return (
-                np.array([a == text for a in strings], dtype=bool),
-            )
-
-        op = core.CreateOperator(
-            'StringEquals',
-            ['strings'],
-            ['bools'],
-            text=text)
-        self.assertReferenceChecks(
-            hu.cpu_do,
-            op,
-            [strings],
-            string_equals_ref)
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/text_file_reader_test.py b/caffe2/python/operator_test/text_file_reader_test.py
deleted file mode 100644
index 8889ddb9f53c..000000000000
--- a/caffe2/python/operator_test/text_file_reader_test.py
+++ /dev/null
@@ -1,67 +0,0 @@
-
-
-
-
-from caffe2.python import core, workspace
-from caffe2.python.text_file_reader import TextFileReader
-from caffe2.python.test_util import TestCase
-from caffe2.python.schema import Struct, Scalar, FetchRecord
-import tempfile
-import numpy as np
-
-
-class TestTextFileReader(TestCase):
-    def test_text_file_reader(self):
-        schema = Struct(
-            ('field1', Scalar(dtype=str)),
-            ('field2', Scalar(dtype=str)),
-            ('field3', Scalar(dtype=np.float32)))
-        num_fields = 3
-        col_data = [
-            ['l1f1', 'l2f1', 'l3f1', 'l4f1'],
-            ['l1f2', 'l2f2', 'l3f2', 'l4f2'],
-            [0.456, 0.789, 0.10101, -24342.64],
-        ]
-        row_data = list(zip(*col_data))
-        with tempfile.NamedTemporaryFile(mode='w+', delete=False) as txt_file:
-            txt_file.write(
-                '\n'.join(
-                    '\t'.join(str(x) for x in f)
-                    for f in row_data
-                ) + '\n'
-            )
-            txt_file.flush()
-
-            for num_passes in range(1, 3):
-                for batch_size in range(1, len(row_data) + 2):
-                    init_net = core.Net('init_net')
-                    reader = TextFileReader(
-                        init_net,
-                        filename=txt_file.name,
-                        schema=schema,
-                        batch_size=batch_size,
-                        num_passes=num_passes)
-                    workspace.RunNetOnce(init_net)
-
-                    net = core.Net('read_net')
-                    should_stop, record = reader.read_record(net)
-
-                    results = [np.array([])] * num_fields
-                    while True:
-                        workspace.RunNetOnce(net)
-                        arrays = FetchRecord(record).field_blobs()
-                        for i in range(num_fields):
-                            results[i] = np.append(results[i], arrays[i])
-                        if workspace.FetchBlob(should_stop):
-                            break
-                    for i in range(num_fields):
-                        col_batch = np.tile(col_data[i], num_passes)
-                        if col_batch.dtype in (np.float32, np.float64):
-                            np.testing.assert_array_almost_equal(
-                                col_batch, results[i], decimal=3)
-                        else:
-                            np.testing.assert_array_equal(col_batch, results[i])
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/caffe2/python/operator_test/thresholded_relu_op_test.py b/caffe2/python/operator_test/thresholded_relu_op_test.py
deleted file mode 100644
index 0cd5c0f77895..000000000000
--- a/caffe2/python/operator_test/thresholded_relu_op_test.py
+++ /dev/null
@@ -1,73 +0,0 @@
-
-
-
-
-
-from caffe2.python import core
-from hypothesis import given, settings
-import hypothesis.strategies as st
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-import numpy as np
-
-import unittest
-
-
-class TestThresholdedRelu(serial.SerializedTestCase):
-
-    # test case 1 - default alpha - we do reference and dc checks.
-    # test case 2 does dc and reference checks over range of alphas.
-    # test case 3 does gc over range of alphas.
-    @serial.given(input=hu.tensor(),
-                  engine=st.sampled_from(["", "CUDNN"]),
-                  **hu.gcs)
-    def test_thresholded_relu_1(self, input, gc, dc, engine):
-        X = input
-        op = core.CreateOperator("ThresholdedRelu", ["X"], ["Y"],
-                                 engine=engine)
-
-        def defaultRef(X):
-            Y = np.copy(X)
-            Y[Y <= 1.0] = 0.0
-            return (Y,)
-
-        self.assertDeviceChecks(dc, op, [X], [0])
-        self.assertReferenceChecks(gc, op, [X], defaultRef)
-
-    @given(input=hu.tensor(),
-           alpha=st.floats(min_value=1.0, max_value=5.0),
-           engine=st.sampled_from(["", "CUDNN"]),
-           **hu.gcs)
-    def test_thresholded_relu_2(self, input, alpha, gc, dc, engine):
-        X = input
-        op = core.CreateOperator("ThresholdedRelu", ["X"], ["Y"],
-                                 alpha=alpha, engine=engine)
-
-        def ref(X):
-            Y = np.copy(X)
-            Y[Y <= alpha] = 0.0
-            return (Y,)
-
-        self.assertDeviceChecks(dc, op, [X], [0])
-        self.assertReferenceChecks(gc, op, [X], ref)
-
-    @given(input=hu.tensor(),
-           alpha=st.floats(min_value=1.1, max_value=5.0),
-           engine=st.sampled_from(["", "CUDNN"]),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_thresholded_relu_3(self, input, alpha, gc, dc, engine):
-        X = TestThresholdedRelu.fix_input(input)
-        op = core.CreateOperator("ThresholdedRelu", ["X"], ["Y"],
-                                 alpha=float(alpha), engine=engine)
-        self.assertGradientChecks(gc, op, [X], 0, [0])
-
-    @staticmethod
-    def fix_input(input):
-        # go away from alpha to avoid derivative discontinuities
-        input += 0.02 * np.sign(input)
-        return input
-
-
-if __name__ == "__main__":
-    unittest.main()
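ThresholdedRelu, as the deleted references spell out, passes a value through only when it exceeds `alpha` (default 1.0); `fix_input` nudges samples away from the kink because numerical gradient checks are unreliable at the discontinuity. A one-function sketch:

```python
import numpy as np

# Minimal reference for ThresholdedRelu: y = x if x > alpha else 0.
def thresholded_relu(x, alpha=1.0):
    y = np.copy(x)
    y[y <= alpha] = 0.0
    return y

x = np.array([-1.0, 0.5, 1.0, 1.5], dtype=np.float32)
print(thresholded_relu(x))  # [0. 0. 0. 1.5] -- note x == alpha maps to 0
```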
diff --git a/caffe2/python/operator_test/tile_op_test.py b/caffe2/python/operator_test/tile_op_test.py
deleted file mode 100644
index fbb424fe058c..000000000000
--- a/caffe2/python/operator_test/tile_op_test.py
+++ /dev/null
@@ -1,117 +0,0 @@
-
-
-
-
-
-import numpy as np
-
-from hypothesis import given, settings
-import hypothesis.strategies as st
-import unittest
-
-from caffe2.python import core, workspace
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-
-class TestTile(serial.SerializedTestCase):
-    @given(M=st.integers(min_value=1, max_value=10),
-           K=st.integers(min_value=1, max_value=10),
-           N=st.integers(min_value=1, max_value=10),
-           tiles=st.integers(min_value=1, max_value=3),
-           axis=st.integers(min_value=0, max_value=2),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_tile(self, M, K, N, tiles, axis, gc, dc):
-        X = np.random.rand(M, K, N).astype(np.float32)
-
-        op = core.CreateOperator(
-            'Tile', ['X'], 'out',
-            tiles=tiles,
-            axis=axis,
-        )
-
-        def tile_ref(X, tiles, axis):
-            dims = np.asarray([1, 1, 1], dtype=int)
-            dims[axis] = tiles
-            tiled_data = np.tile(X, dims)
-            return (tiled_data,)
-
-        # Check against numpy reference
-        self.assertReferenceChecks(gc, op, [X, tiles, axis],
-                                   tile_ref)
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X], 0, [0])
-
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
-    @given(M=st.integers(min_value=1, max_value=200),
-           N=st.integers(min_value=1, max_value=200),
-           tiles=st.integers(min_value=50, max_value=100),
-           **hu.gcs)
-    def test_tile_grad(self, M, N, tiles, gc, dc):
-        X = np.random.rand(M, N).astype(np.float32)
-        axis = 1
-
-        op = core.CreateOperator(
-            'Tile', ['X'], 'out',
-            tiles=tiles,
-            axis=axis,
-        )
-
-        def tile_ref(X, tiles, axis):
-            dims = np.asarray([1, 1], dtype=int)
-            dims[axis] = tiles
-            tiled_data = np.tile(X, dims)
-            return (tiled_data,)
-
-        # Check against numpy reference
-        self.assertReferenceChecks(gc, op, [X, tiles, axis],
-                                   tile_ref)
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-        # Gradient check wrt X
-        grad_op = core.CreateOperator(
-            'TileGradient', ['dOut'], 'dX',
-            tiles=tiles,
-            axis=axis,
-        )
-        dX = np.random.rand(M, N * tiles).astype(np.float32)
-        self.assertDeviceChecks(dc, grad_op, [dX], [0])
-
-    @given(M=st.integers(min_value=1, max_value=10),
-           K=st.integers(min_value=1, max_value=10),
-           N=st.integers(min_value=1, max_value=10),
-           tiles=st.integers(min_value=1, max_value=3),
-           axis=st.integers(min_value=0, max_value=2),
-           **hu.gcs)
-    @settings(deadline=10000)
-    def test_tilewinput(self, M, K, N, tiles, axis, gc, dc):
-        X = np.random.rand(M, K, N).astype(np.float32)
-
-        tiles_arg = np.array([tiles], dtype=np.int32)
-        axis_arg = np.array([axis], dtype=np.int32)
-
-        op = core.CreateOperator(
-            'Tile', ['X', 'tiles', 'axis'], 'out',
-        )
-
-        def tile_ref(X, tiles, axis):
-            dims = np.asarray([1, 1, 1], dtype=int)
-            dims[axis] = tiles
-            tiled_data = np.tile(X, dims)
-            return (tiled_data,)
-
-        # Check against numpy reference
-        self.assertReferenceChecks(gc, op, [X, tiles_arg, axis_arg],
-                                   tile_ref)
-        # Check over multiple devices
-        self.assertDeviceChecks(dc, op, [X, tiles_arg, axis_arg], [0])
-        # Gradient check wrt X
-        self.assertGradientChecks(gc, op, [X, tiles_arg, axis_arg], 0, [0])
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/caffe2/python/operator_test/top_k_test.py b/caffe2/python/operator_test/top_k_test.py
deleted file mode 100644
index 035b1fb3d099..000000000000
--- a/caffe2/python/operator_test/top_k_test.py
+++ /dev/null
@@ -1,246 +0,0 @@
-
-
-
-
-
-import hypothesis.strategies as st
-import numpy as np
-
-from caffe2.python import core
-from hypothesis import given, settings
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-
-class TestTopK(serial.SerializedTestCase):
-
-    def top_k_ref(self, X, k, flatten_indices, axis=-1):
-        in_dims = X.shape
-        out_dims = list(in_dims)
-        out_dims[axis] = k
-        out_dims = tuple(out_dims)
-        if axis == -1:
-            axis = len(in_dims) - 1
-        prev_dims = 1
-        next_dims = 1
-        for i in range(axis):
-            prev_dims *= in_dims[i]
-        for i in range(axis + 1, len(in_dims)):
-            next_dims *= in_dims[i]
-        n = in_dims[axis]
-        X_flat = X.reshape((prev_dims, n, next_dims))
-
-        values_ref = np.ndarray(
-            shape=(prev_dims, k, next_dims), dtype=np.float32)
-        values_ref.fill(0)
-        indices_ref = np.ndarray(
-            shape=(prev_dims, k, next_dims), dtype=np.int64)
-        indices_ref.fill(-1)
-        flatten_indices_ref = np.ndarray(
-            shape=(prev_dims, k, next_dims), dtype=np.int64)
-        flatten_indices_ref.fill(-1)
-        for i in range(prev_dims):
-            for j in range(next_dims):
-                kv = []
-                for x in range(n):
-                    val = X_flat[i, x, j]
-                    y = x * next_dims + i * in_dims[axis] * next_dims + j
-                    kv.append((val, x, y))
-                cnt = 0
-                for val, x, y in sorted(
-                        kv, key=lambda x: (x[0], -x[1]), reverse=True):
-                    values_ref[i, cnt, j] = val
-                    indices_ref[i, cnt, j] = x
-                    flatten_indices_ref[i, cnt, j] = y
-                    cnt += 1
-                    if cnt >= k or cnt >= n:
-                        break
-
-        values_ref = values_ref.reshape(out_dims)
-        indices_ref = indices_ref.reshape(out_dims)
-        flatten_indices_ref = flatten_indices_ref.flatten()
-
-        if flatten_indices:
-            return (values_ref, indices_ref, flatten_indices_ref)
-        else:
-            return (values_ref, indices_ref)
-
-    @serial.given(
-        X=hu.tensor(),
-        flatten_indices=st.booleans(),
-        seed=st.integers(0, 10),
-        **hu.gcs
-    )
-    def test_top_k(self, X, flatten_indices, seed, gc, dc):
-        X = X.astype(dtype=np.float32)
-        np.random.seed(seed)
-        # `k` can be larger than the total size
-        k = np.random.randint(1, X.shape[-1] + 4)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(1, 1), k=st.integers(1, 1),
-           flatten_indices=st.booleans(), **hu.gcs)
-    def test_top_k_1(self, bs, n, k, flatten_indices, gc, dc):
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(1, 10000), k=st.integers(1, 1),
-           flatten_indices=st.booleans(), **hu.gcs)
-    def test_top_k_2(self, bs, n, k, flatten_indices, gc, dc):
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(1, 10000),
-           k=st.integers(1, 1024), flatten_indices=st.booleans(), **hu.gcs)
-    def test_top_k_3(self, bs, n, k, flatten_indices, gc, dc):
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(100, 10000),
-           flatten_indices=st.booleans(), **hu.gcs)
-    @settings(deadline=10000)
-    def test_top_k_4(self, bs, n, flatten_indices, gc, dc):
-        k = np.random.randint(n // 3, 3 * n // 4)
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(1, 1024),
-           flatten_indices=st.booleans(), **hu.gcs)
-    def test_top_k_5(self, bs, n, flatten_indices, gc, dc):
-        k = n
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(bs=st.integers(1, 3), n=st.integers(1, 5000),
-           flatten_indices=st.booleans(), **hu.gcs)
-    @settings(deadline=10000)
-    def test_top_k_6(self, bs, n, flatten_indices, gc, dc):
-        k = n
-        X = np.random.rand(bs, n).astype(dtype=np.float32)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator("TopK", ["X"], output_list,
-                                 k=k, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(X=hu.tensor(dtype=np.float32), k=st.integers(1, 5),
-           axis=st.integers(-1, 5), flatten_indices=st.booleans(),
-           **hu.gcs)
-    def test_top_k_axis(self, X, k, axis, flatten_indices, gc, dc):
-        dims = X.shape
-        if axis >= len(dims):
-            axis %= len(dims)
-
-        output_list = ["Values", "Indices"]
-        if flatten_indices:
-            output_list.append("FlattenIndices")
-        op = core.CreateOperator(
-            "TopK", ["X"], output_list, k=k, axis=axis, device_option=gc)
-
-        def bind_ref(X_loc):
-            return self.top_k_ref(X_loc, k, flatten_indices, axis)
-
-        self.assertReferenceChecks(gc, op, [X], bind_ref)
-        self.assertDeviceChecks(dc, op, [X], [0])
-
-    @given(X=hu.tensor(dtype=np.float32), k=st.integers(1, 5),
-           axis=st.integers(-1, 5), **hu.gcs)
-    @settings(deadline=10000)
-    def test_top_k_grad(self, X, k, axis, gc, dc):
-        dims = X.shape
-        if axis >= len(dims):
-            axis %= len(dims)
-
-        input_axis = len(dims) - 1 if axis == -1 else axis
-        prev_dims = 1
-        next_dims = 1
-        for i in range(input_axis):
-            prev_dims *= dims[i]
-        for i in range(input_axis + 1, len(dims)):
-            next_dims *= dims[i]
-
-        X_flat = X.reshape((prev_dims, dims[input_axis], next_dims))
-        for i in range(prev_dims):
-            for j in range(next_dims):
-                # this try to make sure adding stepsize (0.05)
-                # will not change TopK selections at all
-                X_flat[i, :, j] = np.arange(dims[axis], dtype=np.float32) / 5
-                np.random.shuffle(X_flat[i, :, j])
-        X = X_flat.reshape(dims)
-
-        op = core.CreateOperator(
-            "TopK", ["X"], ["Values", "Indices"], k=k, axis=axis,
-            device_option=gc)
-
-        self.assertGradientChecks(gc, op, [X], 0, [0], stepsize=0.05)
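The heart of the deleted file is `top_k_ref`'s axis handling: view the tensor as `(prev, n, next)` around the reduction axis and sort each fiber by `(value, -index)` descending, so ties keep the smaller index first. A compact sketch assuming `k <= n` (the real reference also pads with fill values when `k` exceeds the axis length):

```python
import numpy as np

def top_k(X, k, axis=-1):
    # Decompose the shape around the reduction axis, as in the deleted ref.
    axis = axis % X.ndim
    prev = int(np.prod(X.shape[:axis], dtype=int))
    nxt = int(np.prod(X.shape[axis + 1:], dtype=int))
    n = X.shape[axis]
    flat = X.reshape(prev, n, nxt)
    vals = np.zeros((prev, k, nxt), dtype=X.dtype)
    idxs = np.full((prev, k, nxt), -1, dtype=np.int64)
    for i in range(prev):
        for j in range(nxt):
            # Sort by (value, -index) descending: ties prefer the smaller index.
            order = sorted(range(n), key=lambda x: (flat[i, x, j], -x), reverse=True)
            for cnt, x in enumerate(order[:k]):
                vals[i, cnt, j] = flat[i, x, j]
                idxs[i, cnt, j] = x
    out_shape = list(X.shape)
    out_shape[axis] = k
    return vals.reshape(out_shape), idxs.reshape(out_shape)

v, ix = top_k(np.array([[3.0, 1.0, 3.0, 2.0]]), k=2)
print(v, ix)  # [[3. 3.]] [[0 2]] -- equal values keep ascending index order
```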
diff --git a/caffe2/python/operator_test/torch_integration_test.py b/caffe2/python/operator_test/torch_integration_test.py
deleted file mode 100644
index d143e0193dfd..000000000000
--- a/caffe2/python/operator_test/torch_integration_test.py
+++ /dev/null
@@ -1,1106 +0,0 @@
-
-
-import struct
-import unittest
-
-import caffe2.python.hypothesis_test_util as hu
-import hypothesis.strategies as st
-import numpy as np
-import torch
-from caffe2.python import core, workspace
-from hypothesis import given, settings
-from scipy.stats import norm
-
-from ._utils import assert_allclose
-
-
-def generate_rois(roi_counts, im_dims):
-    assert len(roi_counts) == len(im_dims)
-    all_rois = []
-    for i, num_rois in enumerate(roi_counts):
-        if num_rois == 0:
-            continue
-        # [batch_idx, x1, y1, x2, y2]
-        rois = np.random.uniform(0, im_dims[i], size=(roi_counts[i], 5)).astype(
-            np.float32
-        )
-        rois[:, 0] = i  # batch_idx
-        # Swap (x1, x2) if x1 > x2
-        rois[:, 1], rois[:, 3] = (
-            np.minimum(rois[:, 1], rois[:, 3]),
-            np.maximum(rois[:, 1], rois[:, 3]),
-        )
-        # Swap (y1, y2) if y1 > y2
-        rois[:, 2], rois[:, 4] = (
-            np.minimum(rois[:, 2], rois[:, 4]),
-            np.maximum(rois[:, 2], rois[:, 4]),
-        )
-        all_rois.append(rois)
-    if len(all_rois) > 0:
-        return np.vstack(all_rois)
-    return np.empty((0, 5)).astype(np.float32)
-
-
-def generate_rois_rotated(roi_counts, im_dims):
-    rois = generate_rois(roi_counts, im_dims)
-    # [batch_id, ctr_x, ctr_y, w, h, angle]
-    rotated_rois = np.empty((rois.shape[0], 6)).astype(np.float32)
-    rotated_rois[:, 0] = rois[:, 0]  # batch_id
-    rotated_rois[:, 1] = (rois[:, 1] + rois[:, 3]) / 2.0  # ctr_x = (x1 + x2) / 2
-    rotated_rois[:, 2] = (rois[:, 2] + rois[:, 4]) / 2.0  # ctr_y = (y1 + y2) / 2
-    rotated_rois[:, 3] = rois[:, 3] - rois[:, 1] + 1.0  # w = x2 - x1 + 1
-    rotated_rois[:, 4] = rois[:, 4] - rois[:, 2] + 1.0  # h = y2 - y1 + 1
-    rotated_rois[:, 5] = np.random.uniform(-90.0, 90.0)  # angle in degrees
-    return rotated_rois
-
-
-def create_bbox_transform_inputs(roi_counts, num_classes, rotated):
-    batch_size = len(roi_counts)
-    total_rois = sum(roi_counts)
-    im_dims = np.random.randint(100, 600, batch_size)
-    rois = (
-        generate_rois_rotated(roi_counts, im_dims)
-        if rotated
-        else generate_rois(roi_counts, im_dims)
-    )
-    box_dim = 5 if rotated else 4
-    deltas = np.random.randn(total_rois, box_dim * num_classes).astype(np.float32)
-    im_info = np.zeros((batch_size, 3)).astype(np.float32)
-    im_info[:, 0] = im_dims
-    im_info[:, 1] = im_dims
-    im_info[:, 2] = 1.0
-    return rois, deltas, im_info
-
-
-# Eigen/Python round 0.5 away from 0, Numpy rounds to even
-round_to_nearest = np.vectorize(round)
-
-
-def bytes_to_floats(byte_matrix):
-    floats = np.empty([np.shape(byte_matrix)[0], 1], dtype=np.float32)
-    for i, byte_values in enumerate(byte_matrix):
-        (floats[i],) = struct.unpack("f", bytearray(byte_values))
-    return floats
-
-
-def floats_to_bytes(floats):
-    byte_matrix = np.empty([np.shape(floats)[0], 4], dtype=np.uint8)
-    for i, value in enumerate(floats):
-        assert isinstance(value, np.float32), (value, floats)
-        as_bytes = struct.pack("f", value)
-        # In Python3 bytes will be a list of int, in Python2 a list of string
-        if isinstance(as_bytes[0], int):
-            byte_matrix[i] = list(as_bytes)
-        else:
-            byte_matrix[i] = [ord(i) for i in as_bytes]
-    return byte_matrix
-
-
-def fused_rowwise_8bit_quantize_reference(data):
-    minimum = np.min(data, axis=1, keepdims=True)
-    maximum = np.max(data, axis=1, keepdims=True)
-    span = maximum - minimum
-    bias = minimum
-    scale = span / 255.0
-    inverse_scale = 255.0 / (span + 1e-8)
-    quantized_data = round_to_nearest((data - bias) * inverse_scale)
-    scale_bytes = floats_to_bytes(scale.reshape(-1))
-    bias_bytes = floats_to_bytes(bias.reshape(-1))
-    return np.concatenate([quantized_data, scale_bytes, bias_bytes], axis=1)
-
-
-def fused_rowwise_8bit_quantize_dequantize_reference(data):
-    fused_quantized = fused_rowwise_8bit_quantize_reference(data)
-    scale = bytes_to_floats(fused_quantized[:, -8:-4].astype(np.uint8))
-    bias = bytes_to_floats(fused_quantized[:, -4:].astype(np.uint8))
-    quantized_data = fused_quantized[:, :-8]
-    return quantized_data * scale + bias
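The fused rowwise format above stores, per row, the uint8 codes followed by the float32 scale and bias as 8 raw trailing bytes. A self-contained round trip under that layout (simplified: plain `np.round` here, while the reference emulates Eigen's round-half-away-from-zero):

```python
import numpy as np

# Round trip for the fused rowwise 8-bit layout: code = round((x - min) / scale)
# with scale = (max - min) / 255, and float32 scale/bias appended as 8 raw bytes.
def quantize(data):
    lo = data.min(axis=1, keepdims=True)
    hi = data.max(axis=1, keepdims=True)
    scale = (hi - lo) / 255.0
    codes = np.round((data - lo) * (255.0 / (hi - lo + 1e-8))).astype(np.uint8)
    tail = np.concatenate(
        [scale.astype(np.float32).view(np.uint8), lo.astype(np.float32).view(np.uint8)],
        axis=1,
    )
    return np.concatenate([codes, tail], axis=1)

def dequantize(packed):
    codes = packed[:, :-8].astype(np.float32)
    scale = packed[:, -8:-4].copy().view(np.float32)  # copy: views need contiguity
    bias = packed[:, -4:].copy().view(np.float32)
    return codes * scale + bias

x = np.random.rand(3, 16).astype(np.float32)
err = np.abs(dequantize(quantize(x)) - x).max()
assert err <= (x.max(axis=1) - x.min(axis=1)).max() / 255.0 + 1e-6  # one-step bound
```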
-
-
-class TorchIntegration(hu.HypothesisTestCase):
-    @given(
-        roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10),
-        num_classes=st.integers(1, 10),
-        rotated=st.booleans(),
-        angle_bound_on=st.booleans(),
-        clip_angle_thresh=st.sampled_from([-1.0, 1.0]),
-        **hu.gcs_cpu_only
-    )
-    def test_bbox_transform(
-        self,
-        roi_counts,
-        num_classes,
-        rotated,
-        angle_bound_on,
-        clip_angle_thresh,
-        gc,
-        dc,
-    ):
-        """
-        Test with rois for multiple images in a batch
-        """
-        rois, deltas, im_info = create_bbox_transform_inputs(
-            roi_counts, num_classes, rotated
-        )
-
-        def bbox_transform_ref():
-            ref_op = core.CreateOperator(
-                "BBoxTransform",
-                ["rois", "deltas", "im_info"],
-                ["box_out"],
-                apply_scale=False,
-                rotated=rotated,
-                angle_bound_on=angle_bound_on,
-                clip_angle_thresh=clip_angle_thresh,
-            )
-            workspace.FeedBlob("rois", rois)
-            workspace.FeedBlob("deltas", deltas)
-            workspace.FeedBlob("im_info", im_info)
-            workspace.RunOperatorOnce(ref_op)
-            return workspace.FetchBlob("box_out")
-
-        box_out = torch.tensor(bbox_transform_ref())
-        a, b = torch.ops._caffe2.BBoxTransform(
-            torch.tensor(rois),
-            torch.tensor(deltas),
-            torch.tensor(im_info),
-            [1.0, 1.0, 1.0, 1.0],
-            False,
-            rotated,
-            angle_bound_on,
-            -90,
-            90,
-            clip_angle_thresh,
-            legacy_plus_one=True,
-        )
-
-        assert_allclose(box_out, a)
-
-    @given(
-        roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10),
-        num_classes=st.integers(1, 10),
-        rotated=st.booleans(),
-        angle_bound_on=st.booleans(),
-        clip_angle_thresh=st.sampled_from([-1.0, 1.0]),
-        batch_splits_dtype=st.sampled_from([torch.float32, torch.int32]),
-        **hu.gcs_cpu_only
-    )
-    def test_box_with_nms_limits(
-        self,
-        roi_counts,
-        num_classes,
-        rotated,
-        angle_bound_on,
-        clip_angle_thresh,
-        batch_splits_dtype,
-        gc,
-        dc,
-    ):
-        rotated = False  # FIXME remove this after rotation is supported
-        rois, deltas, im_info = create_bbox_transform_inputs(
-            roi_counts, num_classes, rotated
-        )
-        pred_bbox, batch_splits = [
-            t.detach().numpy()
-            for t in torch.ops._caffe2.BBoxTransform(
-                torch.tensor(rois),
-                torch.tensor(deltas),
-                torch.tensor(im_info),
-                [1.0, 1.0, 1.0, 1.0],
-                False,
-                rotated,
-                angle_bound_on,
-                -90,
-                90,
-                clip_angle_thresh,
-                legacy_plus_one=True,
-            )
-        ]
-        class_prob = np.random.randn(sum(roi_counts), num_classes).astype(np.float32)
-        score_thresh = 0.5
-        nms_thresh = 0.5
-        topk_per_image = sum(roi_counts) / 2
-
-        def box_with_nms_limit_ref():
-            input_blobs = ["class_prob", "pred_bbox", "batch_splits"]
-            output_blobs = [
-                "score_nms",
-                "bbox_nms",
-                "class_nms",
-                "batch_splits_nms",
-                "keeps_nms",
-                "keeps_size_nms",
-            ]
-            ref_op = core.CreateOperator(
-                "BoxWithNMSLimit",
-                input_blobs,
-                output_blobs,
-                score_thresh=float(score_thresh),
-                nms=float(nms_thresh),
-                detections_per_im=int(topk_per_image),
-                soft_nms_enabled=False,
-                soft_nms_method="linear",
-                soft_nms_sigma=0.5,
-                soft_nms_min_score_thres=0.001,
-                rotated=rotated,
-            )
-            workspace.FeedBlob("class_prob", class_prob)
-            workspace.FeedBlob("pred_bbox", pred_bbox)
-            workspace.FeedBlob("batch_splits", batch_splits)
-            workspace.RunOperatorOnce(ref_op)
-            return (workspace.FetchBlob(b) for b in output_blobs)
-
-        output_refs = box_with_nms_limit_ref()
-        outputs = torch.ops._caffe2.BoxWithNMSLimit(
-            torch.tensor(class_prob),
-            torch.tensor(pred_bbox),
-            torch.tensor(batch_splits, dtype=batch_splits_dtype),
-            score_thresh=float(score_thresh),
-            nms=float(nms_thresh),
-            detections_per_im=int(topk_per_image),
-            soft_nms_enabled=False,
-            soft_nms_method="linear",
-            soft_nms_sigma=0.5,
-            soft_nms_min_score_thres=0.001,
-            rotated=rotated,
-            cls_agnostic_bbox_reg=False,
-            input_boxes_include_bg_cls=True,
-            output_classes_include_bg_cls=True,
-            legacy_plus_one=True,
-        )
-
-        for o, o_ref in zip(outputs, output_refs):
-            assert_allclose(o, o_ref)
-
-    @given(
-        dim_1=st.integers(min_value=10, max_value=10),
-        dim_2=st.integers(min_value=3, max_value=3),
-        dim_3=st.integers(min_value=2, max_value=2),
-    )
-    def test_sparse_to_dense_mask(self, dim_1, dim_2, dim_3):
-        indices = np.array([i + 1 for i in range(dim_1)]).astype(np.int32)
-        values = np.random.rand(dim_1, dim_2, dim_3).astype(np.float32)
-        default_value = np.zeros((dim_2, dim_3)).astype(np.float32)
-        mask = [2, 4, 9]
-
-        def sparse_to_dense_mask_ref(return_presence_mask=False):
-            ref_op = core.CreateOperator(
-                "SparseToDenseMask",
-                ["indices", "values", "default_value"],
-                ["output", "presence_mask"],
-                mask=mask,
-                return_presence_mask=return_presence_mask,
-            )
-            workspace.FeedBlob("indices", indices)
-            workspace.FeedBlob("values", values)
-            workspace.FeedBlob("default_value", default_value)
-            workspace.RunOperatorOnce(ref_op)
-
-            if return_presence_mask:
-                return (
-                    workspace.FetchBlob("output"),
-                    workspace.FetchBlob("presence_mask"),
-                )
-
-            return workspace.FetchBlob("output")
-
-        # Testing return_presence_mask = False
-        output = sparse_to_dense_mask_ref()
-        output = torch.tensor(output)
-
-        a, _ = torch.ops._caffe2.SparseToDenseMask(
-            torch.tensor(indices),
-            torch.tensor(values),
-            torch.tensor(default_value),
-            None,
-            mask=mask,
-        )
-
-        assert_allclose(output, a)
-
-        # Testing return_presence_mask = True
-        output, presence_mask = sparse_to_dense_mask_ref(return_presence_mask=True)
-        output = torch.tensor(output)
-        presence_mask = torch.tensor(presence_mask)
-
-        a, b = torch.ops._caffe2.SparseToDenseMask(
-            torch.tensor(indices),
-            torch.tensor(values),
-            torch.tensor(default_value),
-            None,
-            mask=mask,
-            return_presence_mask=True,
-        )
-
-        assert_allclose(output, a)
-        assert_allclose(presence_mask, b)
-
-    @given(
-        A=st.integers(min_value=4, max_value=4),
-        H=st.integers(min_value=10, max_value=10),
-        W=st.integers(min_value=8, max_value=8),
-        img_count=st.integers(min_value=3, max_value=3),
-    )
-    def test_generate_proposals(self, A, H, W, img_count):
-        scores = np.ones((img_count, A, H, W)).astype(np.float32)
-        bbox_deltas = (
-            np.linspace(0, 10, num=img_count * 4 * A * H * W)
-            .reshape((img_count, 4 * A, H, W))
-            .astype(np.float32)
-        )
-        im_info = np.ones((img_count, 3)).astype(np.float32) / 10
-        anchors = np.ones((A, 4)).astype(np.float32)
-
-        def generate_proposals_ref():
-            ref_op = core.CreateOperator(
-                "GenerateProposals",
-                ["scores", "bbox_deltas", "im_info", "anchors"],
-                ["rois", "rois_probs"],
-                spatial_scale=2.0,
-            )
-            workspace.FeedBlob("scores", scores)
-            workspace.FeedBlob("bbox_deltas", bbox_deltas)
-            workspace.FeedBlob("im_info", im_info)
-            workspace.FeedBlob("anchors", anchors)
-            workspace.RunOperatorOnce(ref_op)
-            return workspace.FetchBlob("rois"), workspace.FetchBlob("rois_probs")
-
-        rois, rois_probs = generate_proposals_ref()
-        rois = torch.tensor(rois)
-        rois_probs = torch.tensor(rois_probs)
-        a, b = torch.ops._caffe2.GenerateProposals(
-            torch.tensor(scores),
-            torch.tensor(bbox_deltas),
-            torch.tensor(im_info),
-            torch.tensor(anchors),
-            2.0,
-            6000,
-            300,
-            0.7,
-            16,
-            True,
-            -90,
-            90,
-            1.0,
-            legacy_plus_one=True,
-        )
-        assert_allclose(rois, a)
-        assert_allclose(rois_probs, b)
-
-    @given(
-        bsz=st.integers(1, 5),
-        seq_lens=st.integers(1, 6),
-        emb_lens=st.integers(5, 10),
-        hidden_size=st.integers(3, 7),
-        num_layers=st.integers(1, 4),
-        has_biases=st.booleans(),
-        is_bidirectional=st.booleans(),
-        batch_first=st.booleans(),
-    )
-    def test_inference_lstm(
-        self,
-        bsz,
-        seq_lens,
-        emb_lens,
-        hidden_size,
-        num_layers,
-        has_biases,
-        is_bidirectional,
-        batch_first,
-    ):
-        num_directions = 2 if is_bidirectional else 1
-        hx = np.zeros((num_layers * num_directions, bsz, hidden_size), dtype=np.float32)
-
-        if batch_first:
-            inputs = np.random.randn(bsz, seq_lens, emb_lens).astype(np.float32)
-        else:
-            inputs = np.random.randn(seq_lens, bsz, emb_lens).astype(np.float32)
-
-        torch_lstm = torch.nn.LSTM(
-            emb_lens,
-            hidden_size,
-            batch_first=batch_first,
-            bidirectional=is_bidirectional,
-            bias=has_biases,
-            num_layers=num_layers,
-        )
-
-        def inference_lstm_ref():
-            input_names = ["inputs", "hidden_0", "hidden_1"]
-            workspace.FeedBlob("inputs", inputs)
-            workspace.FeedBlob("hidden_0", hx)
-            workspace.FeedBlob("hidden_1", hx)
-            for i, param in enumerate(torch_lstm._flat_weights):
-                input_names.append("param_{}".format(i))
-                workspace.FeedBlob("param_{}".format(i), param.detach().numpy())
-
-            ref_op = core.CreateOperator(
-                "InferenceLSTM",
-                input_names,
-                ["output", "hidden", "cell"],
-                num_layers=num_layers,
-                has_biases=has_biases,
-                batch_first=batch_first,
-                bidirectional=is_bidirectional,
-            )
-            workspace.RunOperatorOnce(ref_op)
-            return (
-                workspace.FetchBlob("output"),
-                workspace.FetchBlob("hidden"),
-                workspace.FetchBlob("cell"),
-            )
-
-        output, hidden, cell = inference_lstm_ref()
-        output = torch.tensor(output)
-        hidden = torch.tensor(hidden)
-        cell = torch.tensor(cell)
-        lstm_in = [
-            torch.from_numpy(inputs),
-            torch.from_numpy(hx),
-            torch.from_numpy(hx),
-        ] + [param.detach() for param in torch_lstm._flat_weights]
-
-        a, b, c = torch.ops._caffe2.InferenceLSTM(
-            lstm_in, num_layers, has_biases, batch_first, is_bidirectional
-        )
-        assert_allclose(output, a)
-        assert_allclose(hidden, b)
-        assert_allclose(cell, c)
-
-    # Test case is using workspace.has_cuda_support and not workspace.has_gpu_support
-    # to exclude it from HIP because tensor interop doesn't work for HIP tensors yet
-    @unittest.skipIf(not workspace.has_cuda_support, "No cuda support")
-    @given(
-        A=st.integers(min_value=4, max_value=4),
-        H=st.integers(min_value=10, max_value=10),
-        W=st.integers(min_value=8, max_value=8),
-        img_count=st.integers(min_value=3, max_value=3),
-    )
-    def test_generate_proposals_cuda(self, A, H, W, img_count):
-        scores = np.ones((img_count, A, H, W)).astype(np.float32)
-        bbox_deltas = (
-            np.linspace(0, 10, num=img_count * 4 * A * H * W)
-            .reshape((img_count, 4 * A, H, W))
-            .astype(np.float32)
-        )
-        im_info = np.ones((img_count, 3)).astype(np.float32) / 10
-        anchors = np.ones((A, 4)).astype(np.float32)
-
-        def generate_proposals_ref():
-            ref_op = core.CreateOperator(
-                "GenerateProposals",
-                ["scores", "bbox_deltas", "im_info", "anchors"],
-                ["rois", "rois_probs"],
-                spatial_scale=2.0,
-            )
-            workspace.FeedBlob("scores", scores)
-            workspace.FeedBlob("bbox_deltas", bbox_deltas)
-            workspace.FeedBlob("im_info", im_info)
-            workspace.FeedBlob("anchors", anchors)
-            workspace.RunOperatorOnce(ref_op)
-            return workspace.FetchBlob("rois"), workspace.FetchBlob("rois_probs")
-
-        rois, rois_probs = generate_proposals_ref()
-        rois = torch.tensor(rois)
-        rois_probs = torch.tensor(rois_probs)
-        a, b = torch.ops._caffe2.GenerateProposals(
-            torch.tensor(scores).cuda(),
-            torch.tensor(bbox_deltas).cuda(),
-            torch.tensor(im_info).cuda(),
-            torch.tensor(anchors).cuda(),
-            2.0,
-            6000,
-            300,
-            0.7,
-            16,
-            True,
-            -90,
-            90,
-            1.0,
-            legacy_plus_one=True,
-        )
-        assert_allclose(rois, a.cpu())
-        assert_allclose(rois_probs, b.cpu())
-
-    @given(
-        N=st.integers(min_value=1, max_value=2),
-        C=st.integers(min_value=4, max_value=4),
-        H=st.integers(min_value=10, max_value=10),
-        W=st.integers(min_value=8, max_value=8),
-    )
-    def _test_roi_align(self, N, C, H, W, device):
-        def rand_roi():
-            return np.array(
-                [
-                    float(int(N * np.random.rand())),
-                    0.5 * np.random.rand() * W,
-                    0.5 * np.random.rand() * H,
-                    (0.5 + 0.5 * np.random.rand()) * W,
-                    (0.5 + 0.5 * np.random.rand()) * H,
-                ]
-            ).astype(np.float32)
-
-        feature = np.random.randn(N, C, H, W).astype(np.float32)
-        rois = np.array([rand_roi() for _ in range(10)])
-
-        def roi_align_ref(_feature, _rois):
-            ref_op = core.CreateOperator(
-                "RoIAlign",
-                ["feature", "rois"],
-                ["roi_feature"],
-                spatial_scale=1.0,
-                pooled_h=3,
-                pooled_w=3,
-                sampling_ratio=0,
-            )
-            workspace.FeedBlob("feature", _feature)
-            workspace.FeedBlob("rois", _rois)
-            workspace.RunOperatorOnce(ref_op)
-            return workspace.FetchBlob("roi_feature")
-
-        roi_feature_ref = roi_align_ref(feature, rois)
-        roi_feature = torch.ops._caffe2.RoIAlign(
-            torch.tensor(feature).to(device),
-            torch.tensor(rois).to(device),
-            order="NCHW",
-            spatial_scale=1.0,
-            pooled_h=3,
-            pooled_w=3,
-            sampling_ratio=0,
-            aligned=False,
-        )
-        assert_allclose(roi_feature_ref, roi_feature.cpu())
-
-    def test_roi_align_cpu(self):
-        self._test_roi_align(device="cpu")
-
-    @unittest.skipIf(not workspace.has_cuda_support, "No cuda support")
-    def test_roi_align_cuda(self):
-        self._test_roi_align(device="cuda")
-
-    @given(
-        N=st.integers(min_value=1, max_value=2),
-        C=st.integers(min_value=4, max_value=4),
-        H=st.integers(min_value=10, max_value=10),
-        W=st.integers(min_value=8, max_value=8),
-    )
-    def _test_roi_align_rotated(self, N, C, H, W, device):
-        def rand_rotated_roi():
-            return np.array(
-                [
-                    float(int(N * np.random.rand())),
-                    np.random.rand() * W,
-                    np.random.rand() * H,
-                    np.random.rand() * W,
-                    np.random.rand() * H,
-                    np.random.rand() * 360 - 180,
-                ]
-            ).astype(np.float32)
-
-        feature = np.random.randn(N, C, H, W).astype(np.float32)
-        rois = np.array([rand_rotated_roi() for _ in range(10)])
-
-        def roi_align_ref(_feature, _rois):
-            ref_op = core.CreateOperator(
-                "RoIAlignRotated",
-                ["feature", "rois"],
-                ["roi_feature"],
-                spatial_scale=1.0,
-                pooled_h=3,
-                pooled_w=3,
-                sampling_ratio=0,
-            )
-            workspace.FeedBlob("feature", _feature)
-            workspace.FeedBlob("rois", _rois)
-            workspace.RunOperatorOnce(ref_op)
-            return workspace.FetchBlob("roi_feature")
-
-        roi_feature_ref = roi_align_ref(feature, rois)
-        roi_feature = torch.ops._caffe2.RoIAlignRotated(
-            torch.tensor(feature).to(device),
-            torch.tensor(rois).to(device),
-            order="NCHW",
-            spatial_scale=1.0,
-            pooled_h=3,
-            pooled_w=3,
-            sampling_ratio=0,
-            aligned=False,
-        )
-        assert_allclose(roi_feature_ref, roi_feature.cpu())
-
-    def test_roi_align_rotated_cpu(self):
-        self._test_roi_align_rotated(device="cpu")
-
-    @unittest.skipIf(not workspace.has_cuda_support, "No cuda support")
-    def test_roi_align_rotated_cuda(self):
-        self._test_roi_align_rotated(device="cuda")
-
-    @given(roi_counts=st.lists(st.integers(0, 5), min_size=1, max_size=10))
-    def test_collect_and_distribute_fpn_rpn_proposals_op(self, roi_counts):
-        batch_size = len(roi_counts)
-        im_dims = np.random.randint(100, 600, batch_size)
-        rpn_rois_and_scores = []
-        for i in range(5):
-            rpn_rois_and_scores.append(torch.tensor(generate_rois(roi_counts, im_dims)))
-        for i in range(5):
-            rpn_rois_and_scores.append(torch.rand(sum(roi_counts)))
-
-        rois = torch.ops._caffe2.CollectRpnProposals(
-            rpn_rois_and_scores,
-            rpn_max_level=6,
-            rpn_min_level=2,
-            rpn_post_nms_topN=sum(roi_counts),
-        )
-        fpn_outputs = torch.ops._caffe2.DistributeFpnProposals(
-            rois,
-            roi_canonical_scale=224,
-            roi_canonical_level=4,
-            roi_max_level=5,
-            roi_min_level=2,
-            legacy_plus_one=True,
-        )
-
-        all_outputs = torch.ops._caffe2.CollectAndDistributeFpnRpnProposals(
-            rpn_rois_and_scores,
-            roi_canonical_scale=224,
-            roi_canonical_level=4,
-            roi_max_level=5,
-            roi_min_level=2,
-            rpn_max_level=6,
-            rpn_min_level=2,
rpn_post_nms_topN=sum(roi_counts), - legacy_plus_one=True, - ) - - rois_fpn_list = fpn_outputs[:-1] - rois_idx_restore_int32 = fpn_outputs[-1] - - # [rois] + fpn_outputs should be equal to all_outputs - assert_allclose(rois, all_outputs[0]) - for x, y in zip(fpn_outputs, all_outputs[1:]): - assert_allclose(x, y) - - @given(X=hu.tensor(), fast_gelu=st.booleans()) - def _test_gelu_op(self, X, fast_gelu, device): - def _gelu_ref(_X): - return (_X * norm.cdf(_X).astype(np.float32),) - - (expected_output,) = _gelu_ref(X) - actual_output = torch.ops._caffe2.Gelu(torch.tensor(X), fast_gelu) - - rtol = 1e-3 if fast_gelu else 1e-4 - atol = 1e-5 - assert_allclose( - expected_output, actual_output.cpu(), rtol=rtol, atol=atol - ) - - def test_gelu_op(self): - self._test_gelu_op(device="cpu") - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_gelu_op_cuda(self): - self._test_gelu_op(device="cuda") - - @given( - inputs=hu.lengths_tensor( - dtype=np.float32, min_value=1, max_value=5, allow_empty=True - ) - ) - def _test_lengths_op(self, inputs, ref_op_name, torch_op, device): - data, lengths = inputs - - def _lengths_ref(X, Y): - ref_op = core.CreateOperator(ref_op_name, ["X", "Y"], "out") - workspace.FeedBlob("X", X) - workspace.FeedBlob("Y", Y) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("out") - - expected_output = _lengths_ref(data, lengths) - actual_output = torch_op( - torch.tensor(data), torch.tensor(lengths, dtype=torch.int32) - ) - - assert_allclose(expected_output, actual_output.cpu()) - - def _test_lengths_sum_op(self, device): - self._test_lengths_op("LengthsSum", torch.ops._caffe2.LengthsSum, device) - - def test_lengths_sum_op(self): - self._test_lengths_sum_op(device="cpu") - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_lengths_sum_op_cuda(self): - self._test_lengths_sum_op(device="cuda") - - def _test_lengths_mean_op(self, device): - self._test_lengths_op("LengthsMean", torch.ops._caffe2.LengthsMean, device) - - def test_lengths_mean_op(self): - self._test_lengths_mean_op(device="cpu") - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_lengths_mean_op_cuda(self): - self._test_lengths_mean_op(device="cuda") - - def _test_lengths_max_op(self, device): - self._test_lengths_op("LengthsMax", torch.ops._caffe2.LengthsMax, device) - - def test_lengths_max_op(self): - self._test_lengths_max_op(device="cpu") - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_lengths_max_op_cuda(self): - self._test_lengths_max_op(device="cuda") - - def _test_resize_nearest_op(self, device): - data = np.random.rand(1, 2, 3, 4).astype(np.float32) - - def _resize_nearest_ref(X): - ref_op = core.CreateOperator( - "ResizeNearest", - ["X"], - ["Y"], - width_scale=2.0, - height_scale=1.5, - order="NCHW", - ) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = _resize_nearest_ref(data) - actual_output = torch.ops._caffe2.ResizeNearest( - torch.tensor(data).to(device), - order="NCHW", - width_scale=2.0, - height_scale=1.5, - ) - - assert_allclose(expected_output, actual_output.cpu()) - - def test_resize_nearest_op_cpu(self): - return self._test_resize_nearest_op("cpu") - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_resize_nearest_op_cuda(self): - return self._test_resize_nearest_op("cuda") - - @given(input_data=hu.tensor(min_dim=2, max_dim=2)) - def 
test_Fused8BitRowwiseQuantizedToFloat(self, input_data): - QuantizeOp = core.CreateOperator( - "FloatToFused8BitRowwiseQuantized", ["input_data"], ["quantized_data"] - ) - - workspace.FeedBlob("input_data", input_data) - workspace.RunOperatorOnce(QuantizeOp) - - quantized_data = workspace.FetchBlob("quantized_data") - - dequantized_data = torch.ops._caffe2.Fused8BitRowwiseQuantizedToFloat( - torch.tensor(quantized_data) - ) - - reference = fused_rowwise_8bit_quantize_dequantize_reference(input_data) - np.testing.assert_array_almost_equal(dequantized_data.numpy(), reference) - - @given(binary_input=st.booleans()) - def test_piecewise_linear_op(self, binary_input): - if binary_input: - num_dims = 1 - else: - num_dims = 3 - data = np.random.rand(1024, num_dims).astype(np.float32) - slopes = np.zeros(4 * num_dims).astype(np.float32) - bounds = np.sort( - np.random.rand(5, num_dims).astype(np.float32), axis=0 - ).flatten("F") - intercepts = np.random.rand(4 * num_dims).astype(np.float32) - - def _piecewise_linear_ref(X): - ref_op = core.CreateOperator( - "PiecewiseLinearTransform", - ["data", "bounds", "slopes", "intercepts"], - ["calibrated"], - binary=binary_input, - ) - workspace.FeedBlob("data", X) - workspace.FeedBlob("bounds", bounds) - workspace.FeedBlob("slopes", slopes) - workspace.FeedBlob("intercepts", intercepts) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("calibrated") - - expected_output = _piecewise_linear_ref(data) - actual_output = torch.ops._caffe2.PiecewiseLinearTransform( - torch.tensor(data), - bounds.tolist(), - slopes.tolist(), - intercepts.tolist(), - binary_input, - ) - - assert_allclose(torch.tensor(expected_output), actual_output) - - def test_alias_with_name_is_in_place(self): - device = "cuda" if workspace.has_cuda_support else "cpu" - x = torch.tensor([3., 42.]).to(device=device) - y = torch.ops._caffe2.AliasWithName(x, "new_name") - x[1] = 6 - assert_allclose(x, torch.tensor([3., 6.]).to(device=device)) - # y should also change because y is alias of x - assert_allclose(y, torch.tensor([3., 6.]).to(device=device)) - - @unittest.skipIf(not workspace.has_cuda_support, "No cuda support") - def test_copy_between_cpu_and_gpu(self): - x_cpu_ref = torch.tensor([1., 2., 3.]) - x_gpu_ref = x_cpu_ref.to("cuda") - - x_gpu = torch.ops._caffe2.CopyCPUToGPU(x_cpu_ref) - assert_allclose(x_gpu, x_gpu_ref) - x_cpu = torch.ops._caffe2.CopyGPUToCPU(x_gpu) - assert_allclose(x_cpu, x_cpu_ref) - - def test_index_hash_op(self): - data = np.random.randint(low=0, high=1000, size=(4, 4, 4)) - - def _index_hash_ref(X): - ref_op = core.CreateOperator("IndexHash", ["X"], ["Y"], seed=0, modulo=100) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = _index_hash_ref(data) - actual_output = torch.ops._caffe2.IndexHash( - torch.tensor(data), seed=0, modulo=100 - ) - - assert_allclose(expected_output, actual_output.cpu()) - - def test_bucketize_op(self): - data = np.random.rand(8, 10).astype(np.float32) * 1000 - boundaries = np.array([1, 10, 100, 1000, 100000]).astype(np.float32) - - def _bucketize_ref(X): - ref_op = core.CreateOperator( - "Bucketize", ["X"], ["Y"], boundaries=boundaries - ) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = _bucketize_ref(data) - actual_output = torch.ops._caffe2.Bucketize(torch.tensor(data), boundaries) - assert_allclose(expected_output, actual_output.cpu()) - - @given(X=hu.tensor(), 
eps=st.floats(min_value=1e-4, max_value=1e-2)) - def test_logit(self, X, eps): - def ref(X, eps): - ref_op = core.CreateOperator("Logit", ["X"], ["Y"], eps=eps) - workspace.FeedBlob("X", X) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = ref(X, eps) - actual_output = torch.ops._caffe2.Logit(torch.tensor(X), eps) - assert_allclose(expected_output, actual_output.cpu()) - - def test_percentile(self): - original_values = np.array([[3.0, 5.0, 3], [5.0, 1.0, 6.0]]).astype(np.float32) - value_to_pct = np.array([[3, 0.2], [5, 0.5], [1, 0.3], [3, 0.6]]).astype( - np.float32 - ) - lengths = np.array([2, 1, 1]).astype(np.int32) - - def _percentile_ref(original_values, value_to_pct, lengths): - ref_op = core.CreateOperator( - "Percentile", ["original_values", "value_to_pct", "lengths"], ["Y"] - ) - workspace.FeedBlob("original_values", original_values) - workspace.FeedBlob("value_to_pct", value_to_pct) - workspace.FeedBlob("lengths", lengths) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = _percentile_ref(original_values, value_to_pct, lengths) - actual_output = torch.ops._caffe2.Percentile( - torch.tensor(original_values), - torch.tensor(value_to_pct), - torch.tensor(lengths), - ) - assert_allclose(expected_output, actual_output.cpu()) - - def test_batch_bucket_one_hot_op(self): - data = np.array([[2, 3], [4, 1], [2, 5]]).astype(np.float32) - lengths = np.array([2, 3]).astype(np.int32) - boundaries = np.array([0.1, 2.5, 1, 3.1, 4.5]).astype(np.float32) - - def _batch_bucket_one_hot_ref(data, lengths, boundaries): - ref_op = core.CreateOperator( - "BatchBucketOneHot", ["data", "lengths", "boundaries"], ["Y"] - ) - workspace.FeedBlob("data", data) - workspace.FeedBlob("lengths", lengths) - workspace.FeedBlob("boundaries", boundaries) - workspace.RunOperatorOnce(ref_op) - return workspace.FetchBlob("Y") - - expected_output = _batch_bucket_one_hot_ref(data, lengths, boundaries) - actual_output = torch.ops._caffe2.BatchBucketOneHot( - torch.tensor(data), torch.tensor(lengths), torch.tensor(boundaries) - ) - assert_allclose(expected_output, actual_output.cpu()) - - def test_gather_ranges_to_dense_op(self): - data = np.array([1, 2, 3, 4, 5, 6, 7, 8]) - ranges = np.array([[[2, 4]], [[0, 0]]]) - key = np.array([0, 1, 3, 2, 1, 0, 1, 0]) - lengths = np.array([4]) - min_observation = 2 - max_mismatched_ratio = 0.5 - max_empty_ratio = 1.0 - - outputs_name = ["X_{}".format(i) for i in range(len(lengths))] - ref_op = core.CreateOperator( - "GatherRangesToDense", - ["data", "ranges", "key"], - outputs_name, - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - workspace.FeedBlob("data", data) - workspace.FeedBlob("ranges", ranges) - workspace.FeedBlob("key", key) - workspace.RunOperatorOnce(ref_op) - ref_outputs = [] - for output_name in outputs_name: - ref_outputs.append(workspace.FetchBlob(output_name)) - - outputs = torch.ops._caffe2.GatherRangesToDense( - torch.from_numpy(data), - torch.from_numpy(ranges), - torch.from_numpy(key), - lengths=lengths, - min_observation=min_observation, - max_mismatched_ratio=max_mismatched_ratio, - max_empty_ratio=max_empty_ratio, - ) - - self.assertEqual(len(ref_outputs), len(outputs)) - for i in range(0, len(ref_outputs)): - np.testing.assert_array_almost_equal(ref_outputs[i], outputs[i].numpy()) - - @given(lengths_0=st.integers(1, 10), lengths_1=st.integers(1, 10)) - @settings(deadline=10000) - def 
test_merge_id_lists(self, lengths_0, lengths_1): - def _merge_id_lists(lengths, values): - ref_op = core.CreateOperator( - "MergeIdLists", - ["lengths_0", "values_0", "lengths_1", "values_1"], - ["merged_lengths", "merged_values"], - ) - workspace.FeedBlob("lengths_0", lengths[0]) - workspace.FeedBlob("values_0", values[0]) - workspace.FeedBlob("lengths_1", lengths[1]) - workspace.FeedBlob("values_1", values[1]) - workspace.RunOperatorOnce(ref_op) - return ( - workspace.FetchBlob("merged_lengths"), - workspace.FetchBlob("merged_values"), - ) - - lengths = [ - np.array([lengths_0]).astype(np.int32), - np.array([lengths_1]).astype(np.int32), - ] - values = [ - np.random.choice(np.arange(0, 10), size=lengths_0, replace=False).astype( - np.int32 - ), - np.random.choice(np.arange(10, 20), size=lengths_1, replace=False).astype( - np.int32 - ), - ] - - expected_merged_lengths, expected_merged_values = _merge_id_lists( - lengths, values - ) - output_merged_lengths, output_merged_values = torch.ops._caffe2.MergeIdLists( - [ - torch.tensor(lengths[0]), - torch.tensor(values[0]), - torch.tensor(lengths[1]), - torch.tensor(values[1]), - ] - ) - assert_allclose(expected_merged_lengths, output_merged_lengths) - assert_allclose(expected_merged_values, output_merged_values) - - def test_learning_rate(self): - base_lr = 0.05 - no_iter = torch.tensor([0]) - one_iter = torch.tensor([1]) - two_iter = torch.tensor([2]) - - # Fixed policy - self.assertEqual( - base_lr, - torch.ops._caffe2.LearningRate( - iterations=no_iter, base_lr=base_lr, policy="fixed" - ), - ) - self.assertEqual( - base_lr, - torch.ops._caffe2.LearningRate( - iterations=one_iter, base_lr=base_lr, policy="fixed" - ), - ) - - # Step policy - gamma = 0.99 - stepsize = 1 - - self.assertEqual( - base_lr, - torch.ops._caffe2.LearningRate( - iterations=no_iter, - base_lr=base_lr, - policy="step", - stepsize=stepsize, - gamma=gamma, - ), - ) - self.assertAlmostEqual( - base_lr * (gamma ** (1.0 / stepsize)), - torch.ops._caffe2.LearningRate( - iterations=one_iter, - base_lr=base_lr, - policy="step", - stepsize=stepsize, - gamma=gamma, - ), - ) - self.assertAlmostEqual( - base_lr * (gamma ** (2.0 / stepsize)), - torch.ops._caffe2.LearningRate( - iterations=two_iter, - base_lr=base_lr, - policy="step", - stepsize=stepsize, - gamma=gamma, - ), - ) - - def test_pack_segments(self): - s = torch.rand(3, 3, 3) - lengths = torch.tensor([2, 1]) - packed_tensor, _ = torch.ops._caffe2.PackSegments(lengths, s) - self.assertEqual(packed_tensor.numpy().shape, (2, 2, 3, 3)) - unpacked_tensor = torch.ops._caffe2.UnpackSegments(lengths, packed_tensor) - assert_allclose(s, unpacked_tensor) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/transpose_op_test.py b/caffe2/python/operator_test/transpose_op_test.py deleted file mode 100644 index 4ccec250e22b..000000000000 --- a/caffe2/python/operator_test/transpose_op_test.py +++ /dev/null @@ -1,84 +0,0 @@ - - - - -from caffe2.python import core, workspace -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st - -import numpy as np -import unittest - - -class TestTransposeOp(serial.SerializedTestCase): - @given( - X=hu.tensor(dtype=np.float32), use_axes=st.booleans(), **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_transpose(self, X, use_axes, gc, dc): - ndim = len(X.shape) - axes = np.arange(ndim) - np.random.shuffle(axes) - - 
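# As an aside, a minimal standalone sketch (values are illustrative, not part
# of this test) of how a shuffled `axes` permutation drives np.transpose:
# output axis i takes its size and data from input axis axes[i].
import numpy as np
x = np.arange(24).reshape(2, 3, 4)
y = np.transpose(x, axes=[2, 0, 1])
assert y.shape == (4, 2, 3)      # shape is permuted by axes
assert y[1, 0, 2] == x[0, 2, 1]  # y[i, j, k] == x[j, k, i] for axes [2, 0, 1]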
if (use_axes): - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], axes=axes, device_option=gc) - else: - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], device_option=gc) - - def transpose_ref(X): - if use_axes: - return [np.transpose(X, axes=axes)] - else: - return [np.transpose(X)] - - self.assertReferenceChecks(gc, op, [X], transpose_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - @given(M=st.integers(10, 200), N=st.integers(10, 200), **hu.gcs) - @settings(max_examples=10, deadline=None) - def test_transpose_large_matrix(self, M, N, gc, dc): - op = core.CreateOperator("Transpose", ["X"], ["Y"], device_option=gc) - X = np.random.rand(M, N).astype(np.float32) - 0.5 - - def transpose_ref(X): - return [np.transpose(X)] - - self.assertReferenceChecks(gc, op, [X], transpose_ref) - self.assertDeviceChecks(dc, op, [X], [0]) - self.assertGradientChecks(gc, op, [X], 0, [0]) - - - @unittest.skipIf(not workspace.has_cuda_support, "no cuda support") - @given(X=hu.tensor(dtype=np.float32), use_axes=st.booleans(), - **hu.gcs_cuda_only) - def test_transpose_cudnn(self, X, use_axes, gc, dc): - ndim = len(X.shape) - axes = np.arange(ndim) - np.random.shuffle(axes) - - if (use_axes): - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], axes=axes, engine="CUDNN", - device_option=hu.cuda_do) - else: - op = core.CreateOperator( - "Transpose", ["X"], ["Y"], engine="CUDNN", - device_option=hu.cuda_do) - - def transpose_ref(X): - if use_axes: - return [np.transpose(X, axes=axes)] - else: - return [np.transpose(X)] - - self.assertReferenceChecks(hu.gpu_do, op, [X], transpose_ref) - self.assertGradientChecks(hu.gpu_do, op, [X], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/trigonometric_op_test.py b/caffe2/python/operator_test/trigonometric_op_test.py deleted file mode 100644 index 04b98857c301..000000000000 --- a/caffe2/python/operator_test/trigonometric_op_test.py +++ /dev/null @@ -1,52 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -import numpy as np -import unittest - - -class TestTrigonometricOp(serial.SerializedTestCase): - @given( - X=hu.tensor(elements=hu.floats(min_value=-0.7, max_value=0.7)), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_acos(self, X, gc, dc): - self.assertTrigonometricChecks("Acos", X, lambda x: (np.arccos(X),), gc, dc) - - @given( - X=hu.tensor(elements=hu.floats(min_value=-0.7, max_value=0.7)), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_asin(self, X, gc, dc): - self.assertTrigonometricChecks("Asin", X, lambda x: (np.arcsin(X),), gc, dc) - - @given( - X=hu.tensor(elements=hu.floats(min_value=-100, max_value=100)), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_atan(self, X, gc, dc): - self.assertTrigonometricChecks("Atan", X, lambda x: (np.arctan(X),), gc, dc) - - @given( - X=hu.tensor(elements=hu.floats(min_value=-0.5, max_value=0.5)), - **hu.gcs) - @settings(deadline=None, max_examples=50) - def test_tan(self, X, gc, dc): - self.assertTrigonometricChecks("Tan", X, lambda x: (np.tan(X),), gc, dc) - - def assertTrigonometricChecks(self, op_name, input, reference, gc, dc): - op = core.CreateOperator(op_name, ["X"], ["Y"]) - self.assertReferenceChecks(gc, op, [input], reference) - self.assertDeviceChecks(dc, op, [input], [0]) - 
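# For context, a rough standalone sketch of the central-difference idea behind
# assertGradientChecks (the real checker lives in hypothesis_test_util; this
# shows only the principle, with tan as the op and illustrative values):
import numpy as np
x, eps = 0.3, 1e-6
numeric = (np.tan(x + eps) - np.tan(x - eps)) / (2 * eps)
analytic = 1.0 / np.cos(x) ** 2  # d/dx tan(x) = sec^2(x)
assert abs(numeric - analytic) < 1e-6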
self.assertGradientChecks(gc, op, [input], 0, [0]) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/unique_ops_test.py b/caffe2/python/operator_test/unique_ops_test.py deleted file mode 100644 index b49f4765539e..000000000000 --- a/caffe2/python/operator_test/unique_ops_test.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - - -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -from functools import partial - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - - -def _unique_ref(x, return_inverse): - ret = np.unique(x, return_inverse=return_inverse) - if not return_inverse: - ret = [ret] - return ret - - -class TestUniqueOps(serial.SerializedTestCase): - @given( - X=hu.tensor1d( - # allow empty - min_len=0, - dtype=np.int32, - # allow negatives - elements=st.integers(min_value=-10, max_value=10)), - return_remapping=st.booleans(), - **hu.gcs_no_hip - ) - @settings(deadline=10000) - def test_unique_op(self, X, return_remapping, gc, dc): - # The Unique op implementation does not guarantee output order, so sort - # the input so that different implementations return the same outputs. - X = np.sort(X) - - op = core.CreateOperator( - "Unique", - ['X'], - ["U", "remap"] if return_remapping else ["U"], - ) - self.assertDeviceChecks( - device_options=dc, - op=op, - inputs=[X], - outputs_to_check=[0, 1] if return_remapping else [0] - ) - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=partial(_unique_ref, return_inverse=return_remapping), - ) - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/unique_uniform_fill_op_test.py b/caffe2/python/operator_test/unique_uniform_fill_op_test.py deleted file mode 100644 index 1026745db724..000000000000 --- a/caffe2/python/operator_test/unique_uniform_fill_op_test.py +++ /dev/null @@ -1,51 +0,0 @@ - - - - -from caffe2.python import core, workspace -from hypothesis import given -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np - -import unittest - - -class TestUniqueUniformFillOp(hu.HypothesisTestCase): - @given( - r=st.integers(1000, 10000), - avoid=st.lists( - st.integers(1, 1000), - min_size=1, - max_size=100, - unique=True - ), - dtypes=st.sampled_from( - [ - (np.int32, core.DataType.INT32), - (np.int64, core.DataType.INT64) - ] - ), - s=st.integers(10, 500), - **hu.gcs_cpu_only - ) - def test_unique_uniform_int_fill(self, r, avoid, dtypes, s, gc, dc): - net = core.Net("net") - workspace.FeedBlob("X", np.array([s], dtype=np.int64)) - workspace.FeedBlob("AVOID", np.array(avoid, dtype=dtypes[0])) - net.UniqueUniformFill( - ["X", "AVOID"], ["Y"], - min=1, - max=r, - input_as_shape=True, - dtype=dtypes[1] - ) -
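# Before the net runs, note the contract being exercised: UniqueUniformFill is
# expected to emit s unique integers drawn from [min, max] while skipping
# everything in AVOID. A hedged NumPy sketch of that contract (illustrative
# values only, not the operator implementation):
import numpy as np
rng = np.random.default_rng(0)
lo, hi, s, avoid = 1, 10000, 500, {3, 7, 11}
candidates = np.setdiff1d(np.arange(lo, hi + 1), list(avoid))
y = rng.choice(candidates, size=s, replace=False)
assert len(set(y)) == s and not set(y) & avoid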
workspace.RunNetOnce(net) - y = workspace.FetchBlob("Y") - self.assertEqual(s, len(y)) - self.assertEqual(s, len(set(y))) - self.assertEqual(s, len(set(y) - set(avoid))) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/unsafe_coalesce_test.py b/caffe2/python/operator_test/unsafe_coalesce_test.py deleted file mode 100644 index c99ef31236cc..000000000000 --- a/caffe2/python/operator_test/unsafe_coalesce_test.py +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env python3 - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt -from caffe2.python import core, workspace -from hypothesis import given - - -class TestUnsafeCoalesceOp(hu.HypothesisTestCase): - @given( - n=st.integers(1, 5), - shape=st.lists(st.integers(0, 5), min_size=1, max_size=3), - **hu.gcs - ) - def test_unsafe_coalesce_op(self, n, shape, dc, gc): - workspace.ResetWorkspace() - test_inputs = [(100 * np.random.random(shape)).astype(np.float32) for _ in range(n)] - test_input_blobs = ["x_{}".format(i) for i in range(n)] - - coalesce_op = core.CreateOperator( - "UnsafeCoalesce", - test_input_blobs, - test_input_blobs + ["shared_memory_blob"], - device_option=gc, - ) - - def reference_func(*args): - self.assertEqual(len(args), n) - return list(args) + [np.concatenate([x.flatten() for x in args])] - - self.assertReferenceChecks(gc, coalesce_op, test_inputs, reference_func) - - @given( - n=st.integers(1, 5), - shape=st.lists(st.integers(1, 5), min_size=1, max_size=3), - seed=st.integers(0, 65535), - **hu.gcs - ) - def test_unsafe_coalesce_op_blob_sharing(self, n, shape, seed, dc, gc): - workspace.ResetWorkspace() - # Seeding makes debugging of the test more predictable - np.random.seed(seed) - test_inputs = [(np.random.random(shape)).astype(np.float32) for _ in range(n)] - test_input_blobs = ["x_{}".format(i) for i in range(n)] - - coalesce_op = core.CreateOperator( - "UnsafeCoalesce", - test_input_blobs, - test_input_blobs + ["shared_memory_blob"], - device_option=gc, - ) - for name, value in zip(test_input_blobs, test_inputs): - workspace.FeedBlob(name, value, device_option=gc) - - workspace.RunOperatorOnce(coalesce_op) - blob_value = workspace.blobs["shared_memory_blob"] - npt.assert_almost_equal( - blob_value, - np.concatenate([x.flatten() for x in test_inputs]), - decimal=4 - ) - # np.random generates values in [0, 1), so -2 is outside of that range - blob_value.fill(-2.0) - self.assertTrue((blob_value != workspace.blobs["shared_memory_blob"]).all()) - workspace.FeedBlob("shared_memory_blob", blob_value, device_option=gc) - - # All blobs preserved their shape, but were overwritten with -2 - for name, value in zip(test_input_blobs, test_inputs): - self.assertEqual(value.shape, workspace.blobs[name].shape) - self.assertTrue((value != workspace.blobs[name]).all()) - self.assertTrue((workspace.blobs[name] == -2).all()) - - # It should be OK to reuse the operator as long as its blob shapes are not changing - workspace.RunOperatorOnce(coalesce_op) diff --git a/caffe2/python/operator_test/upsample_op_test.py b/caffe2/python/operator_test/upsample_op_test.py deleted file mode 100644 index 61b01644bcf5..000000000000 --- a/caffe2/python/operator_test/upsample_op_test.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -from hypothesis import given, settings -import hypothesis.strategies as st -import numpy as np -import unittest - - -class TestUpSample(serial.SerializedTestCase): - @given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), - width_scale=st.floats(1.0, 4.0) | st.just(2.0), - height=st.integers(4, 32), - width=st.integers(4, 32), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - **hu.gcs) - @settings(max_examples=50, deadline=None) - def test_upsample(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, - gc, dc): - - np.random.seed(seed) - - X = np.random.rand( - batch_size, num_channels, height, width).astype(np.float32) - scales = np.array([height_scale, width_scale]).astype(np.float32) - - ops = [ - ( - core.CreateOperator( - "UpsampleBilinear", - ["X"], - ["Y"], - width_scale=width_scale, - height_scale=height_scale, - ), - [X], - ), - ( - core.CreateOperator( - "UpsampleBilinear", - ["X", "scales"], - ["Y"], - ), - [X, scales], - ), - ] - - for op, inputs in ops: - def ref(X, scales=None): - output_height = np.int32(height * height_scale) - output_width = np.int32(width * width_scale) - - Y = np.random.rand( - batch_size, num_channels, output_height, - output_width).astype(np.float32) - - rheight = ((height - 1) / (output_height - 1) - if output_height > 1 - else float(0)) - rwidth = ((width - 1) / (output_width - 1) - if output_width > 1 - else float(0)) - - for i in range(output_height): - h1r = rheight * i - h1 = int(h1r) - h1p = 1 if h1 < height - 1 else 0 - h1lambda = h1r - h1 - h0lambda = float(1) - h1lambda - for j in range(output_width): - w1r = rwidth * j - w1 = int(w1r) - w1p = 1 if w1 < width - 1 else 0 - w1lambda = w1r - w1 - w0lambda = float(1) - w1lambda - Y[:, :, i, j] = (h0lambda * ( - w0lambda * X[:, :, h1, w1] + - w1lambda * X[:, :, h1, w1 + w1p]) + - h1lambda * (w0lambda * X[:, :, h1 + h1p, w1] + - w1lambda * X[:, :, h1 + h1p, w1 + w1p])) - - return Y, - - self.assertReferenceChecks(gc, op, inputs, ref) - self.assertDeviceChecks(dc, op, inputs, [0]) - self.assertGradientChecks(gc, op, inputs, 0, [0], stepsize=0.1, - threshold=1e-2) - - @given(height_scale=st.floats(1.0, 4.0) | st.just(2.0), - width_scale=st.floats(1.0, 4.0) | st.just(2.0), - height=st.integers(4, 32), - width=st.integers(4, 32), - num_channels=st.integers(1, 4), - batch_size=st.integers(1, 4), - seed=st.integers(0, 65535), - **hu.gcs) - @settings(deadline=10000) - def test_upsample_grad(self, height_scale, width_scale, height, width, - num_channels, batch_size, seed, gc, dc): - - np.random.seed(seed) - - output_height = np.int32(height * height_scale) - output_width = np.int32(width * width_scale) - X = np.random.rand(batch_size, - num_channels, - height, - width).astype(np.float32) - dY = np.random.rand(batch_size, - num_channels, - output_height, - 
output_width).astype(np.float32) - scales = np.array([height_scale, width_scale]).astype(np.float32) - - ops = [ - ( - core.CreateOperator( - "UpsampleBilinearGradient", - ["dY", "X"], - ["dX"], - width_scale=width_scale, - height_scale=height_scale, - ), - [dY, X], - ), - ( - core.CreateOperator( - "UpsampleBilinearGradient", - ["dY", "X", "scales"], - ["dX"], - ), - [dY, X, scales], - ), - ] - - for op, inputs in ops: - def ref(dY, X, scales=None): - dX = np.zeros_like(X) - - rheight = ((height - 1) / (output_height - 1) - if output_height > 1 - else float(0)) - rwidth = ((width - 1) / (output_width - 1) - if output_width > 1 - else float(0)) - - for i in range(output_height): - h1r = rheight * i - h1 = int(h1r) - h1p = 1 if h1 < height - 1 else 0 - h1lambda = h1r - h1 - h0lambda = float(1) - h1lambda - for j in range(output_width): - w1r = rwidth * j - w1 = int(w1r) - w1p = 1 if w1 < width - 1 else 0 - w1lambda = w1r - w1 - w0lambda = float(1) - w1lambda - dX[:, :, h1, w1] += ( - h0lambda * w0lambda * dY[:, :, i, j]) - dX[:, :, h1, w1 + w1p] += ( - h0lambda * w1lambda * dY[:, :, i, j]) - dX[:, :, h1 + h1p, w1] += ( - h1lambda * w0lambda * dY[:, :, i, j]) - dX[:, :, h1 + h1p, w1 + w1p] += ( - h1lambda * w1lambda * dY[:, :, i, j]) - - return dX, - - self.assertDeviceChecks(dc, op, inputs, [0]) - self.assertReferenceChecks(gc, op, inputs, ref) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/utility_ops_test.py b/caffe2/python/operator_test/utility_ops_test.py deleted file mode 100644 index 187328f9e484..000000000000 --- a/caffe2/python/operator_test/utility_ops_test.py +++ /dev/null @@ -1,482 +0,0 @@ - - - - - -from caffe2.python import core, workspace -from hypothesis import assume, given, settings -from caffe2.proto import caffe2_pb2 -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np -import random - - -class TestUtilityOps(serial.SerializedTestCase): - - @given(X=hu.tensor(), args=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_slice(self, X, args, gc, dc): - X = X.astype(dtype=np.float32) - dim = random.randint(0, X.ndim - 1) - slice_start = random.randint(0, X.shape[dim] - 1) - slice_end = random.randint(slice_start, X.shape[dim] - 1) - starts = np.array([0] * X.ndim).astype(np.int32) - ends = np.array([-1] * X.ndim).astype(np.int32) - starts[dim] = slice_start - ends[dim] = slice_end - - if args: - op = core.CreateOperator( - "Slice", ["X"], ["Y"], starts=starts, ends=ends, device_option=gc - ) - - def slice_ref(X): - slc = [slice(None)] * X.ndim - slc[dim] = slice(slice_start, slice_end) - return [X[slc]] - inputs = [X] - else: - op = core.CreateOperator( - "Slice", ["X", "starts", "ends"], ["Y"], device_option=gc - ) - - def slice_ref(x, starts, ends): - slc = [slice(None)] * x.ndim - slc[dim] = slice(slice_start, slice_end) - return [x[slc]] - inputs = [X, starts, ends] - - self.assertReferenceChecks(gc, op, inputs, slice_ref) - self.assertDeviceChecks(dc, op, inputs, [0]) - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=inputs, - outputs_to_check=0, - outputs_with_grads=[0], - ) - - @given(ndims=st.integers(min_value=1, max_value=10), **hu.gcs) - @settings(deadline=10000) - def test_resize_like(self, ndims, gc, dc): - X = np.zeros((ndims * 2, )) - Y = np.zeros((ndims, 2)) - - op = core.CreateOperator( - "ResizeLike", ["X", "Y"], ["Z"], - ) - - def resize_like(X, Y): - return 
[X.reshape(Y.shape)] - - self.assertDeviceChecks(dc, op, [X, Y], [0]) - self.assertReferenceChecks(gc, op, [X, Y], resize_like, ensure_outputs_are_inferred=True) - - @given(dtype=st.sampled_from([np.float32, np.int32]), - ndims=st.integers(min_value=1, max_value=5), - seed=st.integers(min_value=0, max_value=65536), - null_axes=st.booleans(), - engine=st.sampled_from(['CUDNN', None]), - **hu.gcs) - @settings(deadline=10000) - def test_transpose(self, dtype, ndims, seed, null_axes, engine, gc, dc): - if (gc.device_type == caffe2_pb2.CUDA and engine == "CUDNN"): - # cudnn 5.1 does not support int. - assume(workspace.GetCuDNNVersion() >= 6000 or dtype != np.int32) - - dims = (np.random.rand(ndims) * 16 + 1).astype(np.int32) - X = (np.random.rand(*dims) * 16).astype(dtype) - - if null_axes: - axes = None - op = core.CreateOperator( - "Transpose", - ["input"], ["output"], - engine=engine) - else: - np.random.seed(int(seed)) - axes = [int(v) for v in list(np.random.permutation(X.ndim))] - op = core.CreateOperator( - "Transpose", - ["input"], ["output"], - axes=axes, - engine=engine) - - def transpose_ref(x, axes): - return (np.transpose(x, axes),) - - self.assertReferenceChecks(gc, op, [X, axes], - transpose_ref) - - @given(m=st.integers(5, 10), n=st.integers(5, 10), - o=st.integers(5, 10), nans=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_nan_check(self, m, n, o, nans, gc, dc): - other = np.array([1, 2, 3]).astype(np.float32) - X = np.random.rand(m, n, o).astype(np.float32) - if nans: - x_nan = np.random.randint(0, m) - y_nan = np.random.randint(0, n) - z_nan = np.random.randint(0, o) - X[x_nan, y_nan, z_nan] = float('NaN') - - # print('nans: {}'.format(nans)) - # print(X) - - def nan_reference(X, Y): - if not np.isnan(X).any(): - return [X] - else: - return [np.array([])] - - op = core.CreateOperator( - "NanCheck", - ["X", "other"], - ["Y"] - ) - - try: - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X, other], - reference=nan_reference, - ) - if nans: - self.assertTrue(False, "Did not fail when presented with NaN!") - except RuntimeError: - self.assertTrue(nans, "No NaNs but failed") - - try: - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=[X], - outputs_to_check=0, - outputs_with_grads=[0], - ) - if nans: - self.assertTrue(False, "Did not fail when gradient had NaN!") - except RuntimeError: - pass - - @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), - d=st.integers(2, 3), **hu.gcs) - def test_elementwise_max(self, n, m, d, gc, dc): - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) - Z = np.random.rand(n, m, d).astype(np.float32) - inputs = [X, Y, Z] - - def max_op(X, Y, Z): - return [np.maximum(np.maximum(X, Y), Z)] - - op = core.CreateOperator( - "Max", - ["X", "Y", "Z"], - ["mx"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=max_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - - @given(n=st.integers(4, 5), m=st.integers(6, 7), - d=st.integers(2, 3), **hu.gcs) - @settings(deadline=10000) - def test_elementwise_max_grad(self, n, m, d, gc, dc): - go = np.random.rand(n, m, d).astype(np.float32) - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) - Z = np.random.rand(n, m, d).astype(np.float32) - mx = np.maximum(np.maximum(X, Y), Z) - inputs = [mx, go, X, Y, Z] - - def max_grad_op(mx, go, X, Y, Z): - def mx_grad(a): - return go * (mx == a) - - return [mx_grad(a) for a in [X, Y, Z]] - 
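# A quick standalone check of the routing rule defined above: the upstream
# gradient flows to every input that attains the elementwise maximum, so tied
# inputs each receive a full copy. Values are illustrative only.
import numpy as np
X = np.array([1.0, 5.0])
Y = np.array([5.0, 2.0])
mx = np.maximum(X, Y)
go = np.ones_like(mx)
gX, gY = go * (mx == X), go * (mx == Y)
assert gX.tolist() == [0.0, 1.0] and gY.tolist() == [1.0, 0.0]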
- op = core.CreateOperator( - "MaxGradient", - ["mx", "go", "X", "Y", "Z"], - ["gX", "gY", "gZ"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=max_grad_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - - @serial.given(n=st.integers(4, 5), m=st.integers(6, 7), - d=st.integers(2, 3), **hu.gcs) - def test_elementwise_min(self, n, m, d, gc, dc): - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) - Z = np.random.rand(n, m, d).astype(np.float32) - inputs = [X, Y, Z] - - def min_op(X, Y, Z): - return [np.minimum(np.minimum(X, Y), Z)] - - op = core.CreateOperator( - "Min", - ["X", "Y", "Z"], - ["mx"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=min_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - - @given(n=st.integers(4, 5), m=st.integers(6, 7), - d=st.integers(2, 3), **hu.gcs) - @settings(deadline=10000) - def test_elementwise_min_grad(self, n, m, d, gc, dc): - go = np.random.rand(n, m, d).astype(np.float32) - X = np.random.rand(n, m, d).astype(np.float32) - Y = np.random.rand(n, m, d).astype(np.float32) - Z = np.random.rand(n, m, d).astype(np.float32) - mx = np.minimum(np.minimum(X, Y), Z) - inputs = [mx, go, X, Y, Z] - - def min_grad_op(mx, go, X, Y, Z): - def mx_grad(a): - return go * (mx == a) - - return [mx_grad(a) for a in [X, Y, Z]] - - op = core.CreateOperator( - "MinGradient", - ["mx", "go", "X", "Y", "Z"], - ["gX", "gY", "gZ"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=min_grad_op, - ) - self.assertDeviceChecks(dc, op, inputs, [0, 1, 2]) - - @given( - n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), - in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - seed=st.integers(min_value=0, max_value=65535), - dtype=st.sampled_from([np.int32, np.int64, np.float32]), - **hu.gcs) - @settings(deadline=10000) - def test_sum( - self, n, m, d, in_place, engine, seed, dtype, gc, dc): - input_names = [] - input_vars = [] - np.random.seed(seed) - for i in range(m): - X_name = 'X' + str(i) - input_names.extend([X_name]) - var = np.random.rand(n, d).astype(dtype) - vars()[X_name] = var - input_vars.append(var) - - def sum_op_ref(*args): - res = np.zeros((n, d)) - for i in range(m): - res = res + args[i] - return (res, ) - - op = core.CreateOperator( - "Sum", - input_names, - [input_names[0]] if in_place else ['Y'], - engine=engine, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=input_vars, - reference=sum_op_ref, - ) - self.assertDeviceChecks(dc, op, input_vars, [0]) - - @given( - inputs=hu.lengths_tensor().flatmap( - lambda pair: st.tuples( - st.just(pair[0]), - st.just(pair[1]), - hu.dims(max_value=len(pair[1])), - ) - ).flatmap( - lambda tup: st.tuples( - st.just(tup[0]), - st.just(tup[1]), - hu.arrays( - tup[2], dtype=np.int32, - elements=st.integers( - min_value=0, max_value=len(tup[1]) - 1)), - ) - ), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_gather(self, inputs, gc, dc): - items = inputs[0] - lengths = inputs[1] - indices = inputs[2] - - def lengths_gather_op(items, lengths, indices): - ends = np.cumsum(lengths) - return [np.concatenate( - list(items[ends[i] - lengths[i]:ends[i]] for i in indices))] - - op = core.CreateOperator( - "LengthsGather", - ["items", "lengths", "indices"], - ["output"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[items, lengths, indices], - 
reference=lengths_gather_op, - ) - - @given( - inputs=hu.lengths_tensor(), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_lengths_to_ranges(self, inputs, gc, dc): - _, lengths = inputs - - def lengths_to_ranges_op(lengths): - return [ - [[x, y] for x, y in zip(np.cumsum(np.append([0], lengths)), - lengths)] - ] - - op = core.CreateOperator( - "LengthsToRanges", - ["lengths"], - ["output"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[lengths], - reference=lengths_to_ranges_op, - ) - - # Test shape inference logic - net = core.Net("test_shape_inference") - - workspace.FeedBlob("lengths", lengths) - output = net.LengthsToRanges( - ["lengths"], - ["output"] - ) - (shapes, types) = workspace.InferShapesAndTypes([net]) - workspace.RunNetOnce(net) - self.assertEqual(shapes[output], list(workspace.blobs[output].shape)) - self.assertEqual(shapes[output], list(lengths.shape) + [2]) - self.assertEqual(types[output], core.DataType.INT32) - - @given(**hu.gcs) - @settings(deadline=None, max_examples=50) - def test_size_op(self, gc, dc): - X = np.array([[1, 2], [3, 4]]).astype(np.float32) - - def size_op(tensor): - return [np.prod(tensor.shape)] - - op = core.CreateOperator( - "Size", - ["X"], - ["output"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=[X], - reference=size_op, - ) - - def test_alias_op(self): - """ Don't use hypothesis because there are only 2 cases to check""" - for size in [0, 5]: - X = np.arange(size).astype(np.float32) - workspace.FeedBlob('X', X) - - op = core.CreateOperator( - "Alias", - ["X"], - ["Y"] - ) - workspace.RunOperatorOnce(op) - Y = workspace.FetchBlob('Y') - np.testing.assert_array_equal(X, Y) - - @given(**hu.gcs) - @settings(deadline=10000) - def test_range(self, gc, dc): - names = [ - ('stop_',), - ('start_', 'stop_'), - ('start_', 'stop_', 'step_'), - ] - # Most random values aren't great here, so use a fixed set instead of - # hypothesis. 
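# The reference used in the loop below is plain np.arange, so the three Range
# overloads behave like these illustrative calls:
import numpy as np
assert np.arange(10).tolist() == list(range(10))  # stop only
assert np.arange(2, 10, 3).tolist() == [2, 5, 8]  # start, stop, step
assert np.arange(10.0, 5.0, -1.0).tolist() == [10.0, 9.0, 8.0, 7.0, 6.0]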
- for inputs in ( - (10,), - (np.float32(10.0),), - (0,), - (0, 0), - (10., 5.0, -1.), - (2, 10000), - (2, 10000, 20000), - (2, 10000, -1), - ): - inputs = [np.array(v) for v in inputs] - op = core.CreateOperator( - "Range", - names[len(inputs) - 1], - ["Y"] - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=lambda *x: [np.arange(*x)], - ) - self.assertDeviceChecks(dc, op, inputs, [0]) - - inputs = (np.array(0), np.array(10), np.array(0)) - op = core.CreateOperator( - "Range", - names[len(inputs) - 1], - ["Y"] - ) - with self.assertRaisesRegex(RuntimeError, 'Step size cannot be 0'): - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=inputs, - reference=lambda *x: [np.arange(*x)], - ) diff --git a/caffe2/python/operator_test/video_input_op_test.py b/caffe2/python/operator_test/video_input_op_test.py deleted file mode 100644 index 24f9e57434d4..000000000000 --- a/caffe2/python/operator_test/video_input_op_test.py +++ /dev/null @@ -1,294 +0,0 @@ - - -import os -import shutil -import sys -import tempfile -import unittest - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import model_helper, workspace - - -try: - import lmdb -except ImportError as e: - raise unittest.SkipTest("python-lmdb is not installed") from e - - -class VideoInputOpTest(unittest.TestCase): - def create_a_list(self, output_file, line, n): - # create a list that repeat a line n times - # used for creating a list file for simple test input - with open(output_file, "w") as file: - for _i in range(n): - file.write(line) - - def create_video_db(self, list_file, output_file, use_list=False): - # Write to lmdb database... - LMDB_MAP_SIZE = 1 << 40 # MODIFY - env = lmdb.open(output_file, map_size=LMDB_MAP_SIZE) - total_size = 0 - - file_name = [] - start_frame = [] - label = [] - index = 0 - - with env.begin(write=True) as txn: - with open(list_file, "r") as data: - for line in data: - p = line.split() - file_name = p[0] - start_frame = int(p[1]) - label = int(p[2]) - - if not use_list: - with open(file_name, mode="rb") as file: - video_data = file.read() - else: - video_data = file_name - - tensor_protos = caffe2_pb2.TensorProtos() - video_tensor = tensor_protos.protos.add() - video_tensor.data_type = 4 # string data - video_tensor.string_data.append(video_data) - - label_tensor = tensor_protos.protos.add() - label_tensor.data_type = 2 - label_tensor.int32_data.append(label) - - start_frame_tensor = tensor_protos.protos.add() - start_frame_tensor.data_type = 2 - start_frame_tensor.int32_data.append(start_frame) - - txn.put( - "{}".format(index).encode("ascii"), - tensor_protos.SerializeToString(), - ) - index = index + 1 - total_size = total_size + len(video_data) + sys.getsizeof(int) - return total_size - - # sample one clip randomly from the video - def test_rgb_with_temporal_jittering(self): - random_label = np.random.randint(0, 100) - VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi" - if not os.path.exists(VIDEO): - raise unittest.SkipTest("Missing data") - temp_list = tempfile.NamedTemporaryFile(delete=False).name - line_str = "{} 0 {}\n".format(VIDEO, random_label) - self.create_a_list(temp_list, line_str, 16) - video_db_dir = tempfile.mkdtemp() - - self.create_video_db(temp_list, video_db_dir) - model = model_helper.ModelHelper(name="Video Loader from LMDB") - reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb") - - # build the model - model.net.VideoInput( - reader, - ["data", "label"], - name="data", - 
batch_size=16, - clip_per_video=1, - crop_size=112, - scale_w=171, - scale_h=128, - length_rgb=8, - sampling_rate_rgb=1, - decode_type=0, - video_res_type=0, # scale by scale_h and scale_w - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - data = workspace.FetchBlob("data") - label = workspace.FetchBlob("label") - - np.testing.assert_equal(label, random_label) - np.testing.assert_equal(data.shape, [16, 3, 8, 112, 112]) - os.remove(temp_list) - shutil.rmtree(video_db_dir) - - # sample multiple clips uniformly from the video - def test_rgb_with_uniform_sampling(self): - random_label = np.random.randint(0, 100) - clip_per_video = np.random.randint(2, 11) - VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi" - if not os.path.exists(VIDEO): - raise unittest.SkipTest("Missing data") - temp_list = tempfile.NamedTemporaryFile(delete=False).name - line_str = "{} 0 {}\n".format(VIDEO, random_label) - self.create_a_list(temp_list, line_str, 16) - video_db_dir = tempfile.mkdtemp() - - self.create_video_db(temp_list, video_db_dir) - model = model_helper.ModelHelper(name="Video Loader from LMDB") - reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb") - - # build the model - model.net.VideoInput( - reader, - ["data", "label"], - name="data", - batch_size=3, - clip_per_video=clip_per_video, - crop_size=112, - scale_w=171, - scale_h=128, - length_rgb=8, - sampling_rate_rgb=1, - decode_type=1, - video_res_type=0, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - data = workspace.FetchBlob("data") - label = workspace.FetchBlob("label") - - np.testing.assert_equal(label, random_label) - np.testing.assert_equal(data.shape, [3 * clip_per_video, 3, 8, 112, 112]) - os.remove(temp_list) - shutil.rmtree(video_db_dir) - - # test optical flow - def test_optical_flow_with_temporal_jittering(self): - random_label = np.random.randint(0, 100) - VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi" - if not os.path.exists(VIDEO): - raise unittest.SkipTest("Missing data") - temp_list = tempfile.NamedTemporaryFile(delete=False).name - line_str = "{} 0 {}\n".format(VIDEO, random_label) - self.create_a_list(temp_list, line_str, 16) - video_db_dir = tempfile.mkdtemp() - - self.create_video_db(temp_list, video_db_dir) - model = model_helper.ModelHelper(name="Video Loader from LMDB") - reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb") - model.net.VideoInput( - reader, - ["data", "label"], - name="data", - batch_size=16, - clip_per_video=1, - crop_size=112, - scale_w=171, - scale_h=128, - length_of=8, - sampling_rate_of=1, - frame_gap_of=1, - decode_type=0, - video_res_type=0, - get_rgb=False, - get_optical_flow=True, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - data = workspace.FetchBlob("data") - label = workspace.FetchBlob("label") - - np.testing.assert_equal(label, random_label) - np.testing.assert_equal(data.shape, [16, 2, 8, 112, 112]) - os.remove(temp_list) - shutil.rmtree(video_db_dir) - - # test rgb output VideoResType is - # USE_SHORTER_EDGE - def test_rgb_use_shorter_edge(self): - batch_size = 16 - random_label = np.random.randint(0, 100) - VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi" - if not os.path.exists(VIDEO): - raise unittest.SkipTest("Missing data") - temp_list = tempfile.NamedTemporaryFile(delete=False).name - line_str = "{} 0 {}\n".format(VIDEO, random_label) - self.create_a_list(temp_list, line_str, batch_size) - video_db_dir = 
tempfile.mkdtemp() - - self.create_video_db(temp_list, video_db_dir) - model = model_helper.ModelHelper(name="Video Loader from LMDB") - reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb") - model.net.VideoInput( - reader, - ["data", "label"], - name="data", - batch_size=batch_size, - clip_per_video=1, - crop_size=112, - scale_w=171, - scale_h=128, - length_of=8, - frame_gap_of=1, - decode_type=0, - video_res_type=1, # use shorter edge - get_rgb=True, - length_rgb=8, - short_edge=112, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - data = workspace.FetchBlob("data") - label = workspace.FetchBlob("label") - - np.testing.assert_equal(label.shape, [batch_size]) - for i in range(batch_size): - np.testing.assert_equal(label[i], random_label) - np.testing.assert_equal(data.shape, [batch_size, 3, 8, 112, 112]) - os.remove(temp_list) - shutil.rmtree(video_db_dir) - - # test optical flow output VideoResType is - # USE_SHORTER_EDGE - def test_optical_flow_use_shorter_edge(self): - batch_size = 16 - random_label = np.random.randint(0, 100) - VIDEO = "/mnt/vol/gfsdataswarm-oregon/users/trandu/sample.avi" - if not os.path.exists(VIDEO): - raise unittest.SkipTest("Missing data") - temp_list = tempfile.NamedTemporaryFile(delete=False).name - line_str = "{} 0 {}\n".format(VIDEO, random_label) - self.create_a_list(temp_list, line_str, batch_size) - video_db_dir = tempfile.mkdtemp() - - self.create_video_db(temp_list, video_db_dir) - model = model_helper.ModelHelper(name="Video Loader from LMDB") - reader = model.CreateDB("sample", db=video_db_dir, db_type="lmdb") - model.net.VideoInput( - reader, - ["data", "label"], - name="data", - batch_size=batch_size, - clip_per_video=1, - crop_size=112, - scale_w=171, - scale_h=128, - length_of=8, - sampling_rate_of=1, - frame_gap_of=1, - decode_type=0, - video_res_type=1, # use shorter edge - get_rgb=False, - get_optical_flow=True, - short_edge=112, - ) - - workspace.RunNetOnce(model.param_init_net) - workspace.RunNetOnce(model.net) - data = workspace.FetchBlob("data") - label = workspace.FetchBlob("label") - - np.testing.assert_equal(label.shape, [batch_size]) - for i in range(batch_size): - np.testing.assert_equal(label[i], random_label) - np.testing.assert_equal(data.shape, [batch_size, 2, 8, 112, 112]) - os.remove(temp_list) - shutil.rmtree(video_db_dir) - - -if __name__ == "__main__": - unittest.main() diff --git a/caffe2/python/operator_test/weight_scale_test.py b/caffe2/python/operator_test/weight_scale_test.py deleted file mode 100644 index 5cdc11eb4d11..000000000000 --- a/caffe2/python/operator_test/weight_scale_test.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-############################################################################## - - - - - - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import functools -from hypothesis import given -import hypothesis.strategies as st -import numpy as np - -class TestWeightScale(hu.HypothesisTestCase): - @given(inputs=hu.tensors(n=1), - ITER=st.integers(min_value=0, max_value=100), - stepsize=st.integers(min_value=20, max_value=50), - upper_bound_iter=st.integers(min_value=5, max_value=100), - scale=st.floats(min_value=0.01, max_value=0.99, allow_nan=False, allow_infinity=False), - **hu.gcs) - def test_weight_scale(self, inputs, ITER, stepsize, upper_bound_iter, scale, gc, dc): - ITER = np.array([ITER], dtype=np.int64) - op = core.CreateOperator( - "WeightScale", ["w", "iter"], ["nw"], stepsize=stepsize, upper_bound_iter=upper_bound_iter, scale=scale) - - def ref_weight_scale(w, iter, stepsize, upper_bound_iter, scale): - iter = iter + 1 - return [w * scale if iter % stepsize == 0 and iter < upper_bound_iter else w] - - input_device_options = {'iter': hu.cpu_do} - self.assertReferenceChecks( - gc, - op, - [inputs[0], ITER], - functools.partial(ref_weight_scale, stepsize=stepsize, upper_bound_iter=upper_bound_iter, scale=scale), - input_device_options=input_device_options - ) diff --git a/caffe2/python/operator_test/weighted_multi_sample_test.py b/caffe2/python/operator_test/weighted_multi_sample_test.py deleted file mode 100644 index 830a9f9849c7..000000000000 --- a/caffe2/python/operator_test/weighted_multi_sample_test.py +++ /dev/null @@ -1,69 +0,0 @@ - - - - - -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - -from caffe2.python import core -from caffe2.python import workspace -import caffe2.python.hypothesis_test_util as hu - - -class TestWeightedMultiSample(hu.HypothesisTestCase): - @given( - num_samples=st.integers(min_value=0, max_value=128), - data_len=st.integers(min_value=0, max_value=10000), - **hu.gcs_cpu_only - ) - def test_weighted_multi_sample(self, num_samples, data_len, gc, dc): - weights = np.zeros((data_len)) - expected_indices = [] - if data_len > 0: - weights[-1] = 1.5 - expected_indices = np.repeat(data_len - 1, num_samples) - - workspace.FeedBlob("weights", weights.astype(np.float32)) - - op = core.CreateOperator( - "WeightedMultiSampling", - ["weights"], - ["sample_indices"], - num_samples=num_samples, - ) - workspace.RunOperatorOnce(op) - result_indices = workspace.FetchBlob("sample_indices") - np.testing.assert_allclose(expected_indices, result_indices) - self.assertDeviceChecks( - dc, - op, - [weights.astype(np.float32)], - [0] - ) - - # test shape input - shape = np.zeros((num_samples)) - workspace.FeedBlob("shape", shape) - op2 = core.CreateOperator( - "WeightedMultiSampling", - ["weights", "shape"], - ["sample_indices_2"] - ) - workspace.RunOperatorOnce(op2) - result_indices_2 = workspace.FetchBlob("sample_indices_2") - if data_len > 0: - assert len(result_indices_2) == num_samples - for i in range(num_samples): - assert 0 <= result_indices_2[i] < data_len - else: - assert len(result_indices_2) == 0 - - self.assertDeviceChecks(dc, op2, [weights.astype(np.float32), shape], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/weighted_sample_test.py b/caffe2/python/operator_test/weighted_sample_test.py deleted file mode 100644 index 032e9e9d755e..000000000000 --- a/caffe2/python/operator_test/weighted_sample_test.py +++ /dev/null @@ -1,80 
+0,0 @@ - - - - - -import numpy as np - -from hypothesis import given -import hypothesis.strategies as st - -from caffe2.python import core -from caffe2.python import workspace -import caffe2.python.hypothesis_test_util as hu - - -class TestWeightedSample(hu.HypothesisTestCase): - @given( - batch=st.integers(min_value=0, max_value=128), - weights_len=st.integers(min_value=0, max_value=128), - **hu.gcs - ) - def test_weighted_sample(self, batch, weights_len, gc, dc): - - weights = np.zeros((batch, weights_len)) - values = np.zeros((batch, weights_len)) - rand_indices = [] - rand_values = [] - if batch > 0 and weights_len > 0: - for i in range(batch): - rand_tmp = np.random.randint(0, weights_len) - rand_val = np.random.rand() - rand_indices.append(rand_tmp) - rand_values.append(rand_val) - weights[i, rand_tmp] = 1.0 - values[i, rand_tmp] = rand_val - - rand_indices = np.array(rand_indices, dtype=np.float32) - rand_values = np.array(rand_values, dtype=np.float32) - workspace.FeedBlob("weights", weights.astype(np.float32)) - workspace.FeedBlob("values", values.astype(np.float32)) - - # output both indices and values - op = core.CreateOperator( - "WeightedSample", ["weights", "values"], - ["sample_indices", "sample_values"] - ) - workspace.RunOperatorOnce(op) - result_indices = workspace.FetchBlob("sample_indices") - result_values = workspace.FetchBlob("sample_values") - if batch > 0 and weights_len > 0: - for i in range(batch): - np.testing.assert_allclose(rand_indices[i], result_indices[i]) - np.testing.assert_allclose(rand_values[i], result_values[i]) - else: - np.testing.assert_allclose(rand_indices, result_indices) - np.testing.assert_allclose(rand_values, result_values) - self.assertDeviceChecks( - dc, - op, - [weights.astype(np.float32), values.astype(np.float32)], - [0, 1] - ) - - # output indices only - op2 = core.CreateOperator( - "WeightedSample", ["weights"], ["sample_indices_2"] - ) - workspace.RunOperatorOnce(op2) - result = workspace.FetchBlob("sample_indices_2") - if batch > 0 and weights_len > 0: - for i in range(batch): - np.testing.assert_allclose(rand_indices[i], result[i]) - else: - np.testing.assert_allclose(rand_indices, result) - self.assertDeviceChecks(dc, op2, [weights.astype(np.float32)], [0]) - - -if __name__ == "__main__": - import unittest - unittest.main() diff --git a/caffe2/python/operator_test/weighted_sum_test.py b/caffe2/python/operator_test/weighted_sum_test.py deleted file mode 100644 index fbbe2a6bf6d8..000000000000 --- a/caffe2/python/operator_test/weighted_sum_test.py +++ /dev/null @@ -1,101 +0,0 @@ - - - - - -from caffe2.python import core -from hypothesis import given, settings - -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial -import hypothesis.strategies as st -import numpy as np - - -class TestWeightedSumOp(serial.SerializedTestCase): - - @given( - n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), - in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), - seed=st.integers(min_value=0, max_value=65535), - **hu.gcs) - @settings(deadline=10000) - def test_weighted_sum( - self, n, m, d, in_place, engine, seed, gc, dc): - input_names = [] - input_vars = [] - np.random.seed(seed) - for i in range(m): - X_name = 'X' + str(i) - w_name = 'w' + str(i) - input_names.extend([X_name, w_name]) - var = np.random.rand(n, d).astype(np.float32) - vars()[X_name] = var - input_vars.append(var) - var = np.random.rand(1).astype(np.float32) - vars()[w_name] = var - 
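# For orientation while the inputs are assembled: the reference computed below
# is simply Y = sum_i w_i * X_i with scalar weights. An illustrative standalone
# check of that formula:
import numpy as np
X0, X1 = np.ones((2, 3)), 2 * np.ones((2, 3))
w0, w1 = 0.5, 0.25
assert np.allclose(w0 * X0 + w1 * X1, 1.0)  # 0.5*1 + 0.25*2 == 1.0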
input_vars.append(var) - - def weighted_sum_op_ref(*args): - res = np.zeros((n, d)) - for i in range(m): - res = res + args[2 * i + 1] * args[2 * i] - - return (res, ) - - op = core.CreateOperator( - "WeightedSum", - input_names, - [input_names[0]] if in_place else ['Y'], - engine=engine, - ) - - self.assertReferenceChecks( - device_option=gc, - op=op, - inputs=input_vars, - reference=weighted_sum_op_ref, - ) - self.assertDeviceChecks(dc, op, input_vars, [0]) - - @given(n=st.integers(1, 8), m=st.integers(1, 10), d=st.integers(1, 4), - grad_on_w=st.booleans(), - seed=st.integers(min_value=0, max_value=65535), **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_weighted_sum_grad( - self, n, m, d, grad_on_w, seed, gc, dc): - input_names = [] - input_vars = [] - np.random.seed(seed) - for i in range(m): - X_name = 'X' + str(i) - w_name = 'w' + str(i) - input_names.extend([X_name, w_name]) - var = np.random.rand(n, d).astype(np.float32) - vars()[X_name] = var - input_vars.append(var) - var = np.random.rand(1).astype(np.float32) - vars()[w_name] = var - input_vars.append(var) - - op = core.CreateOperator( - "WeightedSum", - input_names, - ['Y'], - grad_on_w=grad_on_w, - ) - - output_to_check_grad = ( - range(2 * m) if grad_on_w else range(0, 2 * m, 2)) - for i in output_to_check_grad: - self.assertGradientChecks( - device_option=gc, - op=op, - inputs=input_vars, - outputs_to_check=i, - outputs_with_grads=[0], - ) - - -if __name__ == "__main__": - serial.testWithArgs() diff --git a/caffe2/python/operator_test/wngrad_test.py b/caffe2/python/operator_test/wngrad_test.py deleted file mode 100644 index 0a1f0405e92a..000000000000 --- a/caffe2/python/operator_test/wngrad_test.py +++ /dev/null @@ -1,218 +0,0 @@ - - - - - -import functools - -import logging - -import hypothesis -from hypothesis import given, settings, HealthCheck -import hypothesis.strategies as st -import numpy as np - -from caffe2.python import core -import caffe2.python.hypothesis_test_util as hu -import caffe2.python.serialized_test.serialized_test_util as serial - -logger = logging.getLogger(__name__) - -def ref_wngrad(param_in, seq_b_in, grad, lr, epsilon, - output_effective_lr=False, - output_effective_lr_and_update=False): - # helper functions for wngrad operator test - seq_b_out = seq_b_in + 1.0 / (seq_b_in + epsilon) * np.sum(grad * grad) - effective_lr = lr / (seq_b_in + epsilon) - grad_adj = effective_lr * grad - param_out = param_in + grad_adj - if output_effective_lr_and_update: - return (param_out.astype(np.float32), seq_b_out.astype(np.float32), - effective_lr.astype(np.float32), - grad_adj.astype(np.float32)) - elif output_effective_lr: - return (param_out.astype(np.float32), seq_b_out.astype(np.float32), - effective_lr.astype(np.float32)) - return (param_out.astype(np.float32), seq_b_out.astype(np.float32)) - - -def wngrad_sparse_test_helper(parent_test, inputs, seq_b, lr, epsilon, - engine, gc, dc): - # helper functions for wngrad operator test - param, grad = inputs - seq_b = np.array([seq_b, ], dtype=np.float32) - lr = np.array([lr], dtype=np.float32) - - # Create an indexing array containing values that are lists of indices, - # which index into grad - indices = np.random.choice(np.arange(grad.shape[0]), - size=np.random.randint(grad.shape[0]), replace=False) - - # Sparsify grad - grad = grad[indices] - - op = core.CreateOperator( - "SparseWngrad", - ["param", "seq_b", "indices", "grad", "lr"], - ["param", "seq_b"], - epsilon=epsilon, - engine=engine, - device_option=gc) - - def ref_sparse(param, seq_b, 
indices, grad, lr): - param_out = np.copy(param) - seq_b_out = np.copy(seq_b) - seq_b_out = seq_b + 1.0 / seq_b * np.sum(grad * grad) - for i, index in enumerate(indices): - param_out[index] = param[index] + lr / (seq_b + epsilon) * grad[i] - return (param_out, seq_b_out) - - logger.info('test_sparse_wngrad with full precision embedding') - seq_b_i = seq_b.astype(np.float32) - param_i = param.astype(np.float32) - - parent_test.assertReferenceChecks( - gc, op, [param_i, seq_b_i, indices, grad, lr], - ref_sparse - ) - - -class TestWngrad(serial.SerializedTestCase): - @given(inputs=hu.tensors(n=2), - seq_b=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_wngrad_dense_base(self, inputs, seq_b, lr, epsilon, gc, dc): - param, grad = inputs - seq_b = np.array([seq_b, ], dtype=np.float32) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Wngrad", - ["param", "seq_b", "grad", "lr"], - ["param", "seq_b"], - epsilon=epsilon, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, op, - [param, seq_b, grad, lr], - functools.partial(ref_wngrad, epsilon=epsilon)) - - @given(inputs=hu.tensors(n=2), - seq_b=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_wngrad_dense_output_effective_lr(self, inputs, seq_b, - lr, epsilon, gc, dc): - param, grad = inputs - seq_b = np.array([seq_b, ], dtype=np.float32) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Wngrad", - ["param", "seq_b", "grad", "lr"], - ["param", "seq_b", "effective_lr"], - epsilon=epsilon, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, op, - [param, seq_b, grad, lr], - functools.partial(ref_wngrad, epsilon=epsilon, - output_effective_lr=True)) - - @given(inputs=hu.tensors(n=2), - seq_b=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_wngrad_dense_output_effective_lr_and_update( - self, inputs, seq_b, lr, epsilon, gc, dc): - param, grad = inputs - seq_b = np.abs(np.array([seq_b, ], dtype=np.float32)) - lr = np.array([lr], dtype=np.float32) - - op = core.CreateOperator( - "Wngrad", - ["param", "seq_b", "grad", "lr"], - ["param", "seq_b", "effective_lr", "update"], - epsilon=epsilon, - device_option=gc, - ) - - self.assertReferenceChecks( - gc, op, - [param, seq_b, grad, lr], - functools.partial(ref_wngrad, epsilon=epsilon, - output_effective_lr_and_update=True)) - - # Suppress filter_too_much health check. - # Likely caused by `assume` call falling through too often.
- @settings(suppress_health_check=[HealthCheck.filter_too_much], deadline=10000) - @given(inputs=hu.tensors(n=2), - seq_b=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - def test_sparse_wngrad(self, inputs, seq_b, lr, epsilon, gc, dc): - return wngrad_sparse_test_helper(self, inputs, seq_b, lr, epsilon, - None, gc, dc) - - @given(inputs=hu.tensors(n=1), - lr=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - seq_b=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - epsilon=st.floats(min_value=0.01, max_value=0.99, - allow_nan=False, allow_infinity=False), - **hu.gcs_cpu_only) - @settings(deadline=10000) - def test_sparse_wngrad_empty(self, inputs, seq_b, lr, epsilon, gc, dc): - param = inputs[0] - seq_b = np.array([seq_b, ], dtype=np.float32) - lr = np.array([lr], dtype=np.float32) - - grad = np.empty(shape=(0,) + param.shape[1:], dtype=np.float32) - indices = np.empty(shape=(0,), dtype=np.int64) - - hypothesis.note('indices.shape: %s' % str(indices.shape)) - - op = core.CreateOperator( - "SparseWngrad", - ["param", "seq_b", "indices", "grad", "lr"], - ["param", "seq_b"], - epsilon=epsilon, - device_option=gc) - - def ref_sparse(param, seq_b, indices, grad, lr): - param_out = np.copy(param) - seq_b_out = np.copy(seq_b) - return (param_out, seq_b_out) - - print('test_sparse_wngrad_empty with full precision embedding') - seq_b_i = seq_b.astype(np.float32) - param_i = param.astype(np.float32) - - self.assertReferenceChecks( - gc, op, [param_i, seq_b_i, indices, grad, lr], ref_sparse - ) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py deleted file mode 100644 index 899573d04e25..000000000000 --- a/caffe2/python/optimizer.py +++ /dev/null @@ -1,2368 +0,0 @@ -# @package optimizer -# Module caffe2.python.optimizer - - -import copy -import logging -from collections import defaultdict, namedtuple -from typing import Any, Dict - -import numpy as np -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, scope, utils, workspace -from caffe2.python.modeling import parameter_info -from past.builtins import basestring - - -_LEARNING_RATE_INJECTION = "lr_injection" - -AuxOptimizerParams = namedtuple("AuxOptimizerParams", ["local", "shared"]) -_optimizer_instance_count = defaultdict(int) - -FP16_ENGINES = ["SIMD_Q_FP16", "SIMD_Q_STOC_FP16", "SIMD_Q_STOC_MKL_FP16"] - -logger = logging.getLogger(__name__) - -def reset_optimizer_instance_count(): - """ - This function clears _optimizer_instance_count and keeps it empty. - This is needed in some situations where the optimizer instance count - might not reset even though the workspace is reset. - """ - _optimizer_instance_count.clear() - - -class Optimizer: - def __init__(self): - self._aux_params = AuxOptimizerParams(local=[], shared=[]) - self._instance_num = _optimizer_instance_count[self.__class__.__name__] - _optimizer_instance_count[self.__class__.__name__] += 1 - self._lr_multiplier = None - self._local_lr_multiplier = None - self._local_lr_multiplier_on_gpu = False - self._use_dedicated_lr_iteration_counter = False - - """ - Adds optimization operators to the net for given parameter and its gradient - Parameter is specified by either 'param' being a ParameterInfo object.
- In this case param.grad has to be set - - Or by 'param' being a BlobReference and 'grad' being a BlobReference for its - gradient. - """ - - def __call__(self, net, param_init_net, param, grad=None): - if grad is None: - assert isinstance( - param, parameter_info.ParameterInfo - ), "Expected parameter to be of type ParameterInfo, got {}".format(param) - assert param.grad is not None - else: - if isinstance(param, basestring): - param = core.BlobReference(param) - param = parameter_info.ParameterInfo(param_id=None, param=param, grad=grad) - - self._run(net, param_init_net, param) - - def _run(self, net, param_init_net, param_info): - raise Exception("Not Implemented") - - def get_cpu_blob_name(self, base_str, node_name=""): - classname = self.__class__.__name__ - return "%s_%d_%s%s_cpu" % (classname, self._instance_num, base_str, node_name) - - def get_gpu_blob_name(self, base_str, gpu_id, node_name): - classname = self.__class__.__name__ - return "%s_%d_%s%s_gpu%d" % ( - classname, - self._instance_num, - base_str, - node_name, - gpu_id, - ) - - @property - def attributes(self): - # return a dict that contains attributes related to init args only - attr = copy.deepcopy(self.__dict__) - del attr["_instance_num"] - return attr - - @property - def use_dedicated_lr_iteration_counter(self): - return self._use_dedicated_lr_iteration_counter - - @use_dedicated_lr_iteration_counter.setter - def use_dedicated_lr_iteration_counter(self, val): - self._use_dedicated_lr_iteration_counter = val - - def make_unique_blob_name(self, base_str): - """ - Returns a blob name that will be unique to the current device - and optimizer instance. - """ - current_scope = scope.CurrentDeviceScope() - if current_scope is None: - return self.get_cpu_blob_name(base_str) - - if core.IsGPUDeviceType(current_scope.device_type): - return self.get_gpu_blob_name( - base_str, current_scope.device_id, current_scope.node_name - ) - else: - return self.get_cpu_blob_name(base_str, current_scope.node_name) - - def build_lr( - self, - net, - param_init_net, - base_learning_rate, - learning_rate_blob=None, - policy="fixed", - iter_val=0, - **kwargs - ): - if learning_rate_blob is None: - learning_rate_blob = self.make_unique_blob_name("lr") - - if self._use_dedicated_lr_iteration_counter: - iteration = utils.BuildUniqueMutexIter( - param_init_net, - net, - iter=utils.OPTIMIZER_ITERATION_LR_NAME, - iter_mutex=utils.ITERATION_MUTEX_LR_NAME, - iter_val=iter_val, - ) - logger.info(f"Created dedicated learning rate iteration counter: {iteration}") - else: - iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=iter_val) - - if not net.BlobIsDefined(learning_rate_blob): - # There is one interesting thing here: since we are minimizing, we are - # doing "descent" so the learning rate is set to be negative. 
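# [Editor's note, not part of the deleted file: a hedged sketch of the sign
# convention the comment above describes. Because base_lr is passed in negated,
# the LearningRate blob already carries the minus sign, so update sites in this
# module can be written as additions, e.g. SgdOptimizer's dense update below:
#   net.WeightedSum([param, ONE, grad, lr], param)  # param += lr * grad, lr < 0
# which is ordinary gradient descent.]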
- lr = net.LearningRate( - [iteration], - learning_rate_blob, - base_lr=-base_learning_rate, - policy=policy, - **kwargs - ) - else: - lr = net.GetBlobRef(learning_rate_blob) - - if self._lr_multiplier is not None: - lr_multiplier = net.CopyFromCPUInput( - self._lr_multiplier, self.make_unique_blob_name("lr_multiplier") - ) - - lr = net.Mul( - [lr, lr_multiplier], - self.make_unique_blob_name("scaled_lr"), - broadcast=1, - ) - - if self._local_lr_multiplier is not None: - current_scope = scope.CurrentDeviceScope() - if ( - current_scope is not None - and core.IsGPUDeviceType(current_scope.device_type) - and not self._local_lr_multiplier_on_gpu - ): - local_lr_multiplier = net.CopyFromCPUInput( - self._local_lr_multiplier, - self.make_unique_blob_name("local_lr_multiplier"), - ) - else: - local_lr_multiplier = self._local_lr_multiplier - - lr = net.Mul( - [lr, local_lr_multiplier], - self.make_unique_blob_name("local_scaled_lr"), - broadcast=1, - ) - - return lr, iteration - - def build_non_lr_iter( - self, - net, - param_init_net, - iter_val=0, - ): - assert ( - self._use_dedicated_lr_iteration_counter - ), "This method should only be called when the dedicated learning rate iteration counter is used." - - iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=iter_val) - logger.info(f"Created iteration counter for non-learning-rate purposes: {iteration}") - - # We need to create a dummy learning rate operator to enforce that the - # iteration counter blob is placed on the trainer nodes. Otherwise, - # the Automatic Device Placement (ADP) algorithm for Hierarchical - # Training (HT) will encounter issues distributing blobs across group - # parameter servers. Note that this learning rate operator will not be - # used for any other purpose. - learning_rate_blob = self.make_unique_blob_name("iter_placement_hint") - if not net.BlobIsDefined(learning_rate_blob): - net.LearningRate( - [iteration], - learning_rate_blob, - base_lr=1.0, - policy="fixed", - ) - - return iteration - - def add_lr_multiplier(self, lr_multiplier): - """ - Set the global learning rate multiplier. If a multiplier already - existed, this will overwrite the existing multiplier. The multiplier is - used for all future calls to _run(), unless it is overwritten. - """ - self._lr_multiplier = lr_multiplier - - def _add_local_lr_multiplier(self, local_lr_multiplier, is_gpu_blob=False): - """ - Set the local learning rate multiplier. This local multiplier is - multiplied with the global learning rate multiplier if it exists. As - with the global learning rate multiplier, this multiplier will be - used for all future calls to _run(), so please call - _clear_local_lr_multiplier() at the beginning of the optimizer's _run() - before optionally calling this function. - """ - self._local_lr_multiplier = local_lr_multiplier - self._local_lr_multiplier_on_gpu = is_gpu_blob - - def _clear_local_lr_multiplier(self): - self._local_lr_multiplier = None - self._local_lr_multiplier_on_gpu = False - - @staticmethod - def dedup(net, sparse_dedup_aggregator, grad): - assert isinstance( - grad, core.GradientSlice - ), "Dedup only works for sparse gradient, got {}".format(grad) - if sparse_dedup_aggregator: - return net.DeduplicateGradientSlices( - grad, aggregator=sparse_dedup_aggregator - ) - else: - return grad - - def get_auxiliary_parameters(self): - """Returns a list of auxiliary parameters. - - Returns: - aux_params: A namedtuple, AuxParams. - - aux_params.local stores a list of blobs. Each blob is a local - auxiliary parameter.
A local auxiliary parameter is a parameter in - parallel to a learning rate parameter. Take adagrad as an example, - the local auxiliary parameter is the squared sum parameter, because - every learning rate has a squared sum associated with it. - - aux_params.shared also stores a list of blobs. Each blob is a shared - auxiliary parameter. A shared auxiliary parameter is a parameter - that is shared across all the learning rate parameters. Take adam as - an example, the iteration parameter is a shared parameter, because - all the learning rates share the same iteration parameter. - """ - return self._aux_params - - # TODO(xlwang): In transfer learning, parameter initialized from pretrained - # model might require a different learning rate than otherwise initialized. - # To this end, here we implement a python solution where - # `base_learning_rate` is scaled by `scale`, by calling - # `scale_learning_rate`; Alternatively, we can achieve same effect by - # rewriting the LearningRate operator in C++ - # Note that it is the responsibility of specific optimizer to decide what - # logic should be used for `scale_learning_rate` - def scale_learning_rate(self, *args, **kwargs): - raise NotImplementedError( - "Optimizer Need to Implement `scale_learning_rate` method." - ) - - def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): - wd = param_init_net.ConstantFill( - [], "weight_decay", shape=[1], value=weight_decay - ) - trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust) - lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], value=lr_max) - return wd, trust, lr_max - - -class SgdOptimizer(Optimizer): - def __init__( - self, - base_learning_rate=0.01, - policy="fixed", - momentum=0.0, - nesterov=True, - sparse_dedup_aggregator=None, - lars=None, - **kwargs - ): - super().__init__() - self.base_learning_rate = base_learning_rate - self.policy = policy - self.momentum = momentum - self.nesterov = nesterov - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.lars = lars - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - if self.base_learning_rate == 0: - return - assert ( - self.base_learning_rate > 0 - ), "Expect positive base learning rate, got {}".format(self.base_learning_rate) - - self._clear_local_lr_multiplier() - - # TODO(zqq): support LARS for sparse parameters - if self.lars is not None and not isinstance(grad, core.GradientSlice): - assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format( - self.lars - ) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max - ) - lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], - self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0, - ) - current_scope = scope.CurrentDeviceScope() - self._add_local_lr_multiplier( - lr_lars_multiplier, - is_gpu_blob=( - current_scope is not None - and core.IsGPUDeviceType(current_scope.device_type) - ), - ) - - # We need negative sign for LR when used directly with WeightedSum - # below. - lr_sign = -1 if self.momentum else 1 - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=self.base_learning_rate * lr_sign, - policy=self.policy, - **(self.init_kwargs) - ) - - dev = scope.CurrentDeviceScope() - if dev is None: - dev = core.DeviceOption(caffe2_pb2.CPU) - - # Each GPU/CPU must have its own ONE blob, thus modify the name - # to include device information. 
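# [Editor's note, illustrative only: with the naming scheme used below, a
# CPU-scoped constant becomes "ONE_0_0" (device_type 0 = caffe2_pb2.CPU,
# device_id 0, empty node name), while a GPU blob picks up its own device id,
# so each device owns a distinct ONE blob and constants are never shared
# across devices.]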
- ONE = param_init_net.ConstantFill( - [], - "ONE_{}_{}{}".format(dev.device_type, dev.device_id, dev.node_name), - shape=[1], - value=1.0, - ) - - self._aux_params.shared.append(ONE) - - if self.momentum > 0: - momentum_data = param_init_net.ConstantFill( - param, str(param) + "_momentum", value=0.0 - ) - self._aux_params.local.append(momentum_data) - - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - if self.momentum > 0.0: - net.SparseMomentumSGDUpdate( - [grad.values, momentum_data, lr, param, grad.indices], - [grad.values, momentum_data, param], - momentum=self.momentum, - nesterov=self.nesterov, - ) - else: - net.ScatterWeightedSum( - [param, ONE, grad.indices, grad.values, lr], param - ) - else: - if self.momentum > 0.0: - net.MomentumSGDUpdate( - [grad, momentum_data, lr, param], - [grad, momentum_data, param], - momentum=self.momentum, - nesterov=self.nesterov, - ) - else: - coeff = lr - - net.WeightedSum([param, ONE, grad, coeff], param) - - def scale_learning_rate(self, scale): - self.base_learning_rate *= scale - return - - -class MultiPrecisionSgdOptimizer(SgdOptimizer): - def __init__( - self, - base_learning_rate=0.1, - momentum=0.0, - policy="fixed", - nesterov=True, - sparse_dedup_aggregator=None, - **kwargs - ): - super().__init__( - base_learning_rate=base_learning_rate, - policy=policy, - momentum=momentum, - nesterov=nesterov, - sparse_dedup_aggregator=sparse_dedup_aggregator, - **kwargs - ) - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - param_fp32 = ( - param_info.blob_copy[core.DataType.FLOAT] - if param_info.blob_copy is not None - else None - ) - - # If we have a straight fp32 parameter, run the base class - if param_fp32 is None: - return SgdOptimizer._run(self, net, param_init_net, param_info) - - grad = param_info.grad - if self.base_learning_rate == 0: - return - assert ( - self.base_learning_rate > 0 - ), "Expect positive base learning rate, got {}".format(self.base_learning_rate) - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=-self.base_learning_rate, - policy=self.policy, - **(self.init_kwargs) - ) - - momentum_data = param_init_net.ConstantFill( - param_fp32, str(param) + "_momentum", value=0.0 - ) - self._aux_params.local.append(momentum_data) - - assert not isinstance( - grad, core.GradientSlice - ), "MultiPrecisionSgd does not support sparse gradients" - - # Copy gradient to fp32 - grad_fp32 = net.HalfToFloat(grad, grad + "_fp32") - - # update (fused) in fp32 - net.MomentumSGDUpdate( - [grad_fp32, momentum_data, lr, param_fp32], - [grad_fp32, momentum_data, param_fp32], - momentum=self.momentum, - nesterov=self.nesterov, - ) - - # Copy updated param back to fp16 - net.FloatToHalf(param_fp32, param) - - -class FP16SgdOptimizer(SgdOptimizer): - def __init__( - self, - base_learning_rate=0.1, - momentum=0.0, - policy="fixed", - nesterov=True, - weight_decay=0.0001, - sparse_dedup_aggregator=None, - **kwargs - ): - super().__init__( - base_learning_rate=base_learning_rate, - policy=policy, - momentum=momentum, - nesterov=nesterov, - sparse_dedup_aggregator=sparse_dedup_aggregator, - **kwargs - ) - self.weight_decay = weight_decay - - def _run(self, net, param_init_net, param_info, fp32_update=False): - - fp32_update_flag = 0 - param_name = str(param_info.blob) - - # should only be triggered in FP16 training by SpatialBN, which - # requires FP32 params in CuDNN. 
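# [Editor's note, schematic sketch of the multi-precision pattern above, not
# the literal operator semantics: MultiPrecisionSgdOptimizer keeps an fp32
# master copy of each fp16 parameter and runs the momentum step in fp32:
#   g32 = g16.astype(np.float32)                 # HalfToFloat
#   w32, mom = momentum_step(w32, mom, g32, lr)  # fused fp32 MomentumSGDUpdate
#   w16 = w32.astype(np.float16)                 # FloatToHalf
# so rounding error does not accumulate in the fp16 copy across steps.]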
- if param_name.find("spatbn") != -1: - fp32_update = True - - if fp32_update: - # doing a 32bit update - # Have to assume param_info.blob is FP32 as there is no way - # (that I currently know of) to query a blob's type in python - fp32_update_flag = 1 - param = param_info.blob - param_fp32 = param_info.blob - else: - if param_info.blob_copy is None: - # doing a 32bit update - # Have to assume param_info.blob is FP32 as there is no way - # (that I currently know of) to query a blob's type in python - fp32_update_flag = 1 - param = param_info.blob - param_fp32 = param_info.blob - else: - if core.DataType.FLOAT in param_info.blob_copy: - param = param_info.blob - param_fp32 = param_info.blob_copy[core.DataType.FLOAT] - elif core.DataType.FLOAT16 in param_info.blob_copy: - param = param_info.blob_copy[core.DataType.FLOAT16] - param_fp32 = param_info.blob - else: - raise AssertionError( - "Unrecognized parameter format to be updated " - "by FP16 Optimizer. Parameter: {}".format(param_info.name) - ) - - grad = param_info.grad - - if self.base_learning_rate == 0: - return - assert ( - self.base_learning_rate > 0 - ), "Expect positive base learning rate, got {}".format(self.base_learning_rate) - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=-self.base_learning_rate, - policy=self.policy, - **(self.init_kwargs) - ) - - momentum_data_fp32 = param_init_net.ConstantFill( - param_fp32, str(param) + "_momentum_fp32", value=0.0 - ) - - momentum_data = param_init_net.FloatToHalf( - momentum_data_fp32, str(param) + "_momentum" - ) - - self._aux_params.local.append(momentum_data) - - assert not isinstance( - grad, core.GradientSlice - ), "FP16Sgd does not support sparse gradients" - - if fp32_update_flag == 0: - net.FP16MomentumSGDUpdate( - [grad, momentum_data, lr, param], - [grad, momentum_data, param], - momentum=self.momentum, - nesterov=self.nesterov, - weight_decay=self.weight_decay, - ) - else: - # flag set to 1, therefore doing FP32 update - net.FP32MomentumSGDUpdate( - [grad, momentum_data_fp32, lr, param], - [grad, momentum_data_fp32, param], - momentum=self.momentum, - nesterov=self.nesterov, - weight_decay=self.weight_decay, - ) - - -class WeightDecayBuilder(Optimizer): - def __init__(self, weight_decay): - self.weight_decay = weight_decay - - def _run(self, net, param_init_net, param_info): - dev = scope.CurrentDeviceScope() - if dev is None: - dev = core.DeviceOption(caffe2_pb2.CPU) - - ONE = param_init_net.ConstantFill( - [], "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 - ) - WD = param_init_net.ConstantFill( - [], - "wd_{}_{}".format(dev.device_type, dev.device_id), - shape=[1], - value=self.weight_decay, - ) - - if isinstance(param_info.grad, core.GradientSlice): - raise ValueError("Weight decay does not yet support sparse gradients") - else: - net.WeightedSum( - [param_info.grad, ONE, param_info.blob, WD], param_info.grad - ) - - -class AdagradOptimizer(Optimizer): - def __init__( - self, - alpha=0.01, - epsilon=1e-4, - decay=1, - weight_decay=0.0, - policy="fixed", - sparse_dedup_aggregator=None, - rowWise=False, - engine="", - lars=None, - output_effective_lr=False, - output_effective_lr_and_update=False, - pruning_options=None, - swa_options=None, - ema_options=None, - weight_scale=None, - counter_halflife=-1, - use_dedicated_lr_iteration_counter=False, - **kwargs - ): - super().__init__() - self.alpha = alpha - self.epsilon = epsilon - self.decay = decay - self.weight_decay = float(weight_decay) - self.policy = policy -
self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.rowWise = rowWise - self.engine = engine - self.lars = lars - self.output_effective_lr = output_effective_lr - self.output_effective_lr_and_update = output_effective_lr_and_update - self.counter_halflife = counter_halflife - self.init_kwargs = kwargs - self.weight_scale = weight_scale - self.use_dedicated_lr_iteration_counter = use_dedicated_lr_iteration_counter - - self._process_pruning_options(pruning_options) - self._process_swa_options(swa_options) - self._process_ema_options(ema_options) - - def set_mapping_for_param2ema_teacher_param(self, param_mapping: Dict[str, Any]) -> None: - self.param2ema_teacher_param = param_mapping - - def _process_swa_options(self, swa_options): - self.swa_enabled = True if swa_options else False - if self.swa_enabled: - self.swa_avg_start_it = swa_options.get("swa_avg_start_it", None) - self.swa_avg_end_it = swa_options.get("swa_avg_end_it", None) - self.swa_feedback_start_it = swa_options.get("swa_feedback_start_it", None) - self.swa_feedback_step = swa_options.get("swa_feedback_step", None) - self.swa_feedback_end_it = swa_options.get("swa_feedback_end_it", None) - - def _process_ema_options(self, ema_options): - logger.info(f"ema_options: {str(ema_options)}") - self.ema_enabled = ema_options and ema_options.get("ema_alpha", None) is not None - self.ema_teacher_enabled = ema_options and ema_options.get("ema_teacher_alpha", None) is not None - self.param2ema_teacher_param = {} - if self.ema_enabled or self.ema_teacher_enabled: - self.ema_start = ema_options.get("ema_start", None) - self.ema_end = ema_options.get("ema_end", None) - self.ema_step = ema_options.get("ema_step", None) - self.ema_alpha = ema_options.get("ema_alpha", None) - self.ema_teacher_alpha = ema_options.get("ema_teacher_alpha", None) - self.ema_teacher_module_name = ema_options.get( - "ema_teacher_module_name", "ema_teacher_arch" - ) - - def _process_pruning_options(self, pruning_options): - self.use_mask = False - - if pruning_options is None: - pruning_options = {} - else: - assert isinstance(pruning_options, dict), ( - "pruning_options can only " - "be provided as a dictionary, currently: {}".format(pruning_options) - ) - - self.mask_tensor = pruning_options.get("mask_tensor", None) - self.mask_db_path = pruning_options.get("mask_db_path", None) - self.mask_db_type = pruning_options.get("mask_db_type", None) - self.mask_blob_name = pruning_options.get("mask_blob_name", None) - self.prune_delays = pruning_options.get("prune_delays", []) - self.prune_ratios = pruning_options.get("prune_ratios", []) - self.prune_block_size = pruning_options.get("prune_block_size", 1) - - if self.mask_tensor is not None: - assert ( - type(self.mask_tensor) is np.ndarray - ), "mask_tensor must be a numpy array!" 
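# [Editor's note, illustrative values only: the assertions in
# _process_pruning_options enforce that a mask comes from exactly one source.
# Valid shapes of the dict look roughly like:
#   {"mask_tensor": np.ones((16, 8), dtype=np.float32)}            # in-memory
#   {"mask_db_path": "/tmp/mask.db", "mask_db_type": "minidb",
#    "mask_blob_name": "w_mask"}                                   # from a db
#   {"prune_delays": [1000, 2000], "prune_ratios": [0.5, 0.9]}     # scheduled
# mixing sources trips the asserts that follow.]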
- assert self.mask_db_path is None, ( - "mask can be provided through either a numpy array " - "or a db path, not both" - ) - assert self.mask_db_type is None, ( - "mask can be provided through either a numpy array " - "or a db path, not both" - ) - assert self.mask_blob_name is None, ( - "mask can be provided through either a numpy array " - "or a db path, not both" - ) - self.use_mask = True - - if self.mask_db_path is not None or self.mask_db_type is not None: - assert self.mask_db_path is not None, ( - "when mask is provided through db, " - "db path, db type, and blob name are all needed" - ) - assert self.mask_db_type is not None, ( - "when mask is provided through db, " - "db path, db type, and blob name are all needed" - ) - assert self.mask_tensor is None, ( - "mask can be provided through either a numpy array " - "or a db path, not both" - ) - self.use_mask = True - - if self.prune_delays: - assert self.prune_ratios is not None and len(self.prune_delays) == len( - self.prune_ratios - ), "Prune Delays and prune ratios should be of the same length" - assert ( - self.mask_tensor is None - ), "Mask Tensor should be None with prune ratios" - assert ( - self.mask_db_path is None - ), "Mask DB Path should be None with prune ratios" - self.use_mask = True - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - self._clear_local_lr_multiplier() - - if self.lars is not None and not isinstance(grad, core.GradientSlice): - assert ( - self.weight_decay == 0 - ), "weight decay is not implemented for LARS yet" - assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format( - self.lars - ) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max - ) - lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], - self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0, - ) - - current_scope = scope.CurrentDeviceScope() - self._add_local_lr_multiplier( - lr_lars_multiplier, - is_gpu_blob=( - current_scope is not None - and core.IsGPUDeviceType(current_scope.device_type) - ), - ) - - lr, lr_iteration = self.build_lr( - net, - param_init_net, - base_learning_rate=self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - iteration = ( - self.build_non_lr_iter(net, param_init_net, iter_val=0) - if self._use_dedicated_lr_iteration_counter - else lr_iteration - ) - - if self.counter_halflife > 0: - self._aux_params.shared.append(iteration) - - if self.rowWise: - logger.debug( - "Using engine {} for rowWise Adagrad to train param {}".format( - self.engine, param - ) - ) - - shapes, types = workspace.InferShapesAndTypes([param_init_net]) - if str(param) not in shapes: - # Type/shape inference is not available for this param, fallback - # on Shape/Slice logic - shape = param_init_net.Shape(param, str(param) + "_shape") - num_rows = param_init_net.Slice( - [shape], str(shape) + "_numrows", starts=[0], ends=[1] - ) - param_squared_sum = param_init_net.ConstantFill( - num_rows, - str(param) + "_avg_squared_sum", - input_as_shape=1, - value=0.0, - ) - else: - param_squared_sum = param_init_net.ConstantFill( - [], - str(param) + "_avg_squared_sum", - shape=[shapes[str(param)][0]], - value=0.0, - ) - else: - logger.debug( - "Using engine {} for regular Adagrad to train param {}".format( - self.engine, param - ) - ) - - if self.engine in FP16_ENGINES: - assert ( - self.weight_decay == 0 - ), "weight decay is not tested for engine: 
{}".format(self.engine) - - shapes, types = workspace.InferShapesAndTypes([param_init_net]) - assert str(param) in shapes, shapes - shape = shapes[str(param)] - - param_squared_sum = param_init_net.Float16ConstantFill( - [], str(param) + "_squared_sum", value=0.0, shape=shape - ) - else: - param_squared_sum = param_init_net.ConstantFill( - [param], str(param) + "_squared_sum", value=0.0 - ) - - if self.use_mask is True: - assert ( - self.weight_decay == 0 - ), "weight decay is not implemented for use_mask yet" - - if self.mask_tensor is not None: - if not isinstance(grad, core.GradientSlice): - mask_blob = param_init_net.GivenTensorFill( - [], - [str(param) + "_mask"], - values=self.mask_tensor, - shape=self.mask_tensor.shape, - ) - else: - self.mask_tensor = self.mask_tensor.astype(np.uint8) - mask_blob = param_init_net.GivenTensorBoolFill( - [], - [str(param) + "_mask"], - values=self.mask_tensor, - shape=self.mask_tensor.shape, - ) - mask_blob = param_init_net.Cast(mask_blob, to=core.DataType.UINT8) - mask_changed_blob = param_init_net.ConstantFill( - [], - [str(param) + "_mask_changed_blob"], - value=False, - dtype=core.DataType.BOOL, - shape=[1], - ) - elif ( - self.mask_db_path is not None or self.mask_db_type is not None - ): # mask is provided through a db file - # if mask_blob_name is not given use the param name to derive mask name - self.mask_blob_name = self.mask_blob_name or str(param) + "_mask" - - mask_blob = param_init_net.Load( - [], - self.mask_blob_name, - db=self.mask_db_path, - db_type=self.mask_db_type, - absolute_path=True, - ) - - if isinstance(grad, core.GradientSlice): - mask_changed_blob = param_init_net.ConstantFill( - [], - [str(param) + "_mask_changed_blob"], - value=False, - dtype=core.DataType.BOOL, - shape=[1], - ) - elif self.prune_delays: - last_mask_updated_iter = param_init_net.ConstantFill( - [], - [str(param) + "_last_mask_updated_iter"], - value=-1, - dtype=core.DataType.INT64, - shape=[1], - ) - - if isinstance(grad, core.GradientSlice): - AssertionError( - "Prune Delays and Prune Ratios are currently not supported" - "for sparse operators" - ) - else: - mask_blob = param_init_net.GivenTensorFill( - [], - [str(param) + "_empty_mask"], - values=[], - dtype=core.DataType.FLOAT, - shape=[0], - ) - else: - raise NotImplementedError( - "If mask is used, it needs a numpy array or a db file or" - "a delay iter needs to be provided" - ) - - self._aux_params.local.append(param_squared_sum) - if self.counter_halflife > 0: - shapes, types = workspace.InferShapesAndTypes([param_init_net]) - if str(param) not in shapes: - shape = param_init_net.Shape(param, str(param) + "_shape") - num_rows = param_init_net.Slice( - [shape], str(shape) + "_numrows", starts=[0], ends=[1] - ) - update_counter = param_init_net.ConstantFill( - num_rows, - str(param) + "_update_counter", - input_as_shape=1, - value=0.0, - dtype=core.DataType.DOUBLE, - ) - prev_update_iter = param_init_net.ConstantFill( - num_rows, - str(param) + "_prev_update_iter", - input_as_shape=1, - value=0, - dtype=core.DataType.INT64, - ) - else: - update_counter = param_init_net.ConstantFill( - [], - str(param) + "_update_counter", - shape=[shapes[str(param)][0]], - value=0.0, - dtype=core.DataType.DOUBLE, - ) - prev_update_iter = param_init_net.ConstantFill( - [], - str(param) + "_prev_update_iter", - shape=[shapes[str(param)][0]], - value=0, - dtype=core.DataType.INT64, - ) - self._aux_params.local.append(update_counter) - self._aux_params.local.append(prev_update_iter) - - if self.rowWise: - assert 
isinstance(grad, core.GradientSlice), ( - "If SparseAdagrad with rowWise=True, gradient must be " - "a GradientSlice. Please ensure that rowWise is not enabled " - "for the dense Adagrad optimizer, as it is not supported." - ) - - shapes, _ = workspace.InferShapesAndTypes([param_init_net]) - param_shape = shapes[str(param)] - weight_decay = 0.0 - if isinstance(grad, core.GradientSlice): - if len(param_shape) == 1: - weight_decay = 0.0 - logger.warning( - "SKIPPING weight decay on 1d sparse param: {}.shape is {}".format( - str(param), param_shape - ) - ) - else: - weight_decay = self.weight_decay - else: - # Skip weight decay for 1d parameters - if len(param_shape) == 1: - weight_decay = 0.0 - logger.warning( - "SKIPPING weight decay on 1d dense param: {}.shape is {}".format( - str(param), param_shape - ) - ) - else: - weight_decay = self.weight_decay - logger.debug( - "weight_decay for {} (shape:{}): {}".format( - str(param), param_shape, weight_decay - ) - ) - - if isinstance(grad, core.GradientSlice): - assert ( - self.decay == 1.0 - ), "Decay is not implemented for SparseAdagrad and must be set to 1" - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - - input_args = [param, param_squared_sum, grad.indices, grad.values, lr] - output_args = [param, param_squared_sum] - if self.rowWise: - if self.use_mask is True: - op = "MaskedRowWiseSparseAdagrad" - assert ( - weight_decay == 0 - ), "weight decay is not implemented for {} yet".format(op) - input_args += [mask_blob, mask_changed_blob] - else: - if self.counter_halflife > 0: - input_args += [update_counter] - op = "RowWiseSparseAdagrad" - else: - if self.use_mask is True: - op = "MaskedSparseAdagrad" - assert ( - weight_decay == 0 - ), "weight decay is not implemented for {} yet".format(op) - input_args += [mask_blob, mask_changed_blob] - else: - op = "SparseAdagrad" - logger.debug("using {} for {}".format(op, str(param))) - - if self.prune_delays: - input_args += [iteration, last_mask_updated_iter] - output_args += [mask_blob, last_mask_updated_iter] - - if weight_decay > 0 and self.counter_halflife == -1: - net.__getattr__(op)( - input_args, - output_args, - epsilon=self.epsilon, - weight_decay=weight_decay, - engine=self.engine, - ) - elif weight_decay > 0 and self.counter_halflife != -1: - net.__getattr__(op)( - input_args, - output_args, - epsilon=self.epsilon, - weight_decay=weight_decay, - engine=self.engine, - counter_halflife=self.counter_halflife, - ) - else: - net.__getattr__(op)( - input_args, output_args, epsilon=self.epsilon, engine=self.engine - ) - if self.counter_halflife > 0: - net.RowWiseCounter( - [prev_update_iter, update_counter, grad.indices, iteration], - [prev_update_iter, update_counter], - counter_halflife=self.counter_halflife, - ) - else: - input_args = [param, param_squared_sum, grad, lr] - output_args = [param, param_squared_sum] - - if self.output_effective_lr_and_update: - assert ( - self.use_mask is False - ), "MaskedAdagrad doesn't support outputting effective_lr_and_update" - output_args.append(str(param) + "_effective_lr") - output_args.append(str(param) + "_update") - elif self.output_effective_lr: - assert ( - self.use_mask is False - ), "MaskedAdagrad doesn't support outputting effective_lr" - output_args.append(str(param) + "_effective_lr") - - if self.use_mask is True: - input_args += [mask_blob] - - if self.prune_delays: - input_args += [iteration, last_mask_updated_iter] - output_args += [mask_blob, last_mask_updated_iter] - - if self.use_mask: - assert ( - weight_decay == 0 - ), "weight
decay is not implemented for use_mask yet" - net.MaskedAdagrad( - input_args, - output_args, - epsilon=self.epsilon, - decay=float(self.decay), - block_size=self.prune_block_size, - delays=self.prune_delays, - prune_ratios=self.prune_ratios, - engine=self.engine, - ) - else: - if weight_decay > 0: - net.Adagrad( - input_args, - output_args, - epsilon=self.epsilon, - decay=float(self.decay), - weight_decay=weight_decay, - engine=self.engine, - ) - else: - net.Adagrad( - input_args, - output_args, - epsilon=self.epsilon, - decay=float(self.decay), - engine=self.engine, - ) - - if self.swa_enabled: - param_swa = str(param) + "_swa" - if not param_init_net.BlobIsDefined(param_swa): - param_init_net.ConstantFill([param], param_swa, value=0.0) - self._aux_params.local.append(param_swa) - - net.SWA( - [param, param_swa, iteration], - [param, param_swa], - avg_start=self.swa_avg_start_it, - avg_end=self.swa_avg_end_it, - feedback_start=self.swa_feedback_start_it, - feedback_step=self.swa_feedback_step, - feedback_end=self.swa_feedback_end_it, - ) - - if self.ema_enabled: - param_ema = str(param) + "_ema" - if not param_init_net.BlobIsDefined(param_ema): - param_init_net.ConstantFill([param], param_ema, value=0.0) - self._aux_params.local.append(param_ema) - - net.EMA( - [param, param_ema, iteration], - [param, param_ema], - ema_start=self.ema_start, - ema_end=self.ema_end, - ema_step=self.ema_step, - ema_alpha=self.ema_alpha, - ) - - - if self.ema_teacher_enabled: - if param in self.param2ema_teacher_param: - param_ema_teacher = self.param2ema_teacher_param[param] - if not param_init_net.BlobIsDefined(param_ema_teacher): - param_init_net.ConstantFill([param], param_ema_teacher, value=0.0) - self._aux_params.local.append(param_ema_teacher) - - net.EMA( - [param, param_ema_teacher, iteration], - [param, param_ema_teacher], - ema_start=self.ema_start, - ema_end=self.ema_end, - ema_step=self.ema_step, - ema_alpha=self.ema_teacher_alpha, - ) - - if self.weight_scale: - net.WeightScale( - [param, iteration], - [param], - stepsize=self.weight_scale.stepsize, - upper_bound_iter=self.weight_scale.upper_bound_iter, - scale=float(self.weight_scale.scale), - ) - if self.weight_scale.to_aux: - net.WeightScale( - [param_squared_sum, iteration], - [param_squared_sum], - stepsize=self.weight_scale.stepsize, - upper_bound_iter=self.weight_scale.upper_bound_iter, - scale=float(self.weight_scale.scale), - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class WngradOptimizer(Optimizer): - def __init__( - self, - alpha=1.0, - epsilon=1e-9, - policy="fixed", - sparse_dedup_aggregator=None, - engine="", - moment_init=100.0, - lars=None, - output_effective_lr=False, - output_effective_lr_and_update=False, - **kwargs - ): - super().__init__() - self.alpha = alpha - self.epsilon = epsilon - self.policy = policy - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.engine = engine - self.moment_init = moment_init - self.lars = lars - self.output_effective_lr = output_effective_lr - self.output_effective_lr_and_update = output_effective_lr_and_update - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - self._clear_local_lr_multiplier() - - if self.lars is not None and not isinstance(grad, core.GradientSlice): - assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format( - self.lars - ) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 
1.0, np.finfo(np.float32).max - ) - lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], - self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0, - ) - current_scope = scope.CurrentDeviceScope() - self._add_local_lr_multiplier( - lr_lars_multiplier, - is_gpu_blob=( - current_scope is not None - and core.IsGPUDeviceType(current_scope.device_type) - ), - ) - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - - moment = param_init_net.ConstantFill( - [], str(param) + "_moment", shape=[1], value=self.moment_init - ) - - self._aux_params.local.append(moment) - - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - net.SparseWngrad( - [param, moment, grad.indices, grad.values, lr], - [param, moment], - epsilon=self.epsilon, - engine=self.engine, - ) - else: - output_args = [param, moment] - if self.output_effective_lr_and_update: - output_args.append(str(param) + "_effective_lr") - output_args.append(str(param) + "_update") - elif self.output_effective_lr: - output_args.append(str(param) + "_effective_lr") - - net.Wngrad( - [param, moment, grad, lr], - output_args, - epsilon=self.epsilon, - engine=self.engine, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class StormOptimizer(Optimizer): - def __init__( - self, - lr=0.1, - momentum=10.0, - beta=0.1, - grad_sq_init=0.01, - policy="fixed", - sparse_dedup_aggregator=None, - lars=None, - **kwargs - ): - """Constructor function to add STORM Optimizer - - Args: - lr: learning rate scaling (called k in the original paper) - momentum: momentum scaling (called c in the original paper) - beta: initial value of denominator in adaptive learning rate ( - called w in the original paper) - grad_sq_init: initial value of gradient squared accumulator. - policy: specifies how learning rate should be applied, options are - 'fixed', 'step', 'exp', etc. - sparse_dedup_aggregator: specifies deduplication strategy for - gradient slices. Works while using sparse gradients. Options - include 'mean' and 'sum'. - lars: lars offset.
- """ - super().__init__() - self.lr = lr - self.momentum = momentum - self.beta = beta - self.grad_sq_init = grad_sq_init - self.policy = policy - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.lars = lars - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.lr <= 0: - return - - self._clear_local_lr_multiplier() - - if self.lars is not None and not isinstance(grad, core.GradientSlice): - assert self.lars >= 0, "Lars offset must be nonnegative, got {}".format( - self.lars - ) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max - ) - lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], - self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0, - ) - current_scope = scope.CurrentDeviceScope() - self._add_local_lr_multiplier( - lr_lars_multiplier, - is_gpu_blob=( - current_scope is not None - and core.IsGPUDeviceType(current_scope.device_type) - ), - ) - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=self.lr, - policy=self.policy, - **(self.init_kwargs) - ) - - moment = param_init_net.ConstantFill(param, str(param) + "_moment", value=0.0) - self._aux_params.local.append(moment) - - grad_sq_sum = param_init_net.ConstantFill( - [], str(param) + "_grad_sq_sum", shape=[1], value=self.grad_sq_init - ) - self._aux_params.local.append(grad_sq_sum) - - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - net.SparseStorm( - [param, moment, grad_sq_sum, grad.values, grad.indices, lr], - [param, moment, grad_sq_sum], - momentum=self.momentum, - beta=self.beta, - ) - else: - net.Storm( - [param, moment, grad_sq_sum, grad, lr], - [param, moment, grad_sq_sum], - momentum=self.momentum, - beta=self.beta, - ) - - def scale_learning_rate(self, scale): - self.lr *= scale - - -class AdadeltaOptimizer(Optimizer): - def __init__( - self, - alpha=0.01, - epsilon=1e-4, - decay=0.95, - policy="fixed", - sparse_dedup_aggregator=None, - engine="", - **kwargs - ): - """Constructor function to add Adadelta Optimizer - - Args: - alpha: learning rate - epsilon: attribute of Adadelta to avoid numerical issues - decay: attribute of Adadelta to decay the squared gradient sum - policy: specifies how learning rate should be applied, options are - "fixed", "step", "exp", etc. - sparse_dedup_aggregator: specifies deduplication strategy for - gradient slices. Works while using sparse gradients. Options - include "mean" and "sum". - engine: the engine used, options include "", "CUDNN", etc. 
- """ - super().__init__() - self.alpha = alpha - self.epsilon = epsilon - self.decay = decay - self.policy = policy - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.engine = engine - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - - moment = param_init_net.ConstantFill( - [param], str(param) + "_squared_moment", value=0.0 - ) - - moment_update = param_init_net.ConstantFill( - [param], str(param) + "_squared_moment_update", value=0.0 - ) - - self._aux_params.local.append(moment) - self._aux_params.local.append(moment_update) - - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - net.SparseAdadelta( - [param, moment, moment_update, grad.indices, grad.values, lr], - [param, moment, moment_update], - epsilon=self.epsilon, - decay=self.decay, - engine=self.engine, - ) - else: - net.Adadelta( - [param, moment, moment_update, grad, lr], - [param, moment, moment_update], - epsilon=self.epsilon, - decay=self.decay, - engine=self.engine, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class FtrlOptimizer(Optimizer): - def __init__( - self, - alpha=0.01, - beta=1e-4, - lambda1=0, - lambda2=0, - sparse_dedup_aggregator=None, - engine="", - ): - super().__init__() - self.alpha = alpha - self.beta = beta - self.lambda1 = lambda1 - self.lambda2 = lambda2 - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.engine = engine - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - nz = param_init_net.ConstantFill( - [param], str(param) + "_ftrl_nz", extra_shape=[2], value=0.0 - ) - self._aux_params.local.append(nz) - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - net.SparseFtrl( - [param, nz, grad.indices, grad.values], - [param, nz], - engine=self.engine, - alpha=self.alpha, - beta=self.beta, - lambda1=self.lambda1, - lambda2=self.lambda2, - ) - else: - net.Ftrl( - [param, nz, grad], - [param, nz], - engine=self.engine, - alpha=self.alpha, - beta=self.beta, - lambda1=self.lambda1, - lambda2=self.lambda2, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class GFtrlOptimizer(Optimizer): - """Group Lasso FTRL Optimizer.""" - - def __init__( - self, - alpha=0.01, - beta=1e-4, - lambda1=0, - lambda2=0, - sparse_dedup_aggregator=None, - engine="", - ): - super().__init__() - self.alpha = alpha - self.beta = beta - self.lambda1 = lambda1 - self.lambda2 = lambda2 - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.engine = engine - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - nz = param_init_net.ConstantFill( - [param], str(param) + "_gftrl_nz", extra_shape=[2], value=0.0 - ) - self._aux_params.local.append(nz) - net.GFtrl( - [param, nz, grad], - [param, nz], - engine=self.engine, - alpha=self.alpha, - beta=self.beta, - lambda1=self.lambda1, - lambda2=self.lambda2, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class AdamOptimizer(Optimizer): - def __init__( - self, - alpha=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - policy="fixed", - 
use_lr_adaption=False, - lr_alpha=0.01, - normalized_lr_adaption=True, - sparse_dedup_aggregator=None, - rowWise=False, - engine="", - enableRAdam=False, - use_smart_decay=False, # See https://fburl.com/2jdiwrhy for context. - **kwargs - ): - super().__init__() - self.alpha = alpha - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.policy = policy - self.use_lr_adaption = use_lr_adaption - self.lr_alpha = lr_alpha - self.normalized_lr_adaption = normalized_lr_adaption - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.rowWise = rowWise - self.engine = engine - self.enableRAdam = enableRAdam - if use_smart_decay: - if rowWise: - raise NotImplementedError(('Smart decay is not implemented for rowWise Adam. ' - 'Set rowWise or use_smart_decay to False.')) - if enableRAdam: - raise NotImplementedError(('Smart decay is not implemented for RAdam. ' - 'Set enableRAdam or use_smart_decay to False.')) - if use_lr_adaption: - raise NotImplementedError(('Smart decay is not implemented with lr_adaption. ' - 'Set use_lr_adaption or use_smart_decay to False.')) - - self.use_smart_decay = use_smart_decay - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - lr, iteration = self.build_lr( - net, - param_init_net, - base_learning_rate=self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - - m1 = param_init_net.ConstantFill([param], param + "_first_moment", value=0.0) - - if self.rowWise: - shapes, types = workspace.InferShapesAndTypes([param_init_net]) - m2 = param_init_net.ConstantFill( - [], param + "_avg_second_moment", shape=[shapes[param][0]], value=0.0 - ) - else: - m2 = param_init_net.ConstantFill( - [param], param + "_second_moment", value=0.0 - ) - - # Initialize "minibatch in which this parameter was last seen" for smart decay. - if self.use_smart_decay: - shapes, _ = workspace.InferShapesAndTypes([param_init_net]) - last_seen = param_init_net.ConstantFill( - [], param + "_last_seen", shape=[shapes[param][0]], value=0, dtype=core.DataType.INT64 - ) - self._aux_params.local.append(last_seen) - - self._aux_params.shared.append(iteration) - self._aux_params.local.append(m1) - self._aux_params.local.append(m2) - - if self.rowWise: - assert isinstance(grad, core.GradientSlice), ( - "If SparseAdam with rowWise=True, gradient must be " - "a GradientSlice. Please ensure that rowWise is not enabled " - "for the dense Adam optimizer, as it is not supported."
- ) - - output_blobs = [param, m1, m2] - - if self.use_smart_decay: - output_blobs.append(last_seen) - - if self.use_lr_adaption: - effective_grad = str(param) + "_effective_grad" - output_blobs.append(effective_grad) - - if isinstance(grad, core.GradientSlice): - grad = self.dedup(net, self.sparse_dedup_aggregator, grad) - if self.rowWise: - op = "RowWiseSparseAdam" - elif self.use_smart_decay: - op = "SmartDecaySparseAdam" - else: - op = "SparseAdam" - - # Currently only SparseAdam supports RAdam; the other Adam ops will add support later - if op == "SparseAdam": - net.__getattr__(op)( - [param, m1, m2, grad.indices, grad.values, lr, iteration], - output_blobs, - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon, - enableRAdam=self.enableRAdam, - ) - elif op == "SmartDecaySparseAdam": - net.__getattr__(op)( - [param, m1, m2, last_seen, grad.indices, grad.values, lr, iteration], - output_blobs, - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon, - ) - else: - assert ( - not self.enableRAdam - ), "Currently, RAdam is not supported by RowWiseSparseAdam!" - net.__getattr__(op)( - [param, m1, m2, grad.indices, grad.values, lr, iteration], - output_blobs, - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon, - ) - - if self.use_lr_adaption: - net.LearningRateAdaption( - [lr, grad.values, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption, - ) - - else: - net.Adam( - [param, m1, m2, grad, lr, iteration], - output_blobs, - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon, - ) - if self.use_lr_adaption: - net.LearningRateAdaption( - [lr, grad, effective_grad], - [lr], - lr_alpha=self.lr_alpha, - normalized_lr_adaption=self.normalized_lr_adaption, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - -class DecayAdagradOptimizer(Optimizer): - def __init__( - self, - alpha=0.01, - beta1=0.0, - beta2=0.999, - epsilon=0.1, - weight_decay=0.0, - ema_options=None, - bias_correction_first=True, - policy="fixed", - engine="", - **kwargs - ): - super().__init__() - self.alpha = alpha - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.weight_decay = weight_decay - self.bias_correction_first = bias_correction_first - self.policy = policy - self.engine = engine - self.init_kwargs = kwargs - self._process_ema_options(ema_options) - - def set_mapping_for_param2ema_teacher_param(self, param_mapping: Dict[str, Any]) -> None: - self.param2ema_teacher_param = param_mapping - - def _process_ema_options(self, ema_options): - self.ema_enabled = True if ema_options and "ema_alpha" in ema_options else False - self.ema_teacher_enabled = True if ema_options and "ema_teacher_alpha" in ema_options else False - self.param2ema_teacher_param = {} - if self.ema_enabled or self.ema_teacher_enabled: - self.ema_start = ema_options.get("ema_start", None) - self.ema_end = ema_options.get("ema_end", None) - self.ema_step = ema_options.get("ema_step", None) - self.ema_alpha = ema_options.get("ema_alpha", None) - self.ema_teacher_alpha = ema_options.get("ema_teacher_alpha", None) - self.ema_teacher_module_name = ema_options.get( - "ema_teacher_module_name", "ema_teacher_arch" - ) - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - if self.alpha <= 0: - return - - lr, iteration = self.build_lr( - net, - param_init_net, - base_learning_rate=self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - - if isinstance(grad, core.GradientSlice): - # hack
for position weighted. - param_squared_sum = param_init_net.ConstantFill([param], param + "_squared_sum", value=0.0) - self._aux_params.local.append(param_squared_sum) - output_blobs = [param, param_squared_sum] - net.SparseAdagrad( - [param, param_squared_sum, grad.indices, grad.values, lr], - output_blobs, - epsilon=self.epsilon, - ) - else: - m1 = param_init_net.ConstantFill([param], param + "_first_moment", value=0.0) - m2 = param_init_net.ConstantFill([param], param + "_second_moment", value=0.0) - self._aux_params.shared.append(iteration) - self._aux_params.local.append(m1) - self._aux_params.local.append(m2) - output_blobs = [param, m1, m2] - net.DecayAdagrad( - [param, m1, m2, grad, lr, iteration], - output_blobs, - beta1=self.beta1, - beta2=self.beta2, - epsilon=self.epsilon, - weight_decay=self.weight_decay, - bias_correction_first=self.bias_correction_first, - ) - - if self.ema_enabled: - param_ema = str(param) + "_ema" - if not param_init_net.BlobIsDefined(param_ema): - param_init_net.ConstantFill([param], param_ema, value=0.0) - self._aux_params.local.append(param_ema) - - net.EMA( - [param, param_ema, iteration], - [param, param_ema], - ema_start=self.ema_start, - ema_end=self.ema_end, - ema_step=self.ema_step, - ema_alpha=self.ema_alpha, - ) - - if self.ema_teacher_enabled: - if param in self.param2ema_teacher_param: - param_ema_teacher = self.param2ema_teacher_param[param] - if not param_init_net.BlobIsDefined(param_ema_teacher): - param_init_net.ConstantFill([param], param_ema_teacher, value=0.0) - self._aux_params.local.append(param_ema_teacher) - - net.EMA( - [param, param_ema_teacher, iteration], - [param, param_ema_teacher], - ema_start=self.ema_start, - ema_end=self.ema_end, - ema_step=self.ema_step, - ema_alpha=self.ema_teacher_alpha, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - -class YellowFinOptimizer(Optimizer): - """YellowFin: An automatic tuner for momentum SGD - - See https://arxiv.org/abs/1706.03471 for more details. This implementation - keeps a separate learning rate and momentum for each parameter.""" - - def __init__( - self, - alpha=0.1, - mu=0.0, - beta=0.999, - curv_win_width=20, - zero_debias=True, - epsilon=0.1 ** 6, - policy="fixed", - sparse_dedup_aggregator=None, - **kwargs - ): - super().__init__() - self.alpha = alpha - self.mu = mu - self.beta = beta - self.curv_win_width = curv_win_width - self.zero_debias = zero_debias - self.epsilon = epsilon - self.policy = policy - self.sparse_dedup_aggregator = sparse_dedup_aggregator - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - - # Note: This is the number of persistent scalars kept by the YellowFin - # operator. It should always match the number of scalars actually in use, - # and the same constant must be used by the C++ implementation of the op.
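# A best-effort sketch of the layout, inferred from how optimizer_test.py
# reads the blob back (and hedged accordingly): slot 1 appears to hold the
# running squared gradient norm, slots 2 and 3 the running min/max curvature
# estimates in log space, slot 4 the running distance estimate, and slot 0
# appears to be reserved by the C++ op.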
- SCALARS_MEMORY_SIZE = 5 - - param = param_info.blob - grad = param_info.grad - moment = param_init_net.ConstantFill([param], param + "_moment", value=0.0) - curv_win = param_init_net.ConstantFill( - [], param + "_curv_win", shape=[self.curv_win_width], value=0.0 - ) - g_avg = param_init_net.ConstantFill([param], param + "_g_avg", value=0.0) - g2_avg = param_init_net.ConstantFill([param], param + "_g2_avg", value=0.0) - lr_avg = param_init_net.ConstantFill( - [], param + "_lr_avg", shape=[1], value=self.alpha - ) - mu_avg = param_init_net.ConstantFill( - [], param + "_mu_avg", shape=[1], value=self.mu - ) - scalars_memory = param_init_net.ConstantFill( - [], param + "_scalars_memory", shape=[SCALARS_MEMORY_SIZE], value=0.0 - ) - - assert self.alpha > 0 - assert not isinstance( - grad, core.GradientSlice - ), "YellowFin does not support sparse gradients" - - iteration = utils.BuildUniqueMutexIter(param_init_net, net, iter_val=0) - - self._aux_params.shared.append(iteration) - self._aux_params.local.append(moment) - self._aux_params.local.append(lr_avg) - self._aux_params.local.append(mu_avg) - self._aux_params.local.append(curv_win) - self._aux_params.local.append(g_avg) - self._aux_params.local.append(g2_avg) - self._aux_params.local.append(scalars_memory) - - yf_in_out_args = [ - param, - moment, - lr_avg, - mu_avg, - curv_win, - g_avg, - g2_avg, - scalars_memory, - ] - - net.YellowFin( - yf_in_out_args + [grad, iteration], - yf_in_out_args, - beta=self.beta, - epsilon=self.epsilon, - curv_win_width=self.curv_win_width, - zero_debias=self.zero_debias, - ) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -class RmsPropOptimizer(Optimizer): - def __init__( - self, - alpha=0.01, - decay=0.9, - momentum=0.0, - epsilon=1e-5, - policy="fixed", - engine="", - **kwargs - ): - super().__init__() - self.alpha = alpha - self.decay = decay - self.momentum = momentum - self.epsilon = epsilon - self.policy = policy - self.engine = engine - self.init_kwargs = kwargs - - def _run(self, net, param_init_net, param_info): - param = param_info.blob - grad = param_info.grad - - assert self.alpha > 0 - assert not isinstance( - grad, core.GradientSlice - ), "RmsPropOptimizer doesn't support sparse gradients" - - dev = scope.CurrentDeviceScope() - if dev is None: - dev = core.DeviceOption(caffe2_pb2.CPU) - - ONE = param_init_net.ConstantFill( - [], "ONE_{}_{}".format(dev.device_type, dev.device_id), shape=[1], value=1.0 - ) - - lr, _ = self.build_lr( - net, - param_init_net, - base_learning_rate=-self.alpha, - policy=self.policy, - **(self.init_kwargs) - ) - - grad_o = param_init_net.ConstantFill( - [param], str(param) + "_grad_o", value=0.0 - ) - - ms = param_init_net.ConstantFill( - [param], str(param) + "_mean_squares", value=0.0 - ) - - mom = param_init_net.ConstantFill([param], str(param) + "_momentum", value=0.0) - - self._aux_params.local.append(ms) - self._aux_params.local.append(mom) - - net.RmsProp( - [grad, ms, mom, ONE], - [grad_o, ms, mom], - decay=self.decay, - momentum=self.momentum, - epsilon=self.epsilon, - engine=self.engine, - ) - - net.MomentumSGDUpdate([grad_o, mom, lr, param], [grad_o, mom, param]) - - def scale_learning_rate(self, scale): - self.alpha *= scale - return - - -def _get_param_to_device(model): - # Infer blob devices by going through the net and param_init_net - # ops and observing the device used to create or use the blob.
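# Put differently: a blob is assigned the device_option of the op that
# produces it (or, failing that, of an op that consumes it). A blob no op
# ever touches cannot be placed at all, which is exactly the case the
# assert in get_param_device below rejects.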
- param_to_device = core.InferBlobDevices(model.net) - param_to_device.update(core.InferBlobDevices(model.param_init_net)) - return param_to_device - - -def get_param_device(param_name, grad, param_to_device=None, default_device=None): - device = default_device - param_to_device = param_to_device or {} - # We first check whether the parameter's device has been inferred. If not, - # we check the gradient. This can happen if the parameter is not the output - # of any op but was created by a FetchBlob. - if param_name in param_to_device: - device = param_to_device[param_name] - else: - if isinstance(grad, core.GradientSlice): - if str(grad.values) in param_to_device: - device = param_to_device[str(grad.values)] - elif str(grad.indices) in param_to_device: - device = param_to_device[str(grad.indices)] - else: - grad_name = str(grad) - if grad_name in param_to_device: - device = param_to_device[grad_name] - - assert device is not None, "Cannot infer device for {}: no op creates it".format( - param_name - ) - return device - - -def get_lr_injection(): - """ - Gets current value for lr_injection, a multiplier for all base - learning rates. - Must set allow_lr_injection=True when building optimizer, as it - relies on synchronization over CPU. - """ - return workspace.FetchBlob(_LEARNING_RATE_INJECTION) - - -def set_lr_injection(lr_injection_value): - """ - Sets lr_injection, a multiplier for all base learning rates. - Must set allow_lr_injection=True when building optimizer, as it - relies on synchronization over CPU. - """ - workspace.FeedBlob( - _LEARNING_RATE_INJECTION, - np.array([float(lr_injection_value)], dtype=np.float32), - ) - - -def _calc_norm_ratio(model, params, name_scope, param_to_device, max_gradient_norm): - with core.NameScope(name_scope): - grad_squared_sums = [] - for i, param in enumerate(params): - device = get_param_device(str(param.blob), param.grad, param_to_device) - - with core.DeviceScope(device): - grad = ( - param.grad - if not isinstance(param.grad, core.GradientSlice) - else param.grad.values - ) - - grad_squared_sum_name = "grad_{}_squared_sum".format(i) - grad_squared_sum = model.net.SumSqrElements(grad, grad_squared_sum_name) - grad_squared_sum_cpu = model.net.EnsureCPUOutput(grad_squared_sum) - grad_squared_sums.append(grad_squared_sum_cpu) - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - grad_squared_full_sum = model.net.Sum( - grad_squared_sums, "grad_squared_full_sum" - ) - global_norm = model.net.Pow( - grad_squared_full_sum, "global_norm", exponent=0.5 - ) - clip_norm = model.param_init_net.ConstantFill( - [], "clip_norm", shape=[], value=float(max_gradient_norm) - ) - max_norm = model.net.Max([global_norm, clip_norm], "max_norm") - norm_ratio = model.net.Div([clip_norm, max_norm], "norm_ratio") - return norm_ratio - - -def _build( - model, - optimizer, - weights_only=False, - use_param_info_optim=True, - max_gradient_norm=None, - allow_lr_injection=False, -): - param_to_device = _get_param_to_device(model) - - # Validate there are no duplicate params - model.Validate() - - params = [] - for param_info in model.GetOptimizationParamInfo(): - if weights_only and param_info.blob not in model.weights: - continue - params.append(param_info) - - lr_multiplier = None - if max_gradient_norm is not None: - lr_multiplier = _calc_norm_ratio( - model, - params, - "norm_clipped_grad_update", - param_to_device, - max_gradient_norm, - ) - - if allow_lr_injection: - if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION): - lr_injection =
model.param_init_net.ConstantFill( - [], _LEARNING_RATE_INJECTION, shape=[1], value=1.0 - ) - else: - lr_injection = _LEARNING_RATE_INJECTION - - if lr_multiplier is None: - lr_multiplier = lr_injection - else: - lr_multiplier = model.net.Mul( - [lr_multiplier, lr_injection], "lr_multiplier", broadcast=1 - ) - optimizer.add_lr_multiplier(lr_multiplier) - - for param_info in params: - param_name = str(param_info.blob) - device = get_param_device(param_name, param_info.grad, param_to_device) - with core.DeviceScope(device): - if param_info.optimizer and use_param_info_optim: - param_info.optimizer(model.net, model.param_init_net, param_info) - else: - optimizer(model.net, model.param_init_net, param_info) - return optimizer - - -def add_weight_decay(model, weight_decay): - """Adds a decay to weights in the model. - - This is a form of L2 regularization. - - Args: - weight_decay: strength of the regularization - """ - _build( - model, - WeightDecayBuilder(weight_decay=weight_decay), - weights_only=True, - use_param_info_optim=False, - ) - - -def build_sgd( - model, - base_learning_rate, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - sgd_optimizer = SgdOptimizer(base_learning_rate, **kwargs) - return _build( - model, - sgd_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_multi_precision_sgd( - model, - base_learning_rate, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - multi_prec_sgd_optimizer = MultiPrecisionSgdOptimizer(base_learning_rate, **kwargs) - return _build( - model, - multi_prec_sgd_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_fp16_sgd(model, base_learning_rate, **kwargs): - fp16_sgd_optimizer = FP16SgdOptimizer(base_learning_rate, **kwargs) - return _build(model, fp16_sgd_optimizer) - - -def build_ftrl(model, engine="SIMD", **kwargs): - if engine == "SIMD": - assert core.IsOperator("Ftrl_ENGINE_SIMD") - assert core.IsOperator("SparseFtrl_ENGINE_SIMD") - ftrl_optimizer = FtrlOptimizer(engine=engine, **kwargs) - return _build(model, ftrl_optimizer) - - -def build_gftrl(model, engine="", **kwargs): - if engine == "SIMD": - assert core.IsOperator("GFtrl_ENGINE_SIMD") - gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) - return _build(model, gftrl_optimizer) - - -def build_adagrad( - model, - base_learning_rate, - parameters=None, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - adagrad_optimizer = AdagradOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - adagrad_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_wngrad( - model, - base_learning_rate, - parameters=None, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - wngrad_optimizer = WngradOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - wngrad_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_storm( - model, - base_learning_rate, - parameters=None, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - storm_optimizer = StormOptimizer(lr=base_learning_rate, **kwargs) - return _build( - model, - storm_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_adadelta( - model, - base_learning_rate, - parameters=None, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs 
-): - adadelta_optimizer = AdadeltaOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - adadelta_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - - -def build_adam( - model, - base_learning_rate, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - adam_optimizer = AdamOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - adam_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - -def build_decay_adagrad( - model, - base_learning_rate, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - decay_adagrad_optimizer = DecayAdagradOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - decay_adagrad_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) - -def build_yellowfin(model, base_learning_rate=0.1, **kwargs): - yellowfin_optimizer = YellowFinOptimizer(alpha=base_learning_rate, **kwargs) - return _build(model, yellowfin_optimizer) - - -def build_rms_prop( - model, - base_learning_rate, - max_gradient_norm=None, - allow_lr_injection=False, - **kwargs -): - rms_prop_optimizer = RmsPropOptimizer(alpha=base_learning_rate, **kwargs) - return _build( - model, - rms_prop_optimizer, - max_gradient_norm=max_gradient_norm, - allow_lr_injection=allow_lr_injection, - ) diff --git a/caffe2/python/optimizer_context.py b/caffe2/python/optimizer_context.py deleted file mode 100644 index b214d136f61a..000000000000 --- a/caffe2/python/optimizer_context.py +++ /dev/null @@ -1,53 +0,0 @@ -## @package optimizer_context -# Module caffe2.python.optimizer_context - - - - - -from caffe2.python import context -from caffe2.python.modifier_context import ( - ModifierContext, UseModifierBase) - - -DEFAULT_OPTIM = 'DEFAULT' - - -class OptimizerContext(ModifierContext, context.DefaultManaged): - """ - provide context to allow param_info to have different optimizers - """ - - def has_optimizer(self, name): - return self._has_modifier(name) - - def get_optimizer(self, name): - assert self.has_optimizer(name), ( - "{} optimizer is not provided!".format(name)) - return self._get_modifier(name) - - -class UseOptimizer(UseModifierBase): - ''' - context class to allow setting the current context. 
- Example usage with brew: - - with UseOptimizer(optim): - brew.func - - with UseOptimizer({'WEIGHT': weight_optim}): - brew.func - - with UseOptimizer({'DEFAULT': optim, 'BIAS': bias_optim, - 'WEIGHT': weight_optim}): - brew.func - - with UseOptimizer(optim1): - brew.func - with UseOptimizer(optim2): - brew.func - - Example usage with layer: - optimizers = {'optim1': optim1, 'optim2': optim2} - with Optimizers(optimizers): - optim = OptimizerContext.current().get_optimizer('optim1') - layer(optim=optim) - ''' - def _context_class(self): - return OptimizerContext diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py deleted file mode 100644 index e84177502be5..000000000000 --- a/caffe2/python/optimizer_test.py +++ /dev/null @@ -1,777 +0,0 @@ - - - -from caffe2.proto import caffe2_pb2 -import caffe2.python.optimizer as optimizer -from caffe2.python.optimizer import ( - build_sgd, build_multi_precision_sgd, build_ftrl, build_gftrl, build_wngrad, - build_adagrad, build_adadelta, build_adam, build_yellowfin, build_rms_prop, - build_storm, build_decay_adagrad, add_weight_decay, SgdOptimizer) -from caffe2.python.optimizer_context import UseOptimizer -from caffe2.python.optimizer_test_util import ( - OptimizerTestBase, LRModificationTestBase -) -from caffe2.python import core, utils, workspace -from caffe2.python.test_util import TestCase -import numpy as np -from numpy.testing import assert_allclose, assert_equal -import math -import unittest - - -class TestLars(OptimizerTestBase, TestCase): - def testSparse(self): - raise unittest.SkipTest("no sparse support") - - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_sgd(model, base_learning_rate=0.1, lars=0.5, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertFalse(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().shared: - tensor = workspace.FetchBlob(param) - np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - - -class TestMomentumSgd(OptimizerTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_sgd(model, base_learning_rate=0.1, momentum=0.1, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().shared: - tensor = workspace.FetchBlob(param) - np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - - -class TestSgd(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_sgd(model, base_learning_rate=0.1, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertFalse(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().shared: - tensor = workspace.FetchBlob(param) - np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - - -class TestMultiPrecisionSgd( - OptimizerTestBase, LRModificationTestBase, TestCase -): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_multi_precision_sgd( - model, base_learning_rate=0.1, **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - 
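# As with plain SGD, only shared auxiliary blobs are expected here (each is
# checked against 1.0 below, matching the shared ONE constant); nothing
# should be registered in the local list.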
self.assertFalse(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().shared: - tensor = workspace.FetchBlob(param) - np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") - def testGPUDense(self): - super().testGPUDense(core.DataType.FLOAT16) - - -class TestFtrl(OptimizerTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_ftrl( - model, - engine=None, - alpha=1.0, - beta=0.1, - lambda1=0.0, - lambda2=0.0, - **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestGFtrl(OptimizerTestBase, TestCase): - def testSparse(self): - raise unittest.SkipTest("no sparse support") - - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_gftrl( - model, - engine=None, - alpha=1.0, - beta=0.1, - lambda1=0.0, - lambda2=0.0, - **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestAdagrad(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_adagrad(model, base_learning_rate=1.0, lars=0.5, **kwargs) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestAdagradWithDedicatedLRIteration(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_adagrad(model, base_learning_rate=1.0, lars=0.5, use_dedicated_lr_iteration_counter=True, **kwargs) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - # check iteration counters have the same value by default - non_lr_iter = workspace.FetchBlob(utils.OPTIMIZER_ITERATION_NAME) - lr_iter = workspace.FetchBlob(utils.OPTIMIZER_ITERATION_LR_NAME) - self.assertEqual(non_lr_iter, lr_iter) - - def testGPUDense(self): - raise unittest.SkipTest("GPU support is not validated") - - -class TestRowWiseAdagrad(OptimizerTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_adagrad( - model, base_learning_rate=1.0, lars=0.5, rowWise=True, **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - def testDense(self): - raise unittest.SkipTest("no dense support") - - def testGPUDense(self): - raise unittest.SkipTest("no dense support") - -class TestRowWiseAdagradWithCounter(OptimizerTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return 
build_adagrad( - model, - base_learning_rate=1.0, - lars=0.5, - rowWise=True, - counter_halflife=5, - **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - iteration_tensor = workspace.FetchBlob("optimizer_iteration") - np.testing.assert_allclose(np.array([2000]), - iteration_tensor, - atol=1e-5) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - def testDense(self): - raise unittest.SkipTest("no dense support") - - def testGPUDense(self): - raise unittest.SkipTest("no dense support") - -class TestWngrad(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_wngrad(model, base_learning_rate=25.0, **kwargs) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestStorm(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_storm(model, base_learning_rate=2.0, **kwargs) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestAdadelta(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_adadelta(model, base_learning_rate=1.0, decay=0.995, **kwargs) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestAdam(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_adam(model, base_learning_rate=0.1, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - iteration_tensor = workspace.FetchBlob("optimizer_iteration") - np.testing.assert_allclose(np.array([2000]), - iteration_tensor, - atol=1e-5) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - -class TestSmartDecayAdam(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - kwargs['beta1'] = 0.0 - return build_adam(model, base_learning_rate=0.1, use_smart_decay=True, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - blob_names = workspace.Blobs() - self.assertTrue(any((bn.endswith('_last_seen') for bn in 
blob_names))) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - -class TestDecayAdagrad(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_decay_adagrad(model, base_learning_rate=1.0, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - iteration_tensor = workspace.FetchBlob("optimizer_iteration") - np.testing.assert_allclose(np.array([2000]), - iteration_tensor, - atol=1e-5) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - def testSparse(self): - raise unittest.SkipTest("no sparse support") - -class TestSparseRAdam(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = True - return build_adam(model, base_learning_rate=0.1, enableRAdam=True, **kwargs) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - iteration_tensor = workspace.FetchBlob("optimizer_iteration") - np.testing.assert_allclose(np.array([2000]), - iteration_tensor, - atol=1e-5) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - -class TestYellowFin(OptimizerTestBase, TestCase): - # YellowFin: An automatic tuner for momentum SGD - # (https://arxiv.org/abs/1706.03471) - def build_optimizer(self, model): - self._skip_gpu = False - return build_yellowfin(model, base_learning_rate=0.1) - - def check_optimizer(self, optimizer): - self.assertTrue(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - self.assertTrue(workspace.HasBlob("optimizer_iteration")) - iteration_tensor = workspace.FetchBlob("optimizer_iteration") - np.testing.assert_allclose(np.array([2000]), - iteration_tensor, - atol=1e-5) - for param in optimizer.get_auxiliary_parameters().shared: - workspace.FetchBlob(param) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - def testSparse(self): - raise unittest.SkipTest("no sparse support") - - def deb(self, val, beta, i, zero_debias): - if zero_debias: - return val / (1.0 - beta ** i) - else: - return val - - def get_lr_mu(self, distance, grad_var, h_min, h_max): - # First tune based on dynamic range - if grad_var == 0: - dr = h_max / h_min - mu = ((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1)) ** 2 - lr_min = (1 + np.sqrt(mu)) ** 2 / h_max - return lr_min, mu - - p = distance ** 2 * h_min ** 2 / 2 / grad_var - w3 = (-math.sqrt(p * p + 4.0 / 27.0 * p * p * p) - p) / 2.0 - w = (1.0 if w3 > 0.0 else -1.0) * math.pow(math.fabs(w3), 1.0 / 3.0) - y = w - p / 3.0 / w - root = y + 1 - root = min(root, 1.0 - 1e-6) - dr = h_max / h_min - mu = max(((np.sqrt(dr) - 1) / (np.sqrt(dr) + 1)) ** 2, root**2) - lr_min = (1 - np.sqrt(mu)) ** 2 / h_min - return lr_min, mu - - def caffe2_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu): - caffe2_res = {} 
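# This harness drives the C++ YellowFin op with a synthetic gradient that
# grows linearly in the iteration count (grad = grad_coef * t) and records
# the tuned lr/mu trajectory together with the debiased statistics, so the
# values can be compared against the pure-NumPy reference in
# numpy_yellowfin below.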
- - alpha = 1.0 - mu = 0.0 - beta = 0.999 - curv_win_width = 20 - epsilon = 1e-6 - - net = core.Net("net") - param_init_net = core.Net("param_init_net") - workspace.ResetWorkspace() - - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - iteration = param_init_net.ConstantFill( - [], - "iteration", - shape=[1], - value=0, - dtype=core.DataType.INT64) - iter_mutex = param_init_net.CreateMutex([], ["iteration_mutex"]) - net.AtomicIter([iter_mutex, iteration], [iteration]) - pre_grad = param_init_net.ConstantFill( - [], - "pre_grad", - shape=[n_dim], - value=grad_coef - ) - if gpu: - iteration = net.CopyCPUToGPU( - [iteration], - "iteration_cpu" - ) - iteration_float = net.Cast([iteration], "iteration_float") - grad = net.Mul([pre_grad, iteration_float], "grad", broadcast=True) - w = param_init_net.ConstantFill([], "w", shape=[n_dim], value=0.0) - - # a hack to create an object with __dict__ - param_info = lambda: None - param_info.blob = w - param_info.grad = grad - - optimizer.YellowFinOptimizer( - alpha=alpha, - mu=mu, - beta=beta, - curv_win_width=curv_win_width, - epsilon=epsilon, - zero_debias=zero_debias - )._run( - net, - param_init_net, - param_info - ) - - workspace.RunNetOnce(param_init_net) - workspace.CreateNet(net, overwrite=True) - for i in range(n_iter): - workspace.RunNet(net) - scalars_memory_blob = workspace.FetchBlob("w_scalars_memory") - g_norm2_avg = scalars_memory_blob[1] - g_norm2_min_avg = scalars_memory_blob[2] - g_norm2_max_avg = scalars_memory_blob[3] - distance_avg = scalars_memory_blob[4] - g_avg_blob = workspace.FetchBlob("w_g_avg") - res_lr = workspace.FetchBlob("w_lr_avg")[0] - res_mu = workspace.FetchBlob("w_mu_avg")[0] - g_deb = self.deb(g_avg_blob, beta, i + 1, zero_debias) - variance = max( - self.deb(g_norm2_avg, beta, i + 1, zero_debias) - - g_deb.dot(g_deb), - epsilon - ) - if i > 0: - caffe2_res[i] = { - 'h_max': np.exp(self.deb(g_norm2_max_avg, - beta, - i + 1, - zero_debias)), - 'h_min': np.exp(self.deb(g_norm2_min_avg, - beta, - i + 1, - zero_debias)), - 'var': variance, - 'dist': self.deb(distance_avg, beta, i + 1, zero_debias), - 'lr': res_lr, - 'mu': res_mu - } - return caffe2_res - - def numpy_yellowfin(self, zero_debias, grad_coef, n_dim, n_iter, gpu): - numpy_res = {} - - target_h_max = 0.0 - target_h_min = 0.0 - target_g_norm_squared_avg = 0.0 - target_g_norm_avg = 0.0 - target_g_avg = 0.0 - target_dist_avg = 0.0 - target_lr = 1.0 - target_mu = 0.0 - - for i in range(n_iter): - grad_val = (i + 1) * grad_coef - target_g_norm_squared_avg = 0.999 * target_g_norm_squared_avg + \ - 0.001 * np.sum((grad_val * np.ones([n_dim, ])) ** 2) - target_g_norm_avg = 0.999 * target_g_norm_avg + \ - 0.001 * np.linalg.norm(grad_val * np.ones([n_dim, ])) - target_g_avg = 0.999 * target_g_avg + 0.001 * grad_val - - target_h_max = 0.999 * target_h_max + \ - 0.001 * np.log(grad_val ** 2 * n_dim) - target_h_min = 0.999 * target_h_min + \ - 0.001 * np.log((max(1, i + 2 - 20) * grad_coef) ** 2 * n_dim) - if zero_debias: - target_var = target_g_norm_squared_avg / \ - (1 - 0.999 ** (i + 1)) - \ - target_g_avg ** 2 * n_dim / (1 - 0.999 ** (i + 1)) ** 2 - else: - target_var = target_g_norm_squared_avg - \ - target_g_avg ** 2 * n_dim - target_dist_avg = 0.999 * target_dist_avg + \ - 0.001 * target_g_norm_avg / target_g_norm_squared_avg - - if i > 0: - if zero_debias: - lr, mu = self.get_lr_mu( - target_dist_avg / (1.0 - 0.999 ** (i + 1)), - target_var, - np.exp(target_h_min / (1.0 - 0.999 ** (i + 1))), - np.exp(target_h_max / (1.0 - 0.999 ** (i + 1)))) - target_lr 
= 0.999 * target_lr + 0.001 * lr - target_mu = 0.999 * target_mu + 0.001 * mu - numpy_res[i] = { - 'h_max': np.exp(target_h_max / (1 - 0.999 ** (i + 1))), - 'h_min': np.exp(target_h_min / (1 - 0.999 ** (i + 1))), - 'var': target_var, - 'dist': target_dist_avg / (1 - 0.999 ** (i + 1)), - 'lr': target_lr, - 'mu': target_mu - } - else: - lr, mu = self.get_lr_mu( - target_dist_avg, - target_var, - np.exp(target_h_min), - np.exp(target_h_max)) - target_lr = 0.999 * target_lr + 0.001 * lr - target_mu = 0.999 * target_mu + 0.001 * mu - numpy_res[i] = { - 'h_max': np.exp(target_h_max), - 'h_min': np.exp(target_h_min), - 'var': target_var, - 'dist': target_dist_avg, - 'lr': target_lr, - 'mu': target_mu - } - return numpy_res - - def compare_yellowfin_models(self, - model0, - model1, - zero_debias, - grad_coef, - n_dim, - n_iter, - gpu): - model0_res = model0(zero_debias, grad_coef, n_dim, n_iter, gpu) - model1_res = model1(zero_debias, grad_coef, n_dim, n_iter, gpu) - assert_equal(len(model0_res), len(model1_res)) - for i in range(1, len(model0_res)): - assert_equal(model0_res[i].keys(), model1_res[i].keys()) - for feat in model0_res[i].keys(): - err_msg = \ - 'i=' + str(i) + ',\n' + \ - 'feat=' + feat + ',\n' + \ - 'grad_coef=' + str(grad_coef) + ',\n' + \ - 'zero_debias=' + str(zero_debias) - assert_allclose(model0_res[i][feat], - model1_res[i][feat], - rtol=1e-2, - err_msg=err_msg) - - @unittest.skip("Results might vary too much. Only for individual use.") - def test_caffe2_cpu_vs_numpy(self): - n_dim = 1000000 - n_iter = 50 - cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU) - with core.DeviceScope(cpu_device_opt): - for zero_debias, grad_coef in [ - (False, 1.0), - (False, 0.1), - (False, 0.01), - (True, 1.0) - ]: - self.compare_yellowfin_models( - self.caffe2_yellowfin, - self.numpy_yellowfin, - zero_debias, - grad_coef, - n_dim, - n_iter, - gpu=False - ) - - @unittest.skip("Results might vary too much. 
Only for individual use.") - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - def test_caffe2_gpu_vs_numpy(self): - n_dim = 1000000 - n_iter = 50 - gpu_device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) - with core.DeviceScope(gpu_device_opt): - for zero_debias in [False, True]: - for grad_coef in [1.0, 0.1, 0.01]: - self.compare_yellowfin_models( - self.caffe2_yellowfin, - self.numpy_yellowfin, - zero_debias, - grad_coef, - n_dim, - n_iter, - gpu=True - ) - - -class TestRmsProp(OptimizerTestBase, LRModificationTestBase, TestCase): - def build_optimizer(self, model, **kwargs): - self._skip_gpu = False - return build_rms_prop( - model, base_learning_rate=0.1, epsilon=0.1, **kwargs - ) - - def check_optimizer(self, optimizer): - self.assertFalse(optimizer.get_auxiliary_parameters().shared) - self.assertTrue(optimizer.get_auxiliary_parameters().local) - for param in optimizer.get_auxiliary_parameters().local: - workspace.FetchBlob(param) - - def testSparse(self): - raise unittest.SkipTest("no sparse support") - - -class TestMultiOptimizers(TestCase): - def test_multiple_optimizers(self): - from caffe2.python import brew, core, optimizer - from caffe2.python.model_helper import ModelHelper - - model = ModelHelper(name="test") - fc1 = brew.fc(model, 'data', 'fc1', 100, 50) - fc2 = brew.fc(model, fc1, 'fc2', 50, 25) - pred = brew.fc(model, fc2, 'fc3', 25, 10) - (softmax, loss) = model.SoftmaxWithLoss( - [pred, 'label'], - ['softmax', 'loss'], - ) - model.AddGradientOperators([loss]) - - param_to_device = optimizer._get_param_to_device(model) - - def infer_blob_device(blob_name): - return optimizer.get_param_device( - blob_name, "{}_grad".format(blob_name), param_to_device - ) - - sgd_1 = optimizer.SgdOptimizer(base_learning_rate=0.1) - sgd_2 = optimizer.SgdOptimizer(base_learning_rate=0.2) - adagrad = optimizer.AdagradOptimizer() - - # Check same optimizer share the same learning rate. - with core.DeviceScope(infer_blob_device("fc1_w")): - sgd_1(model.net, model.param_init_net, "fc1_w", "fc1_w_grad") - with core.DeviceScope(infer_blob_device("fc1_b")): - sgd_1(model.net, model.param_init_net, "fc1_b", "fc1_b_grad") - fc1_lr_blobs = [] - for op in model.net.Proto().op: - if op.type == 'WeightedSum' and op.input[0] == 'fc1_w' or \ - op.input[0] == 'fc1_b': - fc1_lr_blobs.append(op.input[3]) - self.assertEqual(fc1_lr_blobs[0], fc1_lr_blobs[1]) - - # Check different instance of the same optimizer has a different lr. 
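# (Each SgdOptimizer instance builds its own LearningRate blob, named with
# a per-class instance counter as TestOptimizerContext below demonstrates,
# so the fc2 updates must not reuse any of fc1's lr blobs.)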
- with core.DeviceScope(infer_blob_device("fc2_w")): - sgd_2(model.net, model.param_init_net, "fc2_w", "fc2_w_grad") - with core.DeviceScope(infer_blob_device("fc2_b")): - sgd_2(model.net, model.param_init_net, "fc2_b", "fc2_b_grad") - fc2_lr_blobs = [] - for op in model.net.Proto().op: - if op.type == 'WeightedSum' and op.input[0] == 'fc2_w' or \ - op.input[0] == 'fc2_b': - self.assertTrue(op.input[3] not in fc1_lr_blobs) - fc2_lr_blobs.append(op.input[3]) - self.assertEqual(fc2_lr_blobs[0], fc2_lr_blobs[1]) - - # Check different optimizer type case - with core.DeviceScope(infer_blob_device("fc3_w")): - adagrad(model.net, model.param_init_net, "fc3_w", "fc3_w_grad") - with core.DeviceScope(infer_blob_device("fc3_b")): - adagrad(model.net, model.param_init_net, "fc3_b", "fc3_b_grad") - fc3_lr_blobs = [] - for op in model.net.Proto().op: - if op.type == 'Adagrad' and op.input[0] == 'fc3_w' or \ - op.input[0] == 'fc3_b': - self.assertTrue(op.input[3] not in fc2_lr_blobs) - self.assertTrue(op.input[3] not in fc1_lr_blobs) - fc3_lr_blobs.append(op.input[3]) - self.assertEqual(fc3_lr_blobs[0], fc3_lr_blobs[1]) - - -class TestWeightDecay(TestCase): - - def test_weight_decay(self): - from caffe2.python import brew - from caffe2.python.model_helper import ModelHelper - - model = ModelHelper(name="test", arg_scope={'order': 'NCHW'}) - cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4) - a = brew.fc(model, cnv, 'a', 100, 200) - pred = brew.fc(model, a, 'b', 200, 5) - (softmax, loss) = model.SoftmaxWithLoss( - [pred, 'label'], - ['softmax', 'loss'], - ) - model.AddGradientOperators([loss]) - - add_weight_decay(model, weight_decay=1e-4) - build_sgd(model, 0.11) - - expected_weight_grad = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'} - - # Check the proto that all weights are decayed and not non-weights - # are decayed. - for op in model.net.Proto().op: - if op.type == 'WeightedSum' and 'wd_0_0' in op.input: - if op.output[0] not in expected_weight_grad: - print( - "Unexpected param for weight_decay: {}". 
- format(op.output[0]) - ) - self.assertTrue(op.output[0] in expected_weight_grad) - expected_weight_grad.remove(op.output[0]) - - self.assertEqual( - expected_weight_grad, - set(), - "Not all weights were decayed: {}".format(expected_weight_grad) - ) - - -class TestOptimizerContext(TestCase): - - def test_optimizer_context(self): - from caffe2.python import brew, optimizer - from caffe2.python.model_helper import ModelHelper - - model = ModelHelper(name="test", arg_scope={'order': 'NCHW'}) - count = optimizer._optimizer_instance_count['SgdOptimizer'] - cnv_optim = SgdOptimizer(0.15) - weight_optim = SgdOptimizer(0.2) - bias_optim = SgdOptimizer(0.1) - - with UseOptimizer(cnv_optim): - cnv = brew.conv(model, 'data', 'cnv', 32, 32, 4) - with UseOptimizer({'WEIGHT': weight_optim, 'BIAS': bias_optim}): - a = brew.fc(model, cnv, 'a', 100, 200) - pred = brew.fc(model, a, 'b', 200, 5) - (softmax, loss) = model.SoftmaxWithLoss( - [pred, 'label'], - ['softmax', 'loss'], - ) - model.AddGradientOperators([loss]) - - add_weight_decay(model, weight_decay=1e-4) - # use the following optimizer if none specified in param_info - build_sgd(model, 0.11) - expected_weight_grad = {'b_w_grad', 'a_w_grad', 'cnv_w_grad'} - expected_learning_rate = { - "SgdOptimizer_{}_lr_cpu".format(count): -0.15, - "SgdOptimizer_{}_lr_cpu".format(count + 1): -0.2, - "SgdOptimizer_{}_lr_cpu".format(count + 2): -0.1, - "SgdOptimizer_{}_lr_cpu".format(count + 3): -0.11 - } - - for op in model.net.Proto().op: - # Check the proto that all weights are decayed and not non-weights - # are decayed. - if op.type == 'WeightedSum' and 'wd_0_0' in op.input: - if op.output[0] not in expected_weight_grad: - print( - "Unexpected param for weight_decay: {}". - format(op.output[0]) - ) - self.assertTrue(op.output[0] in expected_weight_grad) - expected_weight_grad.remove(op.output[0]) - # Check the learning rate for each parameter - if op.type == 'LearningRate': - val = 0 - for arg in op.arg: - if arg.name == 'base_lr': - val = arg.f - self.assertAlmostEqual( - val, - expected_learning_rate[op.output[0]] - ) - - self.assertEqual( - expected_weight_grad, - set(), - "Not all weights were decayed: {}".format(expected_weight_grad) - ) diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py deleted file mode 100644 index 2c0eefa71012..000000000000 --- a/caffe2/python/optimizer_test_util.py +++ /dev/null @@ -1,237 +0,0 @@ -## @package optimizer_test_util -# Module caffe2.python.optimizer_test_util - - - - - -import unittest -import numpy as np -from caffe2.python import brew, core, workspace, cnn, optimizer -from caffe2.python.modeling.initializers import ( - Initializer, PseudoFP16Initializer) - -from caffe2.python.model_helper import ModelHelper - - -class OptimizerTestBase: - """ - This is an abstract base class. - Don't inherit from unittest.TestCase, and don't name it 'Test*'. - Do, however, do these things in classes which inherit from this. 
- """ - - def _createDense(self, dtype=core.DataType.FLOAT): - perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32) - np.random.seed(123) # make test deterministic - numpy_dtype = np.float32 if dtype == core.DataType.FLOAT else np.float16 - initializer = Initializer if dtype == core.DataType.FLOAT else \ - PseudoFP16Initializer - data = np.random.randint( - 2, - size=(20, perfect_model.size)).astype(numpy_dtype) - label = np.dot(data, perfect_model)[:, np.newaxis] - - model = ModelHelper(name="test", arg_scope={'order': 'NCHW'}) - out = brew.fc( - model, - 'data', 'fc', perfect_model.size, 1, ('ConstantFill', {}), - ('ConstantFill', {}), axis=0, - WeightInitializer=initializer, BiasInitializer=initializer - ) - if dtype == core.DataType.FLOAT16: - out = model.HalfToFloat(out, out + "_fp32") - sq = model.SquaredL2Distance([out, 'label']) - loss = model.AveragedLoss(sq, "avg_loss") - grad_map = model.AddGradientOperators([loss]) - self.assertIsInstance(grad_map['fc_w'], core.BlobReference) - return (model, perfect_model, data, label) - - def testDense(self): - model, perfect_model, data, label = self._createDense() - optimizer = self.build_optimizer(model) - workspace.FeedBlob('data', data[0]) - workspace.FeedBlob('label', label[0]) - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - for _ in range(2000): - idx = np.random.randint(data.shape[0]) - workspace.FeedBlob('data', data[idx]) - workspace.FeedBlob('label', label[idx]) - workspace.RunNet(model.net.Proto().name) - - np.testing.assert_allclose( - perfect_model[np.newaxis, :], - workspace.FetchBlob('fc_w'), - atol=1e-2 - ) - self.check_optimizer(optimizer) - - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") - def testGPUDense(self, dtype=core.DataType.FLOAT): - device_opt = core.DeviceOption(workspace.GpuDeviceType, 0) - with core.DeviceScope(device_opt): - model, _perfect_model, data, label = self._createDense(dtype) - if dtype == core.DataType.FLOAT16: - fc_fp32_for_host = model.HalfToFloat('fc', 'fc_fp32_for_host') - model.CopyGPUToCPU(fc_fp32_for_host, 'fc_cpu') - else: - model.CopyGPUToCPU('fc', 'fc_cpu') - workspace.FeedBlob('data', data[0]) - workspace.FeedBlob('label', label[0]) - - # Add some CPU ops - brew.fc(model, 'fc_cpu', 'fc2', dim_in=1, dim_out=10, axis=0) - - # Create optimizer in default device scope - self.build_optimizer(model) - - if self._skip_gpu: - return - - # Run net to see it does not crash - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - workspace.RunNet(model.net.Proto().name) - - def testSparse(self): - # to test duplicated indices we assign two indices to each weight and - # thus each weight might count once or twice - DUPLICATION = 2 - perfect_model = np.array([2, 6, 5, 0, 1]).astype(np.float32) - np.random.seed(123) # make test deterministic - data = np.random.randint( - 2, - size=(20, perfect_model.size * DUPLICATION)).astype(np.float32) - label = np.dot(data, np.repeat(perfect_model, DUPLICATION)) - - model = cnn.CNNModelHelper("NCHW", name="test") - # imitate what model wrapper does - w = model.param_init_net.ConstantFill( - [], 'w', shape=[perfect_model.size], value=0.0) - model.params.append(w) - picked = model.net.Gather([w, 'indices'], 'gather') - out = model.ReduceFrontSum(picked, 'sum') - - sq = model.SquaredL2Distance([out, 'label']) - loss = model.AveragedLoss(sq, "avg_loss") - grad_map = model.AddGradientOperators([loss]) - self.assertIsInstance(grad_map['w'], core.GradientSlice) - optimizer = 
self.build_optimizer(model) - - workspace.CreateBlob('indices') - workspace.CreateBlob('label') - - for indices_type in [np.int32, np.int64]: - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - for _ in range(2000): - idx = np.random.randint(data.shape[0]) - # transform into indices of binary features - indices = np.repeat(np.arange(perfect_model.size), - DUPLICATION)[data[idx] == 1] - if indices.size == 0: - continue - workspace.FeedBlob( - 'indices', - indices.reshape((indices.size,)).astype(indices_type) - ) - workspace.FeedBlob('label', - np.array(label[idx]).astype(np.float32)) - workspace.RunNet(model.net.Proto().name) - - np.testing.assert_allclose( - perfect_model, - workspace.FetchBlob('w'), - atol=1e-2 - ) - self.check_optimizer(optimizer) - - -class LRModificationTestBase: - """ - This is an abstract base class. - Don't inherit from unittest.TestCase, and don't name it 'Test*'. - Do, however, do these things in classes which inherit from this. - """ - - def _gradient_ratio_reference(self, model, params, max_gradient_norm): - from caffe2.python import core - sum_squared_norms = 0.0 - for param in params: - grad = ( - model.param_to_grad[param] - if not isinstance( - model.param_to_grad[param], - core.GradientSlice, - ) else model.param_to_grad[param].values - ) - val = workspace.FetchBlob(grad) - sum_squared_norms += np.power(np.linalg.norm(val), 2.0) - global_norm = np.sqrt(sum_squared_norms) - clip_norm = max_gradient_norm - norm_ratio = clip_norm / np.maximum(clip_norm, global_norm) - return norm_ratio - - def test_global_norm_based_gradient_clipping(self): - max_gradient_norm = 1.0 - model, perfect_model, data, label = self._createDense() - opt = self.build_optimizer(model, max_gradient_norm=max_gradient_norm) - - params = [] - for param in model.GetParams(top_scope=True): - if param in model.param_to_grad: - if not isinstance( - model.param_to_grad[param], - core.GradientSlice, - ): - params.append(param) - - workspace.FeedBlob('data', data[0]) - workspace.FeedBlob('label', label[0]) - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - self.assertIsNotNone(opt._lr_multiplier) - - # Run net once - idx = np.random.randint(data.shape[0]) - workspace.FeedBlob('data', data[idx]) - workspace.FeedBlob('label', label[idx]) - workspace.RunNet(model.net.Proto().name) - - reference = self._gradient_ratio_reference( - model, - params, - max_gradient_norm, - ) - norm_ratio = workspace.FetchBlob( - 'norm_clipped_grad_update/norm_ratio') - np.testing.assert_almost_equal(norm_ratio, reference) - self.assertTrue( - reference < 1.0, "Bad test, gradient not being scaled." - ) - - def test_lr_injection(self): - model, perfect_model, data, label = self._createDense() - opt = self.build_optimizer( - model, max_gradient_norm=1, allow_lr_injection=True - ) - - workspace.FeedBlob('data', data[0]) - workspace.FeedBlob('label', label[0]) - workspace.RunNetOnce(model.param_init_net) - workspace.CreateNet(model.net, True) - - # Test LR injection initialized properly - self.assertIsNotNone(opt._lr_multiplier) - self.assertEqual(optimizer.get_lr_injection(), 1) - - # Test that we're able to modify the value of the lr_injection - optimizer.set_lr_injection(0) - self.assertEqual(optimizer.get_lr_injection(), 0) - - # Test that setting the lr_injector properly propagates to the - # lr_multiplier. 
Here, we have both lr_injector and norm_ratio that - # affect the lr_multiplier - workspace.RunNet(model.net.Proto().name) - self.assertEqual(workspace.FetchBlob('lr_multiplier'), 0) diff --git a/caffe2/python/parallel_workers.py b/caffe2/python/parallel_workers.py deleted file mode 100644 index a561ae43acb9..000000000000 --- a/caffe2/python/parallel_workers.py +++ /dev/null @@ -1,294 +0,0 @@ -# @package parallel_workers -# Module caffe2.python.parallel_workers - - - - - - -''' -This module provides a python-land multithreaded mechanism for executing work. - -Basic usage is as follows: - coordinator = parallel_workers.init_workers( - my_worker_fun, - worker_name="train" - ) - ... - coordinator.start() - -The first argument is the function to run in a loop on potentially multiple threads. -It has the call signature - worker_fun(worker_id) - -Argument 'worker_name' is used to distinguish different workers, -such as workers processing train data or workers processing test data. - -Optionally, one can define an "init function" that is called once before -threads start, and has the call signature: - my_init_fun(worker_coordinator, global_coordinator) - -Note that for data_parallel_models, init_workers will be called -for each GPU. Note that the 'coordinator' returned by the function is the same -each time. -''' - -import logging -import threading -import atexit -import time -import collections -import traceback - -from abc import ABCMeta, abstractmethod - -log = logging.getLogger("parallel_workers") -log.setLevel(logging.INFO) -LOG_INT_SECS = 60 - - -def init_workers( - worker_fun, - num_worker_threads=2, - worker_name="train", - init_fun=None, - external_loggers=None, - shutdown_fun=None, -): - global global_coordinator - - metrics = Metrics(external_loggers) - - worker_ids = [ - global_coordinator.get_new_worker_id() - for i in range(num_worker_threads) - ] - - # Create coordinator object - coordinator = WorkerCoordinator( - worker_name, worker_ids, init_fun, shutdown_fun=shutdown_fun) - - # Launch fetch worker threads - workers = [ - threading.Thread( - target=run_worker, - name="parallel_workers worker id {}".format(worker_id), - args=[coordinator, - Worker(coordinator, worker_id, worker_fun, metrics)], - ) for worker_id in worker_ids - ] - - coordinator._workers = workers - global_coordinator.add(coordinator) - - return global_coordinator - - -class Metrics: - def __init__(self, external_loggers): - self._metrics = collections.defaultdict(lambda: 0) - self._external_loggers = external_loggers - - def reset_metrics(self): - self._metrics = collections.defaultdict(lambda: 0) - - def log_metrics(self): - if not self._external_loggers: - return - for logger in self._external_loggers: - try: - logger.log(self._metrics) - except Exception as e: - print("Failed to call ExternalLogger: {}".format(e)) - - def put_metric(self, key, value, count=True): - self._metrics[key] += value - if count: - count_key = '{}_count'.format(key) - self._metrics[count_key] += 1 - - -class State(metaclass=ABCMeta): - - @abstractmethod - def start(self): - pass - - @abstractmethod - def stop(self): - pass - - @abstractmethod - def cleanup(self): - pass - - -class WorkerCoordinator: - def __init__( - self, worker_name, worker_ids, init_fun, - state=None, shutdown_fun=None - ): - self._active = True - self._started = False - self._workers = [] - self._worker_name = worker_name - self._worker_ids = worker_ids - self._init_fun = init_fun - self._state = state - self._shutdown_fun = shutdown_fun - - def is_active(self): -
return self._active - - def init(self, global_coordinator): - if self._init_fun and not self._started: - data_coordinator = self - self._init_fun(data_coordinator, global_coordinator) - - def _start(self): - if self._started: - return - self._active = True - self._started = True - if self._state: - self._state.start() - - for w in self._workers: - w.daemon = True - w.start() - - def _stop(self, reason=None): - self._active = False - if reason is not None: - log.error("Data input failed due to an error: {}".format(reason)) - if self._shutdown_fun and self._started: - self._shutdown_fun() - if self._state: - self._state.stop() - - self._started = False - - def _wait_finish(self, cleanup=None): - print("Wait for workers to die: {}".format(self._worker_name)) - for w in self._workers: - if w != threading.current_thread(): - w.join(5.0) # don't wait forever; a thread may be blocked on I/O - success = True - for w in self._workers: - if w.is_alive(): - print("Worker {} failed to close while waiting".format(w)) - success = False - - # Release memory for the scratch blobs - if success and self._state: - self._state.cleanup() - - print("All workers terminated: {}".format(success)) - return success - - def get_worker_ids(self): - return self._worker_ids - - -class GlobalWorkerCoordinator: - def __init__(self): - self._coordinators = [] - self._fetcher_id_seq = 0 - self._worker_ids = [] - self.register_shutdown_handler() - - def add(self, coordinator): - self._coordinators.append(coordinator) - - def get_new_worker_id(self): - worker_id = self._fetcher_id_seq - self._worker_ids.append(worker_id) - self._fetcher_id_seq += 1 - return worker_id - - def get_worker_ids(self): - return self._worker_ids - - def start(self): - # Run init and start in separate for loops to - # ensure init happens serially before any threads are spawned.
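# (An init_fun may feed blobs or build state that another coordinator's
# workers depend on, so every init must complete before any worker thread
# is started.)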
- for c in self._coordinators: - c.init(self) - for c in self._coordinators: - c._start() - - def stop(self): - all_success = True - for c in self._coordinators: - c._stop() - for c in self._coordinators: - success = c._wait_finish() - all_success = all_success and success - self._coordinators = [] - return all_success - - def stop_coordinator(self, worker_name): - ''' - Stop a specific coordinator - ''' - for c in self._coordinators: - if c._worker_name == worker_name: - c._stop() - c._wait_finish() - self._coordinators = [ - c for c in self._coordinators - if c._worker_name != worker_name - ] - - def register_shutdown_handler(self): - def cleanup(): - self.stop() - - atexit.register(cleanup) - - -class Worker: - def __init__( - self, - coordinator, - worker_id, - worker_fun=None, - metrics=None - ): - self._coordinator = coordinator - self._worker_id = worker_id - self._worker_fun = worker_fun - self._metrics = metrics - - def start(self): - self._start_time = time.time() - - def run(self): - self._worker_fun(self._worker_id) - - def handle_exception(self, e): - traceback.print_exc() - logging.exception("Exception in worker", e) - self._coordinator._stop("Exception in worker {}: {}".format( - self._worker_id, e - )) - - def finish(self): - self._metrics.put_metric( - 'worker_time', time.time() - self._start_time) - self._metrics.log_metrics() - - -global_coordinator = GlobalWorkerCoordinator() - - -def run_worker(coordinator, worker): - while coordinator.is_active(): - worker.start() - try: - worker.run() - except Exception as e: - worker.handle_exception(e) - finally: - worker.finish() diff --git a/caffe2/python/parallel_workers_test.py b/caffe2/python/parallel_workers_test.py deleted file mode 100644 index a7c5c7271038..000000000000 --- a/caffe2/python/parallel_workers_test.py +++ /dev/null @@ -1,119 +0,0 @@ - - - - - -import unittest - -from caffe2.python import workspace, core -import caffe2.python.parallel_workers as parallel_workers - - -def create_queue(): - queue = 'queue' - - workspace.RunOperatorOnce( - core.CreateOperator( - "CreateBlobsQueue", [], [queue], num_blobs=1, capacity=1000 - ) - ) - # Technically, blob creations aren't thread safe. 
Since the unittest below - # does RunOperatorOnce instead of CreateNet+RunNet, we have to precreate - # all blobs beforehand - for i in range(100): - workspace.C.Workspace.current.create_blob("blob_" + str(i)) - workspace.C.Workspace.current.create_blob("status_blob_" + str(i)) - workspace.C.Workspace.current.create_blob("dequeue_blob") - workspace.C.Workspace.current.create_blob("status_blob") - - return queue - - -def create_worker(queue, get_blob_data): - def dummy_worker(worker_id): - blob = 'blob_' + str(worker_id) - - workspace.FeedBlob(blob, get_blob_data(worker_id)) - - workspace.RunOperatorOnce( - core.CreateOperator( - 'SafeEnqueueBlobs', [queue, blob], [blob, 'status_blob_' + str(worker_id)] - ) - ) - - return dummy_worker - - -def dequeue_value(queue): - dequeue_blob = 'dequeue_blob' - workspace.RunOperatorOnce( - core.CreateOperator( - "SafeDequeueBlobs", [queue], [dequeue_blob, 'status_blob'] - ) - ) - - return workspace.FetchBlob(dequeue_blob) - - -class ParallelWorkersTest(unittest.TestCase): - def testParallelWorkers(self): - workspace.ResetWorkspace() - - queue = create_queue() - dummy_worker = create_worker(queue, str) - worker_coordinator = parallel_workers.init_workers(dummy_worker) - worker_coordinator.start() - - for _ in range(10): - value = dequeue_value(queue) - self.assertTrue( - value in [b'0', b'1'], 'Got unexpected value ' + str(value) - ) - - self.assertTrue(worker_coordinator.stop()) - - def testParallelWorkersInitFun(self): - workspace.ResetWorkspace() - - queue = create_queue() - dummy_worker = create_worker( - queue, lambda worker_id: workspace.FetchBlob('data') - ) - workspace.FeedBlob('data', 'not initialized') - - def init_fun(worker_coordinator, global_coordinator): - workspace.FeedBlob('data', 'initialized') - - worker_coordinator = parallel_workers.init_workers( - dummy_worker, init_fun=init_fun - ) - worker_coordinator.start() - - for _ in range(10): - value = dequeue_value(queue) - self.assertEqual( - value, b'initialized', 'Got unexpected value ' + str(value) - ) - - # A best effort attempt at a clean shutdown - worker_coordinator.stop() - - def testParallelWorkersShutdownFun(self): - workspace.ResetWorkspace() - - queue = create_queue() - dummy_worker = create_worker(queue, str) - workspace.FeedBlob('data', 'not shutdown') - - def shutdown_fun(): - workspace.FeedBlob('data', 'shutdown') - - worker_coordinator = parallel_workers.init_workers( - dummy_worker, shutdown_fun=shutdown_fun - ) - worker_coordinator.start() - - self.assertTrue(worker_coordinator.stop()) - - data = workspace.FetchBlob('data') - self.assertEqual(data, b'shutdown', 'Got unexpected value ' + str(data)) diff --git a/caffe2/python/parallelize_bmuf_distributed_test.py b/caffe2/python/parallelize_bmuf_distributed_test.py deleted file mode 100644 index c38a4ccc34d7..000000000000 --- a/caffe2/python/parallelize_bmuf_distributed_test.py +++ /dev/null @@ -1,294 +0,0 @@ - - - - -from multiprocessing import Process, Manager - -import numpy as np -import unittest -import tempfile -import shutil -import logging - -from hypothesis import given, settings -import hypothesis.strategies as st - -from caffe2.python import workspace - -log = logging.getLogger("parallelize_bmuf_distributed_test") -log.setLevel(logging.INFO) - - -def bmuf_process(filestore_dir, process_id, shared_results, - cpu_device=False, nesterov=False): - # We need to import caffe2 in every process to initialize CUDA independently. 
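Deferring the import into the child-process body is the standard way to give each process its own library (and CUDA) state; a stdlib-only sketch of the idiom, where json is just a stand-in for the heavy per-process import:

from multiprocessing import Process, Queue

def worker(idx, out):
    # the heavy import happens here, inside the child, so every process
    # initializes the library (e.g. its CUDA context) independently
    import json  # stand-in for a heavy import such as caffe2
    out.put((idx, json.dumps({"worker": idx})))

if __name__ == "__main__":
    out = Queue()
    procs = [Process(target=worker, args=(i, out)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(sorted(out.get() for _ in procs))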
- from caffe2.python import core, cnn, data_parallel_model, dyndep - from caffe2.proto import caffe2_pb2 - dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops") - - if not cpu_device: - if not workspace.has_gpu_support: - log.info('No GPU support test is Ignored.') - return - if workspace.NumGpuDevices() < 4: - log.info('Not enough GPU support, test IGNORED') - return - - model = cnn.CNNModelHelper( - order="NHWC", - name="test" - ) - if not cpu_device: - device_type = workspace.GpuDeviceType - device_prefix = "gpu" - else: - device_type = caffe2_pb2.CPU - device_prefix = "cpu" - - devices = [0, 1] if process_id == 0 else [2, 3] - - def _model_build_fun(model, loss_scale): - fc = model.FC( - "data", "fc", 16, 1, ("ConstantFill", {}), ("ConstantFill", {}) - ) - fc_fl = model.FlattenToVec(fc, "fc_fl") - sigm = model.Sigmoid(fc_fl, "sigm") - sq = model.SquaredL2Distance([sigm, "label"], "sq") - loss = model.AveragedLoss(sq, "loss") - loss = model.Scale(loss, scale=loss_scale) - - # For testing explicit sync - model.param_init_net.UniformFill([], ["sync_num"], shape=[1]) - return [loss] - - def _input_builder_fun(model): - return None - - def _param_update_fun(model): - ITER = model.Iter("ITER") - LR = model.net.LearningRate( - [ITER], - "LR", - base_lr=(-0.1), - policy="fixed", - ) - ONE = model.param_init_net.ConstantFill( - [], "ONE", shape=[1], value=1.0, - ) - for param in model.GetParams(): - grad = model.param_to_grad[param] - model.WeightedSum([param, ONE, grad, LR], param) - - def _generate_data(devices, process_id, device_type, device_prefix): - np.random.seed(26 + process_id * 10) - # Each run has same input, independent of number of gpus - batch_size = 64 - for _ in range(0, 10): - full_data = np.random.rand(batch_size, 16) - full_labels = np.round(full_data[:, 0]) - batch_per_device = batch_size // len(devices) - - for (j, g) in enumerate(devices): - st = j * batch_per_device - en = st + batch_per_device - data = full_data[st:en, :].astype(np.float32) - labels = full_labels[st:en].astype(np.float32) - with core.DeviceScope(core.DeviceOption(device_type, g)): - workspace.FeedBlob("{}_{}/data".format(device_prefix, g), data) - workspace.FeedBlob("{}_{}/label".format(device_prefix, g), labels) - - _generate_data(devices, process_id, device_type, device_prefix) - - workspace.RunOperatorOnce( - core.CreateOperator( - "FileStoreHandlerCreate", [], ["store_handler"], - path=filestore_dir - ) - ) - rendezvous = dict( - kv_handler="store_handler", - shard_id=process_id, - num_shards=2, - engine="GLOO", - exit_nets=None - ) - - data_parallel_model.Parallelize_BMUF( - model, - _input_builder_fun, - _model_build_fun, - _param_update_fun, - devices=devices, - rendezvous=rendezvous, - nesterov=nesterov, - add_blobs_to_sync=["sync_num"], - cpu_device=cpu_device - ) - - data_parallel_model.RunInitNet(model) - - def _device_pid(device, pid): - if pid == 1: - return device + 2 - return device - - np.testing.assert_equal( - workspace.FetchBlob("{}_{}/fc_w_v".format( - device_prefix, _device_pid(0, process_id))), - np.zeros(16).astype(np.float32).reshape(1, 16) - ) - - # Run the algorithm for one iteration to have non-zero params. 
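For orientation, the quantities this test later checks (the v_* momentum blobs, the g_* block params, and the Nesterov variant) follow BMUF's block-momentum update. A numpy sketch under the test's apparent defaults — block momentum 0.75, block learning rate 1.0 — not the data_parallel_model implementation:

import numpy as np

def bmuf_step(w_global, replica_params, v_prev, momentum=0.75, nesterov=False):
    # block "gradient": how far the averaged local replicas moved
    g = np.mean(replica_params, axis=0) - w_global
    v = momentum * v_prev + g            # block momentum
    w = w_global + v                     # global parameter update
    if nesterov:
        w = w - momentum * (v - v_prev)  # Nesterov-style correction
    return w, v

w, v = bmuf_step(np.zeros(3), [np.ones(3), 3.0 * np.ones(3)], np.zeros(3))
print(w, v)  # with zero momentum history, both equal the replica mean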
- data_parallel_model.RunNet(model, 1) - - # Save iteration momentum and post local update params - results = {} - v_b_ = workspace.FetchBlob( - "{}_{}/fc_b_v".format(device_prefix, _device_pid(0, process_id))) - v_w_ = workspace.FetchBlob( - "{}_{}/fc_w_v".format(device_prefix, _device_pid(0, process_id))) - - results['v_b_'] = v_b_ - results['v_w_'] = v_w_ - - workspace.RunNetOnce(model.net) - - b_0_ = workspace.FetchBlob( - "{}_{}/fc_b".format(device_prefix, _device_pid(0, process_id))) - w_0_ = workspace.FetchBlob( - "{}_{}/fc_w".format(device_prefix, _device_pid(0, process_id))) - b_1_ = workspace.FetchBlob( - "{}_{}/fc_b".format(device_prefix, _device_pid(1, process_id))) - w_1_ = workspace.FetchBlob( - "{}_{}/fc_w".format(device_prefix, _device_pid(1, process_id))) - - results['b_0_'] = b_0_ - results['w_0_'] = w_0_ - results['b_1_'] = b_1_ - results['w_1_'] = w_1_ - - # Test sync - if process_id == 0: - workspace.FeedBlob( - device_prefix + "_0/sync_num", - np.array([2603]).astype(np.float32), - device_option=core.DeviceOption(device_type, 0)) - - # Compute block gradients. - b_g_ = workspace.FetchBlob( - "{}_{}/fc_b_g".format(device_prefix, _device_pid(0, process_id))) - w_g_ = workspace.FetchBlob( - "{}_{}/fc_w_g".format(device_prefix, _device_pid(0, process_id))) - results['b_g_'] = b_g_ - results['w_g_'] = w_g_ - workspace.RunNetOnce(model._global_model_param_updates_net) - - # g_b = (b_0_ + b_1_) / 2 - b_g_ - # g_w = (w_0_ + w_1_) / 2 - w_g_ - v_b = workspace.FetchBlob( - "{}_{}/fc_b_v".format(device_prefix, _device_pid(0, process_id))) - v_w = workspace.FetchBlob( - "{}_{}/fc_w_v".format(device_prefix, _device_pid(0, process_id))) - w_g = workspace.FetchBlob( - "{}_{}/fc_w_g".format(device_prefix, _device_pid(0, process_id))) - b_g = workspace.FetchBlob( - "{}_{}/fc_b_g".format(device_prefix, _device_pid(0, process_id))) - w_0 = workspace.FetchBlob( - "{}_{}/fc_w".format(device_prefix, _device_pid(0, process_id))) - b_0 = workspace.FetchBlob( - "{}_{}/fc_b".format(device_prefix, _device_pid(0, process_id))) - w_1 = workspace.FetchBlob( - "{}_{}/fc_w".format(device_prefix, _device_pid(1, process_id))) - b_1 = workspace.FetchBlob( - "{}_{}/fc_b".format(device_prefix, _device_pid(1, process_id))) - results['v_b'] = v_b - results['v_w'] = v_w - results['w_g'] = w_g - results['b_g'] = b_g - results['w_0'] = w_0 - results['b_0'] = b_0 - results['w_1'] = w_1 - results['b_1'] = b_1 - - # Test add_blobs_to_sync - for j in devices: - sync = workspace.FetchBlob( - device_prefix + "_{}/sync_num".format(j))[0] - results['sync_{}'.format(j)] = sync - - shared_results[process_id] = results - - -class DistributedTest(unittest.TestCase): - - @given( - cpu_device=st.booleans(), - nesterov=st.booleans() - ) - @settings(deadline=10000) - def test_bmuf_distributed(self, cpu_device, nesterov): - if (not cpu_device) and workspace.has_hip_support: - log.info('Skipping the test on ROCm due to regression in ROCm3.5') - return - self._test_bmuf_distributed(cpu_device=cpu_device, nesterov=nesterov) - - def _test_bmuf_distributed(self, cpu_device=False, nesterov=False): - processes = [] - filestore_dir = tempfile.mkdtemp() - results = Manager().dict() - for idx in range(0, 2): - process = Process( - target=bmuf_process, - args=(filestore_dir, idx, results, cpu_device, nesterov) - ) - processes.append(process) - process.start() - - while len(processes) > 0: - process = processes.pop() - process.join() - shutil.rmtree(filestore_dir) - - if len(results) == 0: - return - - w_0 = results[0]['w_0'] - w_1 = 
results[0]['w_1'] - b_0 = results[0]['b_0'] - b_1 = results[0]['b_1'] - # Check parameters are in sync. - np.testing.assert_equal(w_0, w_1) - np.testing.assert_equal(w_0, results[1]['w_0']) - np.testing.assert_equal(w_0, results[1]['w_1']) - np.testing.assert_equal(b_0, b_1) - np.testing.assert_equal(b_0, results[1]['b_0']) - np.testing.assert_equal(b_0, results[1]['b_1']) - - w_g_ = results[0]['w_g_'] - b_g_ = results[0]['b_g_'] - - g_b = (results[0]['b_0_'] + results[1]['b_0_'] + results[0]['b_1_'] + - results[1]['b_1_']) / 4 - b_g_ - g_w = (results[0]['w_0_'] + results[1]['w_0_'] + results[0]['w_1_'] + - results[1]['w_1_']) / 4 - w_g_ - v_b_ = results[0]['v_b_'] - v_b = results[0]['v_b'] - v_w_ = results[0]['v_w_'] - v_w = results[0]['v_w'] - - for pid in results.keys(): - for k in results[pid].keys(): - if k.startswith("sync_num"): - self.assertEqual(2603, results[pid][k]) - - # Check block gradients are correct. - np.testing.assert_almost_equal(v_b, 0.75 * v_b_ + g_b) - np.testing.assert_almost_equal(v_w, 0.75 * v_w_ + g_w) - - # Check params update step - if nesterov: - np.testing.assert_equal(w_0, w_g_ + v_w - 0.75 * (v_w - v_w_)) - np.testing.assert_equal(b_0, b_g_ + v_b - 0.75 * (v_b - v_b_)) - else: - np.testing.assert_equal(w_0, w_g_ + v_w) - np.testing.assert_equal(b_0, b_g_ + v_b) diff --git a/caffe2/python/pipeline.py b/caffe2/python/pipeline.py deleted file mode 100644 index 195ac8285c83..000000000000 --- a/caffe2/python/pipeline.py +++ /dev/null @@ -1,451 +0,0 @@ -## @package pipeline -# Module caffe2.python.pipeline - - - - - -from caffe2.python import core, queue_util -from caffe2.python.dataio import Reader, Writer -from caffe2.python.net_builder import NetBuilder, ops -from caffe2.python.schema import as_record, Field -from caffe2.python.task import Node, Task, TaskGroup - - -class Output: - """ - Represents the result of a processor function. A processor can either - return an Output, or it can return a record, in which case an Output will be - created for it afterwards. - """ - def __init__(self, nets=None, record=None, should_stop=None): - builder_children = NetBuilder.current().get() - assert nets is None or len(builder_children) == 0, ( - 'Cannot both use `ops` syntax and return a list of nets.') - if nets is None: - nets = builder_children - if isinstance(nets, core.Net): - nets = [nets] - self.nets = [] if nets is None else list(nets) - self.record = None if record is None else as_record(record) - self.should_stop = should_stop - - -DEFAULT_QUEUE_CAPACITY = 100 - - -def _init_output(output, capacity, global_init_net, global_exit_net): - if output is None: - out_queue = queue_util.Queue( - capacity=( - capacity if capacity is not None - else DEFAULT_QUEUE_CAPACITY)) - writer = out_queue.writer() - elif isinstance(output, Writer): - assert capacity is None, 'capacity would not be used.' - out_queue = None - writer = output - elif hasattr(output, 'writer'): - assert capacity is None, 'capacity would not be used.' 
- out_queue = output - writer = output.writer() - else: - raise ValueError('output must be a reader, queue or stream.') - writer.setup_ex(global_init_net, global_exit_net) - return out_queue, writer - - -def make_processor(processor, reader=None): - if processor is None: - return lambda rec: rec - elif isinstance(processor, core.Net): - return NetProcessor(processor) - else: - if reader is not None and hasattr(processor, "schema_func"): - def processor_schema(): - return processor.schema_func(reader) - - processor.schema = processor_schema - return processor - - -def normalize_processor_output(output): - """ - Allow for processors to return results in several formats. - TODO(azzolini): simplify once all processors use NetBuilder API. - """ - if isinstance(output, Output): - """ Processor returned an Output. """ - return output - elif isinstance(output, Field): - """ Processor returned a record. """ - return Output(record=output) - elif isinstance(output, tuple): - is_record_and_blob = ( - len(output) == 2 and - isinstance(output[0], Field) and - isinstance(output[1], core.BlobReference)) - if is_record_and_blob: - """ Processor returned (record, stop_blob) """ - return Output(None, *output) - else: - """ Processor returned (nets, record, stop_blob) """ - return Output(*output) - else: - """ Processor returned nets, no output """ - return Output(output) - - -def pipe( - input, output=None, num_threads=1, processor=None, name=None, - capacity=None, group=None, num_runtime_threads=1): - """ - Given a Reader, Queue or DataStream in `input`, and optionally, a Writer, - Queue or DataStream in `output`, creates a Task that, when run, will - pipe the input into the output, using multiple parallel threads. - Additionally, if a processor is given, it will be called between reading - and writing steps, allowing it to transform the record. - - Args: - input: either a Reader, Queue or DataStream that will be read - until a stop is signaled either by the reader or the - writer. - output: either a Writer, a Queue or a DataStream that will be - written to as long as neither reader nor writer signal - a stop condition. If output is not provided or is None, - a Queue is created with given `capacity` and written to. - num_threads: number of concurrent threads used for processing and - piping. If set to 0, no Task is created, and a - reader is returned instead -- the reader returned will - read from the reader passed in and process it. - ** DEPRECATED **. Use `num_runtime_threads` instead. - This option will be removed once all readers/processors - support `num_runtime_threads`. - processor: (optional) function that takes an input record and - optionally returns a record; this will be called - between read and write steps. If the processor does - not return a record, a writer will not be instantiated. - Processor can also be a core.Net with input and output - records properly set. In that case, a NetProcessor is - instantiated, cloning the net for each of the threads. - name: (optional) name of the task to be created. - capacity: when output is not passed, a queue of given `capacity` - is created and written to. - group: (optional) explicitly add the created Task to this - TaskGroup, instead of using the currently active one. - num_runtime_threads: Similar to `num_threads`, but instead of expanding - the tasks with a `for` loop in python, does that at - runtime. This is preferable to `num_threads`, but some - processors/readers still require to be called multiple - times in python. 
- - Returns: - Output Queue, DataStream, Reader, or None, depending on the parameters - passed. - """ - result, _ = _pipe_step( - input, output, num_threads, processor, name, capacity, group, - num_runtime_threads) - return result - - -def pipe_and_output( - input, output=None, num_threads=1, processor=None, name=None, - capacity=None, group=None, num_runtime_threads=1, final_outputs=None): - """ - Similar to `pipe`, with the additional ability for the pipe Task to - return output values to the `Session` once done. - - Returns: - Tuple (out_queue, *task_outputs) - out_queue: same as return value of `pipe`. - task_outputs: TaskOutput object, fetchable from the client after - session.run() returns. - """ - assert num_threads > 0 - result, task = _pipe_step( - input, output, num_threads, processor, name, capacity, group, - num_runtime_threads, final_outputs) - output = None - if final_outputs is not None: - output = task.outputs() - if type(final_outputs) not in (list, tuple): - output = output[0] - return result, output - - -def processor_name(processor): - if hasattr(processor, 'name'): - return processor.name - if hasattr(processor, 'func_name'): - if processor.func_name == '': - return processor.__module__ - if hasattr(processor, 'im_class'): - return '%s.%s' % (processor.im_class.__name__, processor.func_name) - return processor.func_name - return processor.__class__.__name__ - - -def _runtime_threads_task(name, group, final_outputs, reader, num_threads, - output, capacity): - node_name = str(Node.current()) - profiler_name = "{0}/{1}/{2}/{3}/{4}".format( - node_name, - "pipe", - name, - processor_name(input) if input else "NoInput", - processor_name(output) if output else "NoOutput") - - with Task(name=name, group=group, outputs=final_outputs, - num_instances=num_threads) as task: - global_exit_net = core.Net('pipe:exit') - global_init_net = core.Net('pipe:init') - reader.setup_ex(global_init_net, global_exit_net) - - init_net = core.Net('pipe:instance:init') - exit_net = core.Net('pipe:instance:exit') - read_nets, status, rec = reader.read_record_ex(init_net, exit_net) - init_net.ConstantFill( - [], [status], - shape=[], - value=False, - dtype=core.DataType.BOOL - ) - - if rec is not None: - out_queue, writer = _init_output( - output, capacity, global_init_net, global_exit_net) - write_nets, _ = writer.write_record_ex( - rec, init_net, exit_net, status) - else: - out_queue = None - write_nets = [] - - with ops.task_init(): - ops.net(global_init_net) - with ops.task_instance_init(): - ops.net(init_net) - - timer_start_net = core.Net('timer_start') - timer = timer_start_net.TimerBegin([], counter_name=profiler_name) - timer_end_net = core.Net('timer_end') - timer_end_net.TimerEnd(timer, []) - - ops.net(core.execution_step( - 'body', - [timer_start_net] + list(read_nets) + list(write_nets) + - [timer_end_net], - should_stop_blob=status)) - ops.net(timer_end_net) - - with ops.task_instance_exit(): - ops.net(exit_net) - with ops.task_exit(): - ops.net(global_exit_net) - - return out_queue, task - - -def _static_threads_task(name, group, final_outputs, reader, num_threads, - output, capacity): - node_name = str(Node.current()) - profiler_name = "{0}/{1}/{2}/{3}/{4}".format( - node_name, - "pipe", - name, - processor_name(input) if input else "NoInput", - processor_name(output) if output else "NoOutput") - - with Task(name=name, group=group, outputs=final_outputs) as task: - global_exit_net = core.Net('exit') - global_init_net = core.Net('init') - reader.setup_ex(global_init_net, 
global_exit_net) - - out_queue = None - writer = None - - steps = [] - for thread_id in range(num_threads): - with NetBuilder(name='t:%d' % thread_id) as nb: - init_net = core.Net('init') - exit_net = core.Net('exit') - read_nets, status, rec = reader.read_record_ex( - init_net, exit_net) - init_net.ConstantFill( - [], [status], - shape=[], - value=False, - dtype=core.DataType.BOOL - ) - - if rec is not None: - if writer is None: - # hack so that the out queue gets the right name prefix - # (otherwise they would be prefixed with the thread id) - with NetBuilder(_fullname=task.name): - out_queue, writer = _init_output( - output, capacity, global_init_net, - global_exit_net) - write_nets, _ = writer.write_record_ex( - rec, init_net, exit_net, status) - else: - write_nets = [] - - timer_start_net = core.Net('timer_start') - timer = timer_start_net.TimerBegin([], counter_name=profiler_name) - timer_end_net = core.Net('timer_end') - timer_end_net.TimerEnd(timer, []) - - ops.net(init_net) - ops.net(core.execution_step( - 'body', - [timer_start_net] + list(read_nets) + list(write_nets) + - [timer_end_net], - should_stop_blob=status)) - ops.net(timer_end_net) - ops.net(exit_net) - steps.append(core.to_execution_step(nb)) - ops.net(global_init_net) - ops.net(core.execution_step('body', steps, concurrent_substeps=True)) - ops.net(global_exit_net) - return out_queue, task - - -def _pipe_step( - input, output=None, num_threads=1, processor=None, name=None, - capacity=None, group=None, num_runtime_threads=None, final_outputs=None): - """ - """ - assert num_threads <= 1 or num_runtime_threads <= 1, ( - 'Only one of num_threads or num_runtime_threads must be set.') - - if isinstance(input, Reader): - reader = input - elif hasattr(input, 'reader'): - reader = input.reader() - else: - raise ValueError( - 'Input must be a reader, queue or stream. Got {}'.format(type(input))) - - if processor is not None: - reader = ProcessingReader(reader, processor) - - if num_threads == 0 or num_runtime_threads == 0: - assert output is None - return reader, None - - if name is None and processor is not None: - name = processor_name(processor) - if name is None and output is not None: - name = 'pipe_into:%s' % processor_name(output) - if name is None: - name = 'pipe_from:%s' % processor_name(input) - - if num_threads > 1: - return _static_threads_task( - name, group, final_outputs, reader, num_threads, output, capacity) - else: - return _runtime_threads_task( - name, group, final_outputs, reader, num_runtime_threads, output, - capacity) - - -class ProcessingReader(Reader): - """ - Reader that reads from an upstream reader, calls the processor, and returns - the processed record. - """ - def __init__(self, reader, processor): - Reader.__init__(self) - self.reader = reader - self.processor = make_processor(processor, reader) - - def schema(self): - return self.processor.schema() - - def setup_ex(self, init_net, finish_net): - self.reader.setup_ex(init_net, finish_net) - - def read_ex(self, init_net, exit_net): - read_nets, status, rec = self.reader.read_record_ex(init_net, exit_net) - # We don't use status as stop_blob of NetBuilder it's not guarantee that - # it would end up being the true stob_blob. For example, - # ReaderWithLimitBase doesn't pass the status through but rather copy - # from it. - with NetBuilder() as nb: - # Current NetBuilder is optionally used inside the processor, - # then its children are retrieved inside of - # normalize_processor_output. 
- # Once readers and writers also use NetBuilder, - # this logic will be more natural. - result = normalize_processor_output(self.processor(rec)) - read_nets += result.nets - if result.should_stop or nb._stop_blob: - stop_net = core.Net('stop_net') - if result.should_stop: - stop_net.Or([status, result.should_stop], [status]) - if nb._stop_blob: - stop_net.Or([status, nb._stop_blob], [status]) - read_nets.append(stop_net) - if hasattr(self.processor, 'setup'): - init_net.add_attribute(TaskGroup.LOCAL_SETUP, self.processor) - self._set_schema(result.record) - fields = result.record.field_blobs() if result.record else None - return read_nets, status, fields - - -class NetProcessor: - """ - Processor that clones a core.Net each time it's called, executing - the cloned net as the processor. It requires the Net to have input - and (optionally) output records set, with net.set_input_record() and - net.set_output_record(). - """ - def __init__(self, net, stop_signal=None, thread_init_nets=None, name=None): - assert isinstance(net, core.Net) - assert stop_signal is None or isinstance( - stop_signal, core.BlobReference) - self.name = name or str(net) - self.thread_init_nets = thread_init_nets or [] - self.net = net - self._stop_signal = stop_signal - self._blob_maps = [] - self._frozen = False - self._cloned_init_nets = [] - - def schema(self): - return self.net.output_record() - - def setup(self, init_net): - self._frozen = True - cloned_init_nets = self._cloned_init_nets - self._cloned_init_nets = [] - return cloned_init_nets - - def __call__(self, rec): - assert not self._frozen - prefix = NetBuilder.current().name + '/' - blob_remap = {} - for net in self.thread_init_nets: - new_net, _ = core.clone_and_bind_net( - net, str(net) + prefix, prefix, blob_remap) - self._cloned_init_nets.append(new_net) - - new_net, remappings = core.clone_and_bind_net( - self.net, str(self.net) + prefix, prefix, blob_remap, rec) - - if self._stop_signal is None: - stop_signal = None - elif str(self._stop_signal) in remappings: - stop_signal = core.BlobReference( - remappings[str(self._stop_signal)], - net=new_net) - else: - stop_signal = self._stop_signal - - self._blob_maps.append(remappings) - return Output([new_net], new_net.output_record(), stop_signal) - - def blob_maps(self): - self._frozen = True - return self._blob_maps diff --git a/caffe2/python/pipeline_test.py b/caffe2/python/pipeline_test.py deleted file mode 100644 index 0764aec4ef96..000000000000 --- a/caffe2/python/pipeline_test.py +++ /dev/null @@ -1,77 +0,0 @@ - - - - - -from caffe2.python.schema import ( - Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) -from caffe2.python import core, workspace -from caffe2.python.session import LocalSession -from caffe2.python.dataset import Dataset -from caffe2.python.pipeline import pipe -from caffe2.python.queue_util import Queue -from caffe2.python.task import TaskGroup -from caffe2.python.test_util import TestCase -from caffe2.python.net_builder import ops -import numpy as np -import math - - -class TestPipeline(TestCase): - def test_dequeue_many(self): - init_net = core.Net('init') - N = 17 - NUM_DEQUEUE_RECORDS = 3 - src_values = Struct( - ('uid', np.array(range(N))), - ('value', 0.1 * np.array(range(N)))) - expected_dst = Struct( - ('uid', 2 * np.array(range(N))), - ('value', np.array(N * [0.0]))) - - with core.NameScope('init'): - src_blobs = NewRecord(init_net, src_values) - dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema()) - counter = init_net.Const(0) - ONE = 
init_net.Const(1) - - def proc1(rec): - with core.NameScope('proc1'): - out = NewRecord(ops, rec) - ops.Add([rec.uid(), rec.uid()], [out.uid()]) - out.value.set(blob=rec.value(), unsafe=True) - return out - - def proc2(rec): - with core.NameScope('proc2'): - out = NewRecord(ops, rec) - out.uid.set(blob=rec.uid(), unsafe=True) - ops.Sub([rec.value(), rec.value()], [out.value()]) - ops.Add([counter, ONE], [counter]) - return out - - src_ds = Dataset(src_blobs) - dst_ds = Dataset(dst_blobs) - - with TaskGroup() as tg: - out1 = pipe( - src_ds.reader(), - output=Queue( - capacity=11, num_dequeue_records=NUM_DEQUEUE_RECORDS), - processor=proc1) - out2 = pipe(out1, processor=proc2) - pipe(out2, dst_ds.writer()) - - ws = workspace.C.Workspace() - FeedRecord(src_blobs, src_values, ws) - session = LocalSession(ws) - session.run(init_net) - session.run(tg) - output = FetchRecord(dst_blobs, ws=ws) - num_dequeues = ws.blobs[str(counter)].fetch() - - self.assertEqual( - num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS))) - - for a, b in zip(output.field_blobs(), expected_dst.field_blobs()): - np.testing.assert_array_equal(a, b) diff --git a/caffe2/python/predictor/__init__.py b/caffe2/python/predictor/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/predictor/mobile_exporter.py b/caffe2/python/predictor/mobile_exporter.py deleted file mode 100644 index e0fa90bffb6e..000000000000 --- a/caffe2/python/predictor/mobile_exporter.py +++ /dev/null @@ -1,106 +0,0 @@ -## @package mobile_exporter -# Module caffe2.python.mobile_exporter - - - - - -from caffe2.python import core, utils -from caffe2.proto import caffe2_pb2 -import numpy as np - - -def add_tensor(net, name, blob): - ''' Create an operator to store the tensor 'blob', - run the operator to put the blob to workspace. - uint8 is stored as an array of string with one element. - ''' - kTypeNameMapper = { - np.dtype('float32'): "GivenTensorFill", - np.dtype('int32'): "GivenTensorIntFill", - np.dtype('int64'): "GivenTensorInt64Fill", - np.dtype('uint8'): "GivenTensorByteStringToUInt8Fill", - np.dtype('O'): "GivenTensorStringFill" - } - - shape = blob.shape - values = blob - # pass array of uint8 as a string to save storage - # storing uint8_t has a large overhead for now - if blob.dtype == np.dtype('uint8'): - shape = blob.shape - values = [blob.tobytes()] - # Only allow string arrays as objects. - # The only intended use case for this is to store arrays of strings in the - # model which can be used for post processing results in subsequent ops. - if blob.dtype == np.dtype('O'): - for blob_val in blob: - assert(isinstance(blob_val, bytes)) - - op = core.CreateOperator( - kTypeNameMapper[blob.dtype], - [], [name], - arg=[ - utils.MakeArgument("shape", shape), - utils.MakeArgument("values", values), - ] - ) - net.op.extend([op]) - - -def Export(workspace, net, params): - """Returns init_net and predict_net suitable for writing to disk - and loading into a Predictor""" - proto = net if isinstance(net, caffe2_pb2.NetDef) else net.Proto() - predict_net = caffe2_pb2.NetDef() - predict_net.CopyFrom(proto) - init_net = caffe2_pb2.NetDef() - # Populate the init_net. - ssa, blob_versions = core.get_ssa(net) - inputs = [] - for versioned_inputs, _ in ssa: - inputs += [name for name, _ in versioned_inputs] - - input_blobs = [blob_name for blob_name, version in - blob_versions.items() - if version == 0 and blob_name not in params] - # Blobs that are never used as an input to another layer, - # i.e. 
strictly output blobs. - output_blobs = [blob_name for blob_name, version in - blob_versions.items() - if version != 0 and blob_name not in inputs] - - for blob_ref in params: - blob_name = str(blob_ref) - blob = workspace.FetchBlob(blob_name) - add_tensor(init_net, blob_name, blob) - # We have to make sure the blob exists in the namespace - # and we can do so with fake data. (Which is immediately overwritten - # by any typical usage) - for blob_name in input_blobs: - init_net.op.extend( - [ - core.CreateOperator( - "GivenTensorFill", [], [blob_name], - arg=[ - utils.MakeArgument("shape", [1, 1]), - utils.MakeArgument("values", [0.0]) - ] - ) - ] - ) - - # Now we make input/output_blobs line up with what Predictor expects. - del predict_net.external_input[:] - - new_external_inputs = input_blobs - for external_input in proto.external_input: - if external_input not in new_external_inputs: - new_external_inputs.append(external_input) - - # For populating weights - predict_net.external_input.extend(new_external_inputs) - # Ensure the output is also consistent with what we want - del predict_net.external_output[:] - predict_net.external_output.extend(output_blobs) - return init_net, predict_net diff --git a/caffe2/python/predictor/mobile_exporter_test.py b/caffe2/python/predictor/mobile_exporter_test.py deleted file mode 100644 index 0269ec229888..000000000000 --- a/caffe2/python/predictor/mobile_exporter_test.py +++ /dev/null @@ -1,132 +0,0 @@ - - - - -from caffe2.python.test_util import TestCase -from caffe2.python import workspace, brew -from caffe2.python.model_helper import ModelHelper -from caffe2.python.predictor import mobile_exporter -import numpy as np - - -class TestMobileExporter(TestCase): - def test_mobile_exporter(self): - model = ModelHelper(name="mobile_exporter_test_model") - # Test LeNet - brew.conv(model, 'data', 'conv1', dim_in=1, dim_out=20, kernel=5) - brew.max_pool(model, 'conv1', 'pool1', kernel=2, stride=2) - brew.conv(model, 'pool1', 'conv2', dim_in=20, dim_out=50, kernel=5) - brew.max_pool(model, 'conv2', 'pool2', kernel=2, stride=2) - brew.fc(model, 'pool2', 'fc3', dim_in=50 * 4 * 4, dim_out=500) - brew.relu(model, 'fc3', 'fc3') - brew.fc(model, 'fc3', 'pred', 500, 10) - brew.softmax(model, 'pred', 'out') - - # Create our mobile exportable networks - workspace.RunNetOnce(model.param_init_net) - init_net, predict_net = mobile_exporter.Export( - workspace, model.net, model.params - ) - - # Populate the workspace with data - np_data = np.random.rand(1, 1, 28, 28).astype(np.float32) - workspace.FeedBlob("data", np_data) - - workspace.CreateNet(model.net) - workspace.RunNet(model.net) - ref_out = workspace.FetchBlob("out") - - # Clear the workspace - workspace.ResetWorkspace() - - # Populate the workspace with data - workspace.RunNetOnce(init_net) - # Fake "data" is populated by init_net, we have to replace it - workspace.FeedBlob("data", np_data) - - # Overwrite the old net - workspace.CreateNet(predict_net, True) - workspace.RunNet(predict_net.name) - manual_run_out = workspace.FetchBlob("out") - np.testing.assert_allclose( - ref_out, manual_run_out, atol=1e-10, rtol=1e-10 - ) - - # Clear the workspace - workspace.ResetWorkspace() - - # Predictor interface test (simulates writing to disk) - predictor = workspace.Predictor( - init_net.SerializeToString(), predict_net.SerializeToString() - ) - - # Output is a vector of outputs but we only care about the first and only result - predictor_out = predictor.run([np_data]) - assert len(predictor_out) == 1 - predictor_out = 
predictor_out[0] - - np.testing.assert_allclose( - ref_out, predictor_out, atol=1e-10, rtol=1e-10 - ) - - def test_mobile_exporter_datatypes(self): - model = ModelHelper(name="mobile_exporter_test_model") - model.Copy("data_int", "out") - model.params.append("data_int") - model.Copy("data_obj", "out_obj") - model.params.append("data_obj") - - # Create our mobile exportable networks - workspace.RunNetOnce(model.param_init_net) - np_data_int = np.random.randint(100, size=(1, 1, 28, 28), dtype=np.int32) - workspace.FeedBlob("data_int", np_data_int) - np_data_obj = np.array(['aa', 'bb']).astype(np.dtype('O')) - workspace.FeedBlob("data_obj", np_data_obj) - - init_net, predict_net = mobile_exporter.Export( - workspace, model.net, model.params - ) - - workspace.CreateNet(model.net) - workspace.RunNet(model.net) - ref_out = workspace.FetchBlob("out") - ref_out_obj = workspace.FetchBlob("out_obj") - - # Clear the workspace - workspace.ResetWorkspace() - - # Populate the workspace with data - workspace.RunNetOnce(init_net) - - # Overwrite the old net - workspace.CreateNet(predict_net, True) - workspace.RunNet(predict_net.name) - manual_run_out = workspace.FetchBlob("out") - manual_run_out_obj = workspace.FetchBlob("out_obj") - np.testing.assert_allclose( - ref_out, manual_run_out, atol=1e-10, rtol=1e-10 - ) - np.testing.assert_equal(ref_out_obj, manual_run_out_obj) - - # Clear the workspace - workspace.ResetWorkspace() - - # Predictor interface test (simulates writing to disk) - predictor = workspace.Predictor( - init_net.SerializeToString(), predict_net.SerializeToString() - ) - - # Output is a vector of outputs. - predictor_out = predictor.run([]) - assert len(predictor_out) == 2 - predictor_out_int = predictor_out[1] - predictor_out_obj = predictor_out[0] - # The order in predictor_out is non-deterministic. Use type of the entry - # to figure out what to compare it to. 
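Keying unordered outputs by element type rather than by position is the trick used here; a standalone numpy sketch of the same disambiguation (the toy arrays are made up):

import numpy as np

outputs = [np.array([b"aa", b"bb"], dtype=object),                # string-ish blob
           np.random.randint(100, size=(2, 2)).astype(np.int32)]  # numeric blob

# classify each output by the type of its first element, not its index
by_kind = {}
for out in outputs:
    kind = "obj" if isinstance(out.flat[0], bytes) else "int"
    by_kind[kind] = out

assert set(by_kind) == {"obj", "int"}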
- if isinstance(predictor_out[1][0], bytes): - predictor_out_int = predictor_out[0] - predictor_out_obj = predictor_out[1] - np.testing.assert_allclose( - ref_out, predictor_out_int, atol=1e-10, rtol=1e-10 - ) - np.testing.assert_equal(ref_out_obj, predictor_out_obj) diff --git a/caffe2/python/predictor/predictor_exporter.py b/caffe2/python/predictor/predictor_exporter.py deleted file mode 100644 index 0790aa267281..000000000000 --- a/caffe2/python/predictor/predictor_exporter.py +++ /dev/null @@ -1,264 +0,0 @@ -## @package predictor_exporter -# Module caffe2.python.predictor.predictor_exporter - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.proto import metanet_pb2 -from caffe2.python import workspace, core, scope -from caffe2.python.predictor_constants import predictor_constants -import caffe2.python.predictor.serde as serde -import caffe2.python.predictor.predictor_py_utils as utils -import collections - - -def get_predictor_exporter_helper(submodelNetName): - """ constracting stub for the PredictorExportMeta - Only used to construct names to subfields, - such as calling to predict_net_name - Args: - submodelNetName - name of the model - """ - stub_net = core.Net(submodelNetName) - pred_meta = PredictorExportMeta(predict_net=stub_net, - parameters=[], - inputs=[], - outputs=[], - shapes=None, - name=submodelNetName, - extra_init_net=None) - return pred_meta - - -# pyre-fixme[13]: Pyre can't detect the attribute initialization via cls.super() here -class PredictorExportMeta(collections.namedtuple( - 'PredictorExportMeta', - 'predict_net, parameters, inputs, outputs, shapes, name, ' - 'extra_init_net, global_init_net, net_type, num_workers, trainer_prefix')): - """ - Metadata to be used for serializaing a net. - - parameters, inputs, outputs could be either BlobReference or blob's names - - predict_net can be either core.Net, NetDef, PlanDef or object - - Override the named tuple to provide optional name parameter. - name will be used to identify multiple prediction nets. - - net_type is the type field in caffe2 NetDef - can be 'simple', 'dag', etc. - - num_workers specifies for net type 'dag' how many threads should run ops - - trainer_prefix specifies the type of trainer. - - extra_init_net gets appended to pred_init_net, useful for thread local init - - global_init_net gets appended to global_init_net, useful for global init - on a shared across threads parameter workspace - (in a case of multi-threaded inference) - - """ - def __new__( - cls, - predict_net, - parameters, - inputs, - outputs, - shapes=None, - name="", - extra_init_net=None, - global_init_net=None, - net_type=None, - num_workers=None, - trainer_prefix=None, - ): - inputs = [str(i) for i in inputs] - outputs = [str(o) for o in outputs] - assert len(set(inputs)) == len(inputs), ( - "All inputs to the predictor should be unique") - parameters = [str(p) for p in parameters] - assert set(parameters).isdisjoint(inputs), ( - "Parameters and inputs are required to be disjoint. " - "Intersection: {}".format(set(parameters).intersection(inputs))) - assert set(parameters).isdisjoint(outputs), ( - "Parameters and outputs are required to be disjoint. 
" - "Intersection: {}".format(set(parameters).intersection(outputs))) - shapes = shapes or {} - - if predict_net is not None: - if isinstance(predict_net, (core.Net, core.Plan)): - predict_net = predict_net.Proto() - - assert isinstance(predict_net, (caffe2_pb2.NetDef, caffe2_pb2.PlanDef)) - return super(PredictorExportMeta, cls).__new__( - cls, predict_net, parameters, inputs, outputs, shapes, name, - extra_init_net, global_init_net, net_type, num_workers, trainer_prefix) - - def inputs_name(self): - return utils.get_comp_name(predictor_constants.INPUTS_BLOB_TYPE, - self.name) - - def outputs_name(self): - return utils.get_comp_name(predictor_constants.OUTPUTS_BLOB_TYPE, - self.name) - - def parameters_name(self): - return utils.get_comp_name(predictor_constants.PARAMETERS_BLOB_TYPE, - self.name) - - def global_init_name(self): - return utils.get_comp_name(predictor_constants.GLOBAL_INIT_NET_TYPE, - self.name) - - def predict_init_name(self): - return utils.get_comp_name(predictor_constants.PREDICT_INIT_NET_TYPE, - self.name) - - def predict_net_name(self): - return utils.get_comp_name(predictor_constants.PREDICT_NET_TYPE, - self.name) - - def train_init_plan_name(self): - plan_name = utils.get_comp_name(predictor_constants.TRAIN_INIT_PLAN_TYPE, - self.name) - return self.trainer_prefix + '_' + plan_name \ - if self.trainer_prefix else plan_name - - def train_plan_name(self): - plan_name = utils.get_comp_name(predictor_constants.TRAIN_PLAN_TYPE, - self.name) - return self.trainer_prefix + '_' + plan_name \ - if self.trainer_prefix else plan_name - - -def prepare_prediction_net(filename, db_type, device_option=None): - ''' - Helper function which loads all required blobs from the db - and returns prediction net ready to be used - ''' - metanet_def = load_from_db(filename, db_type, device_option) - - global_init_net = utils.GetNet( - metanet_def, predictor_constants.GLOBAL_INIT_NET_TYPE) - workspace.RunNetOnce(global_init_net) - - predict_init_net = utils.GetNet( - metanet_def, predictor_constants.PREDICT_INIT_NET_TYPE) - workspace.RunNetOnce(predict_init_net) - - predict_net = core.Net( - utils.GetNet(metanet_def, predictor_constants.PREDICT_NET_TYPE)) - workspace.CreateNet(predict_net) - - return predict_net - - -def _global_init_net(predictor_export_meta, db_type): - net = core.Net("global-init") - # manifold_db does not need DBReader - if db_type != "manifold_db": - net.Load( - [predictor_constants.PREDICTOR_DBREADER], - predictor_export_meta.parameters) - net.Proto().external_input.extend([predictor_constants.PREDICTOR_DBREADER]) - net.Proto().external_output.extend(predictor_export_meta.parameters) - - if predictor_export_meta.global_init_net: - net.AppendNet(predictor_export_meta.global_init_net) - - # Add the model_id in the predict_net to the global_init_net - utils.AddModelIdArg(predictor_export_meta, net.Proto()) - return net.Proto() - - -def get_meta_net_def(predictor_export_meta, ws=None, db_type=None): - """ - """ - - ws = ws or workspace.C.Workspace.current - meta_net_def = metanet_pb2.MetaNetDef() - - # Predict net is the core network that we use. 
- utils.AddNet(meta_net_def, predictor_export_meta.predict_init_name(), - utils.create_predict_init_net(ws, predictor_export_meta)) - utils.AddNet(meta_net_def, predictor_export_meta.global_init_name(), - _global_init_net(predictor_export_meta, db_type)) - utils.AddNet(meta_net_def, predictor_export_meta.predict_net_name(), - utils.create_predict_net(predictor_export_meta)) - utils.AddBlobs(meta_net_def, predictor_export_meta.parameters_name(), - predictor_export_meta.parameters) - utils.AddBlobs(meta_net_def, predictor_export_meta.inputs_name(), - predictor_export_meta.inputs) - utils.AddBlobs(meta_net_def, predictor_export_meta.outputs_name(), - predictor_export_meta.outputs) - return meta_net_def - - -def set_model_info(meta_net_def, project_str, model_class_str, version): - assert isinstance(meta_net_def, metanet_pb2.MetaNetDef) - meta_net_def.modelInfo.project = project_str - meta_net_def.modelInfo.modelClass = model_class_str - meta_net_def.modelInfo.version = version - - -def save_to_db(db_type, db_destination, predictor_export_meta, use_ideep=False, - *args, **kwargs): - meta_net_def = get_meta_net_def(predictor_export_meta, db_type=db_type) - device_type = caffe2_pb2.IDEEP if use_ideep else caffe2_pb2.CPU - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU)): - workspace.FeedBlob( - predictor_constants.META_NET_DEF, - serde.serialize_protobuf_struct(meta_net_def) - ) - - blobs_to_save = [predictor_constants.META_NET_DEF] + \ - predictor_export_meta.parameters - - op = core.CreateOperator( - "Save", - blobs_to_save, [], - device_option = core.DeviceOption(device_type), - absolute_path=True, - db=db_destination, db_type=db_type, - **kwargs - ) - - workspace.RunOperatorOnce(op) - - -def load_from_db(filename, db_type, device_option=None, *args, **kwargs): - # global_init_net in meta_net_def will load parameters from - # predictor_constants.PREDICTOR_DBREADER - create_db = core.CreateOperator( - 'CreateDB', [], - [core.BlobReference(predictor_constants.PREDICTOR_DBREADER)], - db=filename, db_type=db_type) - assert workspace.RunOperatorOnce(create_db), ( - 'Failed to create db {}'.format(filename)) - - # predictor_constants.META_NET_DEF is always stored before the parameters - load_meta_net_def = core.CreateOperator( - 'Load', - [core.BlobReference(predictor_constants.PREDICTOR_DBREADER)], - [core.BlobReference(predictor_constants.META_NET_DEF)]) - assert workspace.RunOperatorOnce(load_meta_net_def) - - blob = workspace.FetchBlob(predictor_constants.META_NET_DEF) - meta_net_def = serde.deserialize_protobuf_struct( - blob if isinstance(blob, bytes) - else str(blob).encode('utf-8'), - metanet_pb2.MetaNetDef) - - if device_option is None: - device_option = scope.CurrentDeviceScope() - - if device_option is not None: - # Set the device options of all loaded blobs - for kv in meta_net_def.nets: - net = kv.value - for op in net.op: - op.device_option.CopyFrom(device_option) - - return meta_net_def diff --git a/caffe2/python/predictor/predictor_exporter_test.py b/caffe2/python/predictor/predictor_exporter_test.py deleted file mode 100644 index dff1ad6662b6..000000000000 --- a/caffe2/python/predictor/predictor_exporter_test.py +++ /dev/null @@ -1,240 +0,0 @@ - - - - - -import tempfile -import unittest -import numpy as np -from caffe2.python import cnn, workspace, core - -from caffe2.python.predictor_constants import predictor_constants as pc -import caffe2.python.predictor.predictor_exporter as pe -import caffe2.python.predictor.predictor_py_utils as pred_utils -from caffe2.proto import 
caffe2_pb2, metanet_pb2 - - -class MetaNetDefTest(unittest.TestCase): - def test_minimal(self): - ''' - Tests that a NetsMap message can be created with a NetDef message - ''' - # This calls the constructor for a metanet_pb2.NetsMap - metanet_pb2.NetsMap(key="test_key", value=caffe2_pb2.NetDef()) - - def test_adding_net(self): - ''' - Tests that NetDefs can be added to MetaNetDefs - ''' - meta_net_def = metanet_pb2.MetaNetDef() - net_def = caffe2_pb2.NetDef() - meta_net_def.nets.add(key="test_key", value=net_def) - - def test_replace_blobs(self): - ''' - Tests that NetDefs can be added to MetaNetDefs - ''' - meta_net_def = metanet_pb2.MetaNetDef() - blob_name = "Test" - blob_def = ["AA"] - blob_def2 = ["BB"] - replaced_blob_def = ["CC"] - pred_utils.AddBlobs(meta_net_def, blob_name, blob_def) - self.assertEqual(blob_def, pred_utils.GetBlobs(meta_net_def, blob_name)) - pred_utils.AddBlobs(meta_net_def, blob_name, blob_def2) - self.assertEqual(blob_def + blob_def2, pred_utils.GetBlobs(meta_net_def, blob_name)) - - pred_utils.ReplaceBlobs(meta_net_def, blob_name, replaced_blob_def) - self.assertEqual(replaced_blob_def, pred_utils.GetBlobs(meta_net_def, blob_name)) - - -class PredictorExporterTest(unittest.TestCase): - def _create_model(self): - m = cnn.CNNModelHelper() - m.FC("data", "y", - dim_in=5, dim_out=10, - weight_init=m.XavierInit, - bias_init=m.XavierInit) - return m - - def setUp(self): - np.random.seed(1) - m = self._create_model() - - self.predictor_export_meta = pe.PredictorExportMeta( - predict_net=m.net.Proto(), - parameters=[str(b) for b in m.params], - inputs=["data"], - outputs=["y"], - shapes={"y": (1, 10), "data": (1, 5)}, - ) - workspace.RunNetOnce(m.param_init_net) - - self.params = { - param: workspace.FetchBlob(param) - for param in self.predictor_export_meta.parameters} - # Reset the workspace, to ensure net creation proceeds as expected. 
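Snapshotting the initialized parameters and then wiping state is what lets the later assertions prove that everything needed really round-trips through the DB; the same pattern with a plain dict in place of the workspace (blob names are illustrative):

workspace = {"y_w": [[0.1] * 5 for _ in range(10)],
             "y_b": [0.0] * 10,
             "data": [0.0] * 5}
params = {name: workspace[name] for name in ("y_w", "y_b")}  # snapshot parameters

workspace.clear()         # reset: nothing survives except the snapshot we kept
workspace.update(params)  # restore only the parameters for the test body

assert "data" not in workspace and set(params) <= set(workspace)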
- workspace.ResetWorkspace() - - def test_meta_constructor(self): - ''' - Test that passing net itself instead of proto works - ''' - m = self._create_model() - pe.PredictorExportMeta( - predict_net=m.net, - parameters=m.params, - inputs=["data"], - outputs=["y"], - shapes={"y": (1, 10), "data": (1, 5)}, - ) - - def test_param_intersection(self): - ''' - Test that passes intersecting parameters and input/output blobs - ''' - m = self._create_model() - with self.assertRaises(Exception): - pe.PredictorExportMeta( - predict_net=m.net, - parameters=m.params, - inputs=["data"] + m.params, - outputs=["y"], - shapes={"y": (1, 10), "data": (1, 5)}, - ) - with self.assertRaises(Exception): - pe.PredictorExportMeta( - predict_net=m.net, - parameters=m.params, - inputs=["data"], - outputs=["y"] + m.params, - shapes={"y": (1, 10), "data": (1, 5)}, - ) - - def test_meta_net_def_net_runs(self): - for param, value in self.params.items(): - workspace.FeedBlob(param, value) - - extra_init_net = core.Net('extra_init') - extra_init_net.ConstantFill('data', 'data', value=1.0) - - global_init_net = core.Net('global_init') - global_init_net.ConstantFill( - [], - 'global_init_blob', - value=1.0, - shape=[1, 5], - dtype=core.DataType.FLOAT - ) - pem = pe.PredictorExportMeta( - predict_net=self.predictor_export_meta.predict_net, - parameters=self.predictor_export_meta.parameters, - inputs=self.predictor_export_meta.inputs, - outputs=self.predictor_export_meta.outputs, - shapes=self.predictor_export_meta.shapes, - extra_init_net=extra_init_net, - global_init_net=global_init_net, - net_type='dag', - ) - - db_type = 'minidb' - db_file = tempfile.NamedTemporaryFile( - delete=False, suffix=".{}".format(db_type)) - pe.save_to_db( - db_type=db_type, - db_destination=db_file.name, - predictor_export_meta=pem) - - workspace.ResetWorkspace() - - meta_net_def = pe.load_from_db( - db_type=db_type, - filename=db_file.name, - ) - - self.assertTrue("data" not in workspace.Blobs()) - self.assertTrue("y" not in workspace.Blobs()) - - init_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_INIT_NET_TYPE) - - # 0-fills externalblobs blobs and runs extra_init_net - workspace.RunNetOnce(init_net) - - self.assertTrue("data" in workspace.Blobs()) - self.assertTrue("y" in workspace.Blobs()) - - print(workspace.FetchBlob("data")) - np.testing.assert_array_equal( - workspace.FetchBlob("data"), np.ones(shape=(1, 5))) - np.testing.assert_array_equal( - workspace.FetchBlob("y"), np.zeros(shape=(1, 10))) - - self.assertTrue("global_init_blob" not in workspace.Blobs()) - # Load parameters from DB - global_init_net = pred_utils.GetNet(meta_net_def, - pc.GLOBAL_INIT_NET_TYPE) - workspace.RunNetOnce(global_init_net) - - # make sure the extra global_init_net is running - self.assertTrue(workspace.HasBlob('global_init_blob')) - np.testing.assert_array_equal( - workspace.FetchBlob("global_init_blob"), np.ones(shape=(1, 5))) - - # Run the net with a reshaped input and verify we are - # producing good numbers (with our custom implementation) - workspace.FeedBlob("data", np.random.randn(2, 5).astype(np.float32)) - predict_net = pred_utils.GetNet(meta_net_def, pc.PREDICT_NET_TYPE) - self.assertEqual(predict_net.type, 'dag') - workspace.RunNetOnce(predict_net) - np.testing.assert_array_almost_equal( - workspace.FetchBlob("y"), - workspace.FetchBlob("data").dot(self.params["y_w"].T) + - self.params["y_b"]) - - def test_load_device_scope(self): - for param, value in self.params.items(): - workspace.FeedBlob(param, value) - - pem = pe.PredictorExportMeta( - 
predict_net=self.predictor_export_meta.predict_net, - parameters=self.predictor_export_meta.parameters, - inputs=self.predictor_export_meta.inputs, - outputs=self.predictor_export_meta.outputs, - shapes=self.predictor_export_meta.shapes, - net_type='dag', - ) - - db_type = 'minidb' - db_file = tempfile.NamedTemporaryFile( - delete=False, suffix=".{}".format(db_type)) - pe.save_to_db( - db_type=db_type, - db_destination=db_file.name, - predictor_export_meta=pem) - - workspace.ResetWorkspace() - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 1)): - meta_net_def = pe.load_from_db( - db_type=db_type, - filename=db_file.name, - ) - - init_net = core.Net(pred_utils.GetNet(meta_net_def, - pc.GLOBAL_INIT_NET_TYPE)) - predict_init_net = core.Net(pred_utils.GetNet( - meta_net_def, pc.PREDICT_INIT_NET_TYPE)) - - # check device options - for op in list(init_net.Proto().op) + list(predict_init_net.Proto().op): - self.assertEqual(1, op.device_option.device_id) - self.assertEqual(caffe2_pb2.CPU, op.device_option.device_type) - - def test_db_fails_without_params(self): - with self.assertRaises(Exception): - for db_type in ["minidb"]: - db_file = tempfile.NamedTemporaryFile( - delete=False, suffix=".{}".format(db_type)) - pe.save_to_db( - db_type=db_type, - db_destination=db_file.name, - predictor_export_meta=self.predictor_export_meta) diff --git a/caffe2/python/predictor/predictor_py_utils.py b/caffe2/python/predictor/predictor_py_utils.py deleted file mode 100644 index 0a23ab5c2512..000000000000 --- a/caffe2/python/predictor/predictor_py_utils.py +++ /dev/null @@ -1,218 +0,0 @@ -## @package predictor_py_utils -# Module caffe2.python.predictor.predictor_py_utils - - -from caffe2.python import core, scope - - -def create_predict_net(predictor_export_meta): - """ - Return the input prediction net. - """ - # Construct a new net to clear the existing settings. - net = core.Net(predictor_export_meta.predict_net.name or "predict") - net.Proto().op.extend(predictor_export_meta.predict_net.op) - net.Proto().partition_info.extend(predictor_export_meta.predict_net.partition_info) - net.Proto().external_input.extend( - predictor_export_meta.inputs + predictor_export_meta.parameters - ) - net.Proto().external_output.extend(predictor_export_meta.outputs) - net.Proto().arg.extend(predictor_export_meta.predict_net.arg) - if predictor_export_meta.net_type is not None: - net.Proto().type = predictor_export_meta.net_type - if predictor_export_meta.num_workers is not None: - net.Proto().num_workers = predictor_export_meta.num_workers - return net.Proto() - - -def create_predict_init_net(ws, predictor_export_meta): - """ - Return an initialization net that zero-fill all the input and - output blobs, using the shapes from the provided workspace. This is - necessary as there is no shape inference functionality in Caffe2. - """ - net = core.Net("predict-init") - - def zero_fill(blob): - shape = predictor_export_meta.shapes.get(blob) - if shape is None: - if blob not in ws.blobs: - raise Exception( - "{} not in workspace but needed for shape: {}".format( - blob, ws.blobs - ) - ) - - shape = ws.blobs[blob].fetch().shape - - # Explicitly null-out the scope so users (e.g. PredictorGPU) - # can control (at a Net-global level) the DeviceOption of - # these filling operators. 
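The null-scope trick — emit an op with no pinned device so a net-global default can claim it later — can be sketched with a context variable (empty_device_scope and make_op are hypothetical stand-ins, not the caffe2 scope API):

import contextlib
import contextvars

_device = contextvars.ContextVar("device", default=None)

@contextlib.contextmanager
def empty_device_scope():
    token = _device.set(None)  # null out whatever scope encloses us
    try:
        yield
    finally:
        _device.reset(token)

def make_op(name):
    # ops record the ambient device; None means "decided at net level"
    return {"op": name, "device_option": _device.get()}

_device.set("gpu:0")
with empty_device_scope():
    fill = make_op("ConstantFill")
assert fill["device_option"] is None  # filler op left unpinned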
- with scope.EmptyDeviceScope(): - net.ConstantFill([], blob, shape=shape, value=0.0) - - external_blobs = predictor_export_meta.inputs + predictor_export_meta.outputs - for blob in external_blobs: - zero_fill(blob) - - net.Proto().external_input.extend(external_blobs) - if predictor_export_meta.extra_init_net: - net.AppendNet(predictor_export_meta.extra_init_net) - - # Add the model_id in the predict_net to the init_net - AddModelIdArg(predictor_export_meta, net.Proto()) - - return net.Proto() - - -def get_comp_name(string, name): - if name: - return string + "_" + name - return string - - -def to_first_match_dict(kv_list): - """ - Construct dict from kv_list - """ - d = {} - for item in kv_list: - if item.key not in d: - d[item.key] = item.value - return d - - -def _ProtoMapGet(field, key): - """ - Given the key, get the value of the repeated field. - Helper function used by protobuf since it doesn't have map construct - """ - for v in field: - if v.key == key: - return v.value - return None - - -def GetPlan(meta_net_def, key): - return _ProtoMapGet(meta_net_def.plans, key) - - -def GetPlanOriginal(meta_net_def, key): - return _ProtoMapGet(meta_net_def.plans, key) - - -def GetBlobs(meta_net_def, key): - blobs = _ProtoMapGet(meta_net_def.blobs, key) - if blobs is None: - return [] - return blobs - - -def GetBlobsByTypePrefix(meta_net_def, blob_type_prefix): - blob_map = {} - for b in meta_net_def.blobs: - if b.key.startswith(blob_type_prefix): - for blob in b.value: - if blob not in blob_map: - blob_map[blob] = len(blob_map) - return sorted(blob_map, key=lambda blob: blob_map[blob]) - - -def GetNet(meta_net_def, key): - return _ProtoMapGet(meta_net_def.nets, key) - - -def GetNetOriginal(meta_net_def, key): - return _ProtoMapGet(meta_net_def.nets, key) - - -def GetApplicationSpecificInfo(meta_net_def, key): - return _ProtoMapGet(meta_net_def.applicationSpecificInfo, key) - - -def GetApplicationSpecificInfoDict(meta_net_def): - return to_first_match_dict(meta_net_def.applicationSpecificInfo) - - -def AddBlobs(meta_net_def, blob_name, blob_def): - blobs = _ProtoMapGet(meta_net_def.blobs, blob_name) - if blobs is None: - blobs = meta_net_def.blobs.add() - blobs.key = blob_name - blobs = blobs.value - for blob in blob_def: - blobs.append(blob) - - -def ReplaceBlobs(meta_net_def, blob_name, blob_def): - blobs = _ProtoMapGet(meta_net_def.blobs, blob_name) - assert blobs is not None, "The blob_name:{} does not exist".format(blob_name) - del blobs[:] - for blob in blob_def: - blobs.append(blob) - - -def AddPlan(meta_net_def, plan_name, plan_def): - meta_net_def.plans.add(key=plan_name, value=plan_def) - - -def AddNet(meta_net_def, net_name, net_def): - meta_net_def.nets.add(key=net_name, value=net_def) - - -def SetBlobsOrder(meta_net_def, blobs_order): - for blob in blobs_order: - meta_net_def.blobsOrder.append(blob) - - -def SetPreLoadBlobs(meta_net_def, pre_load_blobs): - for blob in pre_load_blobs: - meta_net_def.preLoadBlobs.append(blob) - - -def SetRequestOnlyEmbeddings(meta_net_def, request_only_embeddings): - for blob in request_only_embeddings: - meta_net_def.requestOnlyEmbeddings.append(blob) - - -def GetBlobsOrder(meta_net_def): - return meta_net_def.blobsOrder - - -def SetTensorBoundShapes(meta_net_def, tensor_bound_shapes): - meta_net_def.tensorBoundShapes.CopyFrom(tensor_bound_shapes) - - -def SetAOTConfig(meta_net_def, aot_config): - meta_net_def.aotConfig.CopyFrom(aot_config) - - -def GetArgumentByName(net_def, arg_name): - for arg in net_def.arg: - if arg.name == arg_name: - return 
-
-
-def AddModelIdArg(meta_net_def, net_def):
-    """Takes the model_id from the predict_net of meta_net_def (if it is
-    populated) and adds it to the net_def passed in. This is intended to be
-    called on init_nets, as their model_id is not populated by default, but
-    should be the same as that of the predict_net
-    """
-    # Get model_id from the predict_net, assuming it's an integer
-    model_id = GetArgumentByName(meta_net_def.predict_net, "model_id")
-    if model_id is None:
-        return
-    model_id = model_id.i
-
-    # If there's another model_id on the net, replace it with the new one
-    old_id = GetArgumentByName(net_def, "model_id")
-    if old_id is not None:
-        old_id.i = model_id
-        return
-
-    # Add as an integer argument, this is also assumed above
-    arg = net_def.arg.add()
-    arg.name = "model_id"
-    arg.i = model_id
diff --git a/caffe2/python/predictor/predictor_test.py b/caffe2/python/predictor/predictor_test.py
deleted file mode 100644
index 64c88006686c..000000000000
--- a/caffe2/python/predictor/predictor_test.py
+++ /dev/null
@@ -1,73 +0,0 @@
-
-
-
-
-
-import unittest
-import numpy as np
-from caffe2.python import workspace, core
-
-from caffe2.proto import caffe2_pb2
-
-
-class TestPredictor(unittest.TestCase):
-    def setUp(self):
-        np.random.seed(1)
-        self.predict_net = self._predict_net
-        self.init_net = self._init_net
-
-    @property
-    def _predict_net(self):
-        net = caffe2_pb2.NetDef()
-        net.name = 'test-predict-net'
-        net.external_input[:] = ['A', 'B']
-        net.external_output[:] = ['C']
-        net.op.extend([
-            core.CreateOperator(
-                'MatMul',
-                ['A', 'B'],
-                ['C'],
-            )
-        ])
-        return net.SerializeToString()
-
-    @property
-    def _init_net(self):
-        net = caffe2_pb2.NetDef()
-        net.name = 'test-init-net'
-        net.external_output[:] = ['A', 'B']
-        net.op.extend([
-            core.CreateOperator(
-                'GivenTensorFill',
-                [],
-                ['A'],
-                shape=(2, 3),
-                values=np.zeros((2, 3), np.float32).flatten().tolist(),
-            ),
-            core.CreateOperator(
-                'GivenTensorFill',
-                [],
-                ['B'],
-                shape=(3, 4),
-                values=np.zeros((3, 4), np.float32).flatten().tolist(),
-            ),
-        ])
-        return net.SerializeToString()
-
-    def test_run(self):
-        A = np.ones((2, 3), np.float32)
-        B = np.ones((3, 4), np.float32)
-        predictor = workspace.Predictor(self.init_net, self.predict_net)
-        outputs = predictor.run([A, B])
-        self.assertEqual(len(outputs), 1)
-        np.testing.assert_almost_equal(np.dot(A, B), outputs[0])
-
-    def test_run_map(self):
-        A = np.zeros((2, 3), np.float32)
-        B = np.ones((3, 4), np.float32)
-        predictor = workspace.Predictor(self.init_net, self.predict_net)
-        outputs = predictor.run({
-            'B': B,
-        })
-        self.assertEqual(len(outputs), 1)
-        np.testing.assert_almost_equal(np.dot(A, B), outputs[0])
diff --git a/caffe2/python/predictor/serde.py b/caffe2/python/predictor/serde.py
deleted file mode 100644
index 2b8f1544803d..000000000000
--- a/caffe2/python/predictor/serde.py
+++ /dev/null
@@ -1,16 +0,0 @@
-## @package serde
-# Module caffe2.python.predictor.serde
-
-
-
-
-
-
-def serialize_protobuf_struct(protobuf_struct):
-    return protobuf_struct.SerializeToString()
-
-
-def deserialize_protobuf_struct(serialized_protobuf, struct_type):
-    deser = struct_type()
-    deser.ParseFromString(serialized_protobuf)
-    return deser
diff --git a/caffe2/python/predictor_constants.py b/caffe2/python/predictor_constants.py
deleted file mode 100644
index eda0c66974f4..000000000000
--- a/caffe2/python/predictor_constants.py
+++ /dev/null
@@ -1,9 +0,0 @@
-## @package predictor_constants
-# Module caffe2.python.predictor_constants
-
-
-
-
-import caffe2.proto.predictor_consts_pb2 as predictor_consts
-
-predictor_constants = predictor_consts.PredictorConsts()
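The removed serde module is a thin veneer over protobuf's own serialization; a round-trip sketch using NetDef (any message type works the same way):

    from caffe2.proto import caffe2_pb2

    net = caffe2_pb2.NetDef()
    net.name = "example-net"
    payload = serialize_protobuf_struct(net)  # just SerializeToString()
    restored = deserialize_protobuf_struct(payload, caffe2_pb2.NetDef)
    assert restored.name == "example-net"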
diff --git a/caffe2/python/pybind_state.cc b/caffe2/python/pybind_state.cc
deleted file mode 100644
index 0a8498103a82..000000000000
--- a/caffe2/python/pybind_state.cc
+++ /dev/null
@@ -1,1977 +0,0 @@
-#include "pybind_state.h"
-
-#include
-#include
-#include
-
-#include
-#include
-
-#include
-
-#include "caffe2/core/blob_serialization.h"
-#include "caffe2/core/blob_stats.h"
-#include "caffe2/core/common.h"
-#include "caffe2/core/db.h"
-#include "caffe2/core/numa.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/core/stats.h"
-#include "caffe2/core/transform.h"
-#include "caffe2/observers/profile_observer.h"
-#include "caffe2/observers/runcnt_observer.h"
-#include "caffe2/observers/time_observer.h"
-#include "caffe2/onnx/backend.h"
-#include "caffe2/onnx/helper.h"
-#include "caffe2/onnx/offline_tensor.h"
-#include "caffe2/onnx/onnx_exporter.h"
-#include "caffe2/opt/converter.h"
-#include "caffe2/opt/fakefp16_transform.h"
-#include "caffe2/opt/fusion.h"
-#include "caffe2/opt/mobile.h"
-#include "caffe2/opt/onnxifi_transformer.h"
-#include "caffe2/opt/optimize_ideep.h"
-#include "caffe2/opt/passes.h"
-#include "caffe2/opt/shape_info.h"
-#include "caffe2/predictor/emulator/data_filler.h"
-#include "caffe2/predictor/predictor.h"
-#include "caffe2/proto/caffe2_pb.h"
-#include "caffe2/proto/torch.pb.h"
-#include "caffe2/python/pybind_state_registry.h"
-#include "caffe2/python/pybind_workspace.h"
-#include "caffe2/utils/cpuid.h"
-#include "caffe2/utils/string_utils.h"
-#include "torch/csrc/autograd/variable.h"
-#include "torch/csrc/jit/python/module_python.h"
-
-// Because of CMake setup, we can't depend on script module here just yet -
-// it pulls in generated files from a different directory and it
-// probabilistically breaks the build.
-// TODO: enable if once shared libraries are unified in CMake
-#ifdef FBCODE_CAFFE2
-#include "torch/script.h"
-#endif
-
-namespace caffe2 {
-namespace python {
-
-// A dummy variable to overcome the pybind11 py::arg::operator= ambiguity
-// for some earlier versions of pybind11.
-constexpr bool kPyBindFalse = false; - -namespace py = pybind11; - -// NOLINTNEXTLINE(modernize-use-equals-default) -BlobFeederBase::~BlobFeederBase() {} - -C10_DEFINE_TYPED_REGISTRY( - BlobFeederRegistry, - caffe2::DeviceType, - BlobFeederBase, - std::unique_ptr); - -REGISTER_BLOB_FETCHER((TypeMeta::Id()), TensorFetcher); -REGISTER_BLOB_FEEDER(CPU, TensorFeeder); - -class StringFetcher : public BlobFetcherBase { - public: - py::object Fetch(const Blob& blob) override { - return py::bytes(blob.Get()); - } -}; -REGISTER_BLOB_FETCHER((TypeMeta::Id()), StringFetcher); - -#ifdef FBCODE_CAFFE2 -class ScriptModuleFetcher : public BlobFetcherBase { - public: - pybind11::object Fetch(const Blob& blob) override { - return py::cast(*blob.Get>()); - } -}; - -REGISTER_BLOB_FETCHER( - (TypeMeta::Id>()), - caffe2::python::ScriptModuleFetcher); -#endif - -static_assert( - sizeof(int) == sizeof(int32_t), - "We make an assumption that int is always int32 for numpy " - "type mapping."); -int CaffeToNumpyType(const TypeMeta meta) { -#ifdef USE_NUMPY - static std::map numpy_type_map{ - {TypeMeta::Id(), NPY_BOOL}, - {TypeMeta::Id(), NPY_DOUBLE}, - {TypeMeta::Id(), NPY_FLOAT}, - {TypeMeta::Id>(), NPY_COMPLEX128}, - {TypeMeta::Id>(), NPY_COMPLEX64}, - {TypeMeta::Id(), NPY_FLOAT16}, - {TypeMeta::Id(), NPY_INT}, - {TypeMeta::Id(), NPY_INT8}, - {TypeMeta::Id(), NPY_INT16}, - {TypeMeta::Id(), NPY_LONGLONG}, - {TypeMeta::Id(), NPY_UINT8}, - {TypeMeta::Id(), NPY_UINT16}, - {TypeMeta::Id(), NPY_OBJECT}, - // Note: Add more types here. - }; - const auto it = numpy_type_map.find(meta.id()); - return it == numpy_type_map.end() ? -1 : it->second; -#else - CAFFE_THROW("Caffe2 compiled without NumPy support."); -#endif // USE_NUMPY -} - -const TypeMeta NumpyTypeToCaffe(int numpy_type) { -#ifdef USE_NUMPY - static std::map caffe_type_map{ - {NPY_BOOL, TypeMeta::Make()}, - {NPY_DOUBLE, TypeMeta::Make()}, - {NPY_FLOAT, TypeMeta::Make()}, - {NPY_FLOAT16, TypeMeta::Make()}, - {NPY_INT, TypeMeta::Make()}, - {NPY_INT8, TypeMeta::Make()}, - {NPY_INT16, TypeMeta::Make()}, - {NPY_INT64, TypeMeta::Make()}, - {NPY_LONG, - sizeof(long) == sizeof(int) ? TypeMeta::Make() - : TypeMeta::Make()}, - {NPY_LONGLONG, TypeMeta::Make()}, - {NPY_UINT8, TypeMeta::Make()}, - {NPY_UINT16, TypeMeta::Make()}, - {NPY_OBJECT, TypeMeta::Make()}, - {NPY_UNICODE, TypeMeta::Make()}, - {NPY_STRING, TypeMeta::Make()}, - // Note: Add more types here. - }; - static TypeMeta unknown_type; - const auto it = caffe_type_map.find(numpy_type); - return it == caffe_type_map.end() ? unknown_type : it->second; -#else - CAFFE_THROW("Caffe2 compiled without NumPy support."); -#endif // USE_NUMPY -} - -template -std::function DefinitionGetter( - const Registry* registry) { - return [registry](const string& name) { return registry->HelpMessage(name); }; -} - -namespace python_detail { -// Python Op implementations. -using FuncRegistry = std::unordered_map; - -FuncRegistry& gRegistry() { - // Always leak the objects registered here. - static FuncRegistry* r = new FuncRegistry(); - return *r; -} - -const Func& getOpFunc(const std::string& token) { - CAFFE_ENFORCE( - gRegistry().count(token), - "Python operator for ", - token, - " is not available. 
If you use distributed training it probably means " - "that python implementation has to be registered in each of the workers"); - return gRegistry()[token]; -} - -const Func& getGradientFunc(const std::string& token) { - return getOpFunc(token + "_gradient"); -} - -py::object fetchBlob(Workspace* ws, const std::string& name) { - CAFFE_ENFORCE(ws->HasBlob(name), "Can't find blob: ", name); - const caffe2::Blob& blob = *(ws->GetBlob(name)); - auto fetcher = CreateFetcher(blob.meta().id()); - if (fetcher) { - return fetcher->Fetch(blob); - } else { - // If there is no fetcher registered, return a metainfo string. - // If all branches failed, we will return a metainfo string. - std::stringstream ss; - ss << std::string(name) << ", a C++ native class of type " - << blob.TypeName() << "."; - return py::bytes(ss.str()); - } -} - -// This function can only return true, but keeping it for backward compatibility -bool feedBlob( - Blob* blob, - const py::object& arg, - const py::object device_option) { - DeviceOption option; - if (!device_option.is_none()) { - // If we have a device option passed in, read it. - CAFFE_ENFORCE(ParseProtoFromLargeString( - py::bytes(device_option).cast(), &option)); - } -#ifdef USE_NUMPY - if (PyArray_Check(arg.ptr())) { // numpy array - PyArrayObject* array = reinterpret_cast(arg.ptr()); - auto feeder = CreateFeeder(option.device_type()); - CAFFE_ENFORCE(feeder, "Unknown device type encountered in FeedBlob."); - feeder->Feed(option, array, blob, true); /* default to inplace feed */ - return true; - } -#else - CAFFE_THROW("Caffe2 compiled without NumPy support."); -#endif // USE_NUMPY - if (PyBytes_Check(arg.ptr()) || PyUnicode_Check(arg.ptr())) { - *blob->GetMutable() = arg.cast(); - return true; - } -#ifdef FBCODE_CAFFE2 - if (auto module = torch::jit::as_module(arg)) { - blob->GetMutable>()->reset( - new torch::jit::Module(*module)); - return true; - } -#endif - CAFFE_THROW( - "Unexpected type of argument - only numpy array or string are " - "supported for feeding"); - return false; -} - -Blob deserializeBlob(const string& content) { - Blob blob; - DeserializeBlob(content, &blob); - return blob; -} -} // namespace python_detail - -class GetPythonGradient : public GradientMakerBase { - public: - using GradientMakerBase::GradientMakerBase; - std::vector GetGradientDefs() override { - CAFFE_ENFORCE(Def().type() == "Python" || Def().type() == "PythonDLPack"); - ArgumentHelper helper(Def()); - auto gradOutputIndices = - helper.GetRepeatedArgument("grad_output_indices"); - auto gradInputIndices = - helper.GetRepeatedArgument("grad_input_indices"); - std::vector gradientInputs; - for (int i = 0; i < def_.input_size(); ++i) { - // NOLINTNEXTLINE(performance-inefficient-vector-operation) - gradientInputs.push_back(I(i)); - } - for (int i = 0; i < def_.output_size(); ++i) { - gradientInputs.push_back(O(i)); - } - if (gradOutputIndices.size() > 0) { - // NOLINTNEXTLINE(modernize-loop-convert) - for (unsigned i = 0; i < gradOutputIndices.size(); ++i) { - int GO_i = gradOutputIndices[i]; - gradientInputs.push_back(GO(GO_i)); - } - } else { - for (int i = 0; i < def_.output_size(); ++i) { - gradientInputs.push_back(GO(i)); - } - } - std::vector gradientOutputs; - if (gradInputIndices.size() > 0) { - // NOLINTNEXTLINE(modernize-loop-convert) - for (unsigned i = 0; i < gradInputIndices.size(); ++i) { - int GI_i = gradInputIndices[i]; - gradientOutputs.push_back(GI(GI_i)); - } - } else { - for (int i = 0; i < def_.input_size(); ++i) { - gradientOutputs.push_back(GI(i)); - } - } - - 
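The token registry above (gRegistry) is what the "Python" operator resolves its payload from; the usual entry point is core.CreatePythonOperator, which registers the function and embeds the returned token in the OperatorDef. A sketch, assuming the TensorCPU feed/data bindings defined later in this file:

    import numpy as np
    from caffe2.python import core, workspace

    def double_it(inputs, outputs):
        # inputs/outputs are TensorCPU wrappers; feed() copies a numpy array in.
        outputs[0].feed(inputs[0].data * 2)

    op = core.CreatePythonOperator(double_it, ["x"], ["y"])
    workspace.FeedBlob("x", np.ones(3, dtype=np.float32))
    workspace.RunOperatorOnce(op)
    # workspace.FetchBlob("y") -> array([2., 2., 2.], dtype=float32)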
std::string grad_op_name = "PythonGradient"; - if (Def().type() == "PythonDLPack") { - grad_op_name = "PythonDLPackGradient"; - } - return SingleGradientDef(grad_op_name, "", gradientInputs, gradientOutputs); - } -}; - -REGISTER_CPU_OPERATOR(Python, PythonOp); -REGISTER_CPU_OPERATOR(PythonGradient, PythonGradientOp); -// Always allow running in-place -OPERATOR_SCHEMA(Python).AllowInplace([](int, int) { return true; }); -OPERATOR_SCHEMA(PythonGradient).AllowInplace([](int, int) { return true; }); -REGISTER_GRADIENT(Python, GetPythonGradient); - -REGISTER_CPU_OPERATOR(PythonDLPack, PythonOp); -REGISTER_CPU_OPERATOR(PythonDLPackGradient, PythonGradientOp); -OPERATOR_SCHEMA(PythonDLPack).AllowInplace([](int, int) { return true; }); -OPERATOR_SCHEMA(PythonDLPackGradient).AllowInplace([](int, int) { - return true; -}); -REGISTER_GRADIENT(PythonDLPack, GetPythonGradient); - -class BackgroundPlan { - public: - // NOLINTNEXTLINE(modernize-pass-by-value) - BackgroundPlan(Workspace* ws, PlanDef def) : ws_(ws), def_(def) {} - - void run() { - fut_ = - std::async(std::launch::async, [this]() { return ws_->RunPlan(def_); }); - } - - bool isDone() { - CAFFE_ENFORCE(fut_.valid()); - auto status = fut_.wait_for(std::chrono::milliseconds(0)); - return status == std::future_status::ready; - } - - bool isSucceeded() { - CAFFE_ENFORCE(isDone()); - return fut_.get(); - } - - private: - Workspace* ws_; - PlanDef def_; - - std::future fut_; -}; - -void addObjectMethods(py::module& m) { - py::class_(m, "Net") - .def( - "run", - [](NetBase* net) { - py::gil_scoped_release g; - CAFFE_ENFORCE(net->Run()); - }) - .def("cancel", [](NetBase* net) { - py::gil_scoped_release g; - net->Cancel(); - }); - - py::class_>(m, "Observer") - .def( - "average_time", - [](ObserverBase* ob) { - auto* cast_ob = dynamic_cast_if_rtti(ob); - CAFFE_ENFORCE( - cast_ob, "Observer does not implement this function."); - return cast_ob->average_time(); - }) - .def( - "average_time_children", - [](ObserverBase* ob) { - auto* cast_ob = dynamic_cast_if_rtti(ob); - CAFFE_ENFORCE( - cast_ob, "Observer does not implement this function."); - return cast_ob->average_time_children(); - }) - .def("debug_info", [](ObserverBase* ob) { - return ob->debugInfo(); - }); - - py::class_(m, "Blob") - .def( - "serialize", - [](const Blob& blob, const std::string& name) -> py::bytes { - return SerializeBlob(blob, name); - }) - .def( - "deserialize", - [](Blob* blob, py::bytes serialized) { - DeserializeBlob(serialized, blob); - }) - .def( - "fetch", - [](const Blob& blob) { - auto fetcher = CreateFetcher(blob.meta().id()); - CAFFE_ENFORCE( - fetcher, - "Could not fetch for blob of type: ", - blob.meta().name()); - return fetcher->Fetch(blob); - }) - .def("is_tensor", [](Blob* blob) { return blob->IsType(); }) - // return any device Tensor - .def( - "as_tensor", - [](Blob* blob) { - CAFFE_ENFORCE( - blob->IsType(), - "Passed in blob doesn't contain Tensor and instead has ", - blob->meta()); - return py::cast(&blob->Get()); - }, - py::return_value_policy::reference_internal) - // legacy API that resets tensor to CPUTensor if it's not already - .def( - "tensor", - [](Blob* blob) { return py::cast(BlobGetMutableTensor(blob, CPU)); }, - py::return_value_policy::reference_internal) - .def( - "_feed", - &python_detail::feedBlob, - "Feed an input array or string, with the (optional) DeviceOption", - py::arg("arg"), - py::arg("device_option") = py::none()) - .def("_wrap_tensor_impl", [](Blob* blob, void* ptr) { - auto p = c10::intrusive_ptr:: - 
unsafe_reclaim_from_nonowning(static_cast(ptr)); - TORCH_CHECK(p.defined(), "Can't wrap undefined tensor"); - TORCH_CHECK( - !p->requires_grad(), "Can wrap only non-requires-grad tensor"); - auto at_tensor = at::Tensor::wrap_tensor_impl(std::move(p)); - BlobSetTensor(blob, Tensor(std::move(at_tensor))); - }); - - py::class_>(m, "DLPackTensorCPU") - .def_property_readonly( - "data", - [](DLPackWrapper* t) -> py::object { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_CPU, - "Expected CPU device option for CPU tensor"); - return t->data(); - }, - "Return DLPack tensor with tensor's data.") - .def( - "feed", - [](DLPackWrapper* t, py::object obj) { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_CPU, - "Expected CPU device option for CPU tensor"); - t->feed(obj); - }, - "Copy data from given DLPack tensor into this tensor.") - .def_property_readonly( - "_shape", - [](const DLPackWrapper& t) { - auto* tensor = t.tensor; - // TODO: This is marginally less efficient than it could - // be, since we're doing an extra allocation we didn't - // need to do. But I don't remember how to clue in - // pybind11 how to convert ArrayRef to vector. - return tensor->sizes().vec(); - }) - .def( - "_reshape", - [](DLPackWrapper* t, std::vector dims) { - auto* tensor = t->tensor; - tensor->Resize(dims); - }); - - py::class_(m, "TensorCPU") - .def_property_readonly( - "data", - [](TensorCPU* t) -> py::object { - if (t->dtype() == TypeMeta{}) { - // keep this behavior for backward compatibility - t->mutable_data(); - } - auto res = TensorFetcher().FetchTensor(*t, false); - return res.obj; - }, - "Return numpy array pointing to this tensor's data if possible. " - "Otherwise (e.g. for strings) copies the data (same as fetch).") - .def( - "feed", - [](TensorCPU* t, py::object obj) { -#ifdef USE_NUMPY - if (!PyArray_Check(obj.ptr())) { - CAFFE_THROW( - "Unexpected type of argument -- expected numpy array"); - } - *t = TensorFeeder().FeedTensor( - DeviceOption{}, reinterpret_cast(obj.ptr())); -#else - CAFFE_THROW("Caffe2 compiled without NumPy support."); -#endif // USE_NUMPY - }, - "Copy data from given numpy array into this tensor.") - .def( - "fetch", - [](TensorCPU* t) { - auto res = TensorFetcher().FetchTensor(*t, true); - return res.obj; - }, - "Copy data from this tensor into a new numpy array.") - .def( - "init", - [](Tensor* t, std::vector dims, int caffe_type) { - const auto& meta = - DataTypeToTypeMeta((TensorProto::DataType)caffe_type); - CAFFE_ENFORCE( - !TensorFetcher().NeedsCopy(t, meta), - "Cannot init tensor of this type. Use `feed` instead."); - t->Resize(dims); - t->raw_mutable_data(meta); - }, - "Initialize this tensor to given shape and data type. 
" - "Fail if the given data type cannot be accessed from python.") - .def( - "_tensor_impl_raw_handle", - [](TensorCPU* t) -> void* { - // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) - auto p = t->getIntrusivePtr(); - // We return a raw non-owning pointer here, we rely on surrounding - // code to keep the original tensor alive - return p.get(); - }) - .def_property_readonly( - "_shape", [](const TensorCPU& t) { return t.sizes().vec(); }) - .def("_reshape", [](TensorCPU* t, std::vector dims) { - t->Resize(dims); - }); - - py::class_(m, "Workspace") - .def(py::init<>()) - .def(py::init()) - .def_property_readonly( - "nets", - [](Workspace* self) { - TORCH_CHECK_NOTNULL(self); - std::map nets; - for (const auto& name : self->Nets()) { - LOG(INFO) << "name: " << name; - nets[name] = py::cast(self->GetNet(name)); - } - return nets; - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "blobs", - [](Workspace* self) { - TORCH_CHECK_NOTNULL(self); - std::map blobs; - for (const auto& name : self->Blobs()) { - blobs[name] = py::cast(self->GetBlob(name)); - } - return blobs; - }, - py::return_value_policy::reference_internal) - .def( - "_create_net", - [](Workspace* self, py::bytes def, bool overwrite) -> py::object { - caffe2::NetDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - NetBase* net = self->CreateNet(proto, overwrite); - CAFFE_ENFORCE(net); - return py::cast(net); - }, - py::return_value_policy::reference_internal, - py::arg("def"), - py::arg("overwrite") = kPyBindFalse) - .def( - "create_blob", - [](Workspace* self, const std::string& name) -> py::object { - return py::cast(self->CreateBlob(name)); - }, - py::return_value_policy::reference_internal) - .def( - "_remove_blob", - [](Workspace* self, const std::string& name) -> py::bool_ { - return self->RemoveBlob(name); - }) - .def("fetch_blob", &python_detail::fetchBlob) - .def( - "has_blob", - [](Workspace* self, const std::string& name) { - return self->HasBlob(name); - }) - .def( - "_run_net", - [](Workspace* self, py::bytes def) { - caffe2::NetDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - py::gil_scoped_release g; - CAFFE_ENFORCE(self->RunNetOnce(proto)); - }) - .def( - "_run_operator", - [](Workspace* self, py::bytes def) { - caffe2::OperatorDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - py::gil_scoped_release g; - CAFFE_ENFORCE(self->RunOperatorOnce(proto)); - }) - .def( - "_run_plan", - [](Workspace* self, py::bytes def) { - caffe2::PlanDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - py::gil_scoped_release g; - CAFFE_ENFORCE(self->RunPlan(proto)); - }) - .def( - "_last_failed_op_net_position", - [](Workspace* self) { - CAFFE_ENFORCE(self); - return (int)self->last_failed_op_net_position; - }) - .def_property_readonly_static("current", [](py::object /* type */) { - auto ws = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(ws); - return py::cast(ws, py::return_value_policy::reference); - }); - - py::class_>( - m, "BackgroundPlan") - .def("is_done", &BackgroundPlan::isDone) - .def("is_succeeded", &BackgroundPlan::isSucceeded); - - // Gradients - py::class_(m, "GradientWrapper") - .def(py::init<>()) - .def_readwrite("dense", &GradientWrapper::dense_) - .def_readwrite("indices", &GradientWrapper::indices_) - .def_readwrite("values", &GradientWrapper::values_) - .def("is_sparse", &GradientWrapper::IsSparse) - .def("is_dense", &GradientWrapper::IsDense) - 
.def("is_empty", &GradientWrapper::IsEmpty); - - m.def( - "get_gradient_defs", - [](py::bytes op_def, std::vector output_gradients) { - OperatorDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(op_def.cast(), &def)); - CAFFE_ENFORCE(caffe2::GradientRegistry()->Has(def.type())); - const auto& meta = GetGradientForOp(def, output_gradients); - std::vector grad_ops; - for (const auto& op : meta.ops_) { - // NOLINTNEXTLINE(modernize-use-emplace) - grad_ops.push_back( - SerializeAsString_EnforceCheck(op, "addObjectMethods")); - } - return std::pair, std::vector>{ - grad_ops, meta.g_input_}; - }, - pybind11::return_value_policy::copy); - - // DB - py::class_(m, "Transaction") - .def("put", &db::Transaction::Put) - .def("commit", &db::Transaction::Commit); - py::class_(m, "Cursor") - .def("supports_seek", &db::Cursor::SupportsSeek) - .def("seek_to_first", &db::Cursor::SeekToFirst) - .def("next", &db::Cursor::Next) - .def("key", [](db::Cursor* self) -> py::bytes { return self->key(); }) - .def("value", [](db::Cursor* self) -> py::bytes { return self->value(); }) - .def("valid", &db::Cursor::Valid); - py::enum_(m, "Mode") - .value("read", db::Mode::READ) - .value("write", db::Mode::WRITE) - .value("new", db::Mode::NEW) - .export_values(); - py::class_*/>(m, "DB") - .def("new_transaction", &db::DB::NewTransaction) - .def("new_cursor", &db::DB::NewCursor) - .def("close", &db::DB::Close); - m.def("create_db", &db::CreateDB); - m.def("registered_dbs", []() { - return caffe2::db::Caffe2DBRegistry()->Keys(); - }); - - // OpSchema - py::class_ op_schema(m, "OpSchema"); - op_schema.def_property_readonly("file", &OpSchema::file) - .def_property_readonly("line", &OpSchema::line) - .def_property_readonly("private", &OpSchema::private_op) - .def_property_readonly( - "doc", &OpSchema::doc, py::return_value_policy::reference) - .def_property_readonly("args", &OpSchema::args) - .def_property_readonly("input_desc", &OpSchema::input_desc) - .def_property_readonly("output_desc", &OpSchema::output_desc) - .def_property_readonly("max_input", &OpSchema::max_input) - .def_property_readonly("max_output", &OpSchema::max_output) - .def_property_readonly("min_input", &OpSchema::min_input) - .def_property_readonly("min_output", &OpSchema::min_output) - .def_property_readonly("inf", &OpSchema::inf) - // Note: this does not work yet, we will need to figure out how to pass - // protobuf objects. 
- .def("infer_tensor", &OpSchema::InferTensor) - .def("CalculateOutput", &OpSchema::CalculateOutput) - .def("inplace_enforced", &OpSchema::inplace_enforced) - .def("num_inputs_allowed", &OpSchema::num_inputs_allowed) - .def("num_outputs_allowed", &OpSchema::num_outputs_allowed) - .def("num_inputs_outputs_allowed", &OpSchema::num_inputs_outputs_allowed) - .def_static( - "get", &OpSchemaRegistry::Schema, py::return_value_policy::reference) - .def_static( - "get_cpu_impl", - DefinitionGetter(CPUOperatorRegistry()), - py::return_value_policy::reference) - .def_static( - "get_cuda_impl", - DefinitionGetter(CUDAOperatorRegistry()), - py::return_value_policy::reference) - .def_static( - "get_gradient_impl", - DefinitionGetter(GradientRegistry()), - py::return_value_policy::reference); - - py::class_(op_schema, "Argument") - .def_property_readonly("name", &OpSchema::Argument::name) - .def_property_readonly("description", &OpSchema::Argument::description) - .def_property_readonly("required", &OpSchema::Argument::is_required); - - py::class_(m, "Caffe2Ops") - .def(py::init([](const std::vector& init_ops, - const std::vector& ops, - const std::vector& interface_blobs) { - auto* c2ops = new caffe2::onnx::Caffe2Ops(); - for (const auto& s : init_ops) { - ParseProtoFromLargeString( - s.cast(), c2ops->init_ops.Add()); - } - for (const auto& s : ops) { - ParseProtoFromLargeString(s.cast(), c2ops->ops.Add()); - } - for (const auto& s : interface_blobs) { - auto* tmp = c2ops->interface_blobs.Add(); - *tmp = s; - } - return c2ops; - })); - - py::class_(m, "DummyName") - .def(py::init<>()) - .def( - "reset", - [](caffe2::onnx::DummyName& instance, const py::object& args) { - if (args.is_none()) { - instance.Reset(std::unordered_set()); - } else { - instance.Reset(args.cast>()); - } - }, - "Reset the dummy name generator", - py::arg("args") = py::none()) - .def( - "new_dummy_name", - [](caffe2::onnx::DummyName& instance) -> std::string { - return instance.NewDummyName(); - }); - - py::class_(m, "Caffe2BackenRep") - .def(py::init<>()) - .def( - "init_net", - [](caffe2::onnx::Caffe2BackendRep& instance) { - const auto& init_net = instance.init_net(); - std::string out; - init_net.SerializeToString(&out); - return py::bytes(out); - }) - - .def( - "pred_net", - [](caffe2::onnx::Caffe2BackendRep& instance) { - const auto& pred_net = instance.pred_net(); - std::string out; - pred_net.SerializeToString(&out); - return py::bytes(out); - }) - .def( - "external_outputs", - [](caffe2::onnx::Caffe2BackendRep& instance) { - std::vector outputs; - for (const auto& o : instance.pred_net().external_output()) { - outputs.emplace_back(o); - } - return outputs; - }) - .def( - "external_inputs", - [](caffe2::onnx::Caffe2BackendRep& instance) { - std::vector inputs; - for (const auto& o : instance.pred_net().external_input()) { - inputs.emplace_back(o); - } - return inputs; - }) - .def( - "uninitialized_inputs", - [](caffe2::onnx::Caffe2BackendRep& instance) { - return instance.uninitialized_inputs(); - }) - .def( - "run", - [](caffe2::onnx::Caffe2BackendRep& instance, - std::map inputs) - -> std::vector { - caffe2::Predictor::TensorMap tensors_data{}; - for (const auto& pair : inputs) { - const auto& name = pair.first; - const auto& input = pair.second; -#ifdef USE_NUMPY - CAFFE_ENFORCE( - PyArray_Check(input.ptr()), - "Input must be of type numpy array."); - PyArrayObject* array = - reinterpret_cast(input.ptr()); - tensors_data.emplace( - name, - TensorFeeder().FeedTensor(DeviceOption(), array)); -#else - CAFFE_THROW("Caffe2 was 
compiled without NumPy support."); -#endif // USE_NUMPY - } - caffe2::Predictor::TensorList out; - instance.RunMap(tensors_data, &out); - std::vector pyout; - for (auto& t : out) { - pyout.push_back(TensorFetcher().FetchTensor(t, true).obj); - } - return pyout; - }) - .def( - "run", - [](caffe2::onnx::Caffe2BackendRep& instance, - std::vector inputs) -> std::vector { - std::vector tensors_data; -#ifdef USE_NUMPY - // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0U; i < inputs.size(); ++i) { - auto input = inputs[i]; - CAFFE_ENFORCE( - PyArray_Check(input.ptr()), - "Input must be of type numpy array."); - PyArrayObject* array = - reinterpret_cast(input.ptr()); - tensors_data.push_back( - TensorFeeder().FeedTensor(DeviceOption(), array)); - } -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - std::vector out; - instance.Run(tensors_data, &out); - std::vector pyout; - for (auto& t : out) { - // NOLINTNEXTLINE(performance-inefficient-vector-operation) - pyout.push_back(TensorFetcher().FetchTensor(t, true).obj); - } - return pyout; - }); - - py::class_(m, "Caffe2Backend") - .def(py::init<>()) - .def(py::init()) - .def( - "support_onnx_import", - [](caffe2::onnx::Caffe2Backend& instance, - const std::string& op) -> bool { return instance.SupportOp(op); }) - .def( - "prepare", - [](caffe2::onnx::Caffe2Backend& instance, - const py::bytes& onnx_model_str, - const std::string& device, - const std::vector& extras) { - auto* rep = instance.Prepare( - onnx_model_str.cast(), device, extras); - return rep; - }) - .def( - "convert_node", - [](caffe2::onnx::Caffe2Backend& instance, - const py::bytes& node_str, - const std::vector& value_infos_bytes, - int opset_version) -> std::vector> { - // Note that we return two lists of serialized ops. The first set is - // init_ops and the second set is ops for pred net. When converting - // RNN related op, it is possible that we will create ops in the - // init_net. 
Hence the return structure here - caffe2::onnx::ValueInfoMap value_infos{}; - for (const auto& vi_bytes : value_infos_bytes) { - ::ONNX_NAMESPACE::ValueInfoProto vi{}; - vi.ParseFromString(vi_bytes); - auto name = vi.name(); - value_infos.emplace(std::move(name), std::move(vi)); - } - auto c2ops = instance.ConvertNode( - node_str.cast(), {value_infos, opset_version}); - std::vector> vals; - vals.emplace_back(); - auto& init_vals = vals.back(); - for (const auto& init_op : c2ops.init_ops) { - std::string out; - init_op.SerializeToString(&out); - init_vals.emplace_back(py::bytes(out)); - } - vals.emplace_back(); - auto& normal_vals = vals.back(); - for (const auto& op : c2ops.ops) { - std::string out; - op.SerializeToString(&out); - normal_vals.emplace_back(py::bytes(out)); - } - return vals; - }, - py::arg("node_str"), - py::arg("value_infos_bytes") = std::vector{}, - py::arg("opset_version") = kKnownOpsetVersion) - .def( - "_build_tensor_filling_op", - [](caffe2::onnx::Caffe2Backend& instance, - const py::bytes& tensor_proto_str, - const std::string& name = "") -> py::bytes { - caffe2::OperatorDef op; - ::ONNX_NAMESPACE::TensorProto tp; - ParseProtoFromLargeString(tensor_proto_str, &tp); - instance.BuildTensorFillingOp(&op, tp, name); - std::string out; - op.SerializeToString(&out); - return py::bytes(out); - }); - - py::class_(m, "Predictor") - .def(py::init([](py::bytes init_net, py::bytes predict_net) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - NetDef init_net_, predict_net_; - CAFFE_ENFORCE(ParseProtoFromLargeString( - init_net.cast(), &init_net_)); - CAFFE_ENFORCE(ParseProtoFromLargeString( - predict_net.cast(), &predict_net_)); - return new Predictor( - makePredictorConfig(init_net_, predict_net_, workspace)); - })) - .def( - "run", - [](Predictor& instance, - std::vector inputs) -> std::vector { - std::vector tensors_data; -#ifdef USE_NUMPY - // NOLINTNEXTLINE(modernize-loop-convert) - for (auto i = 0U; i < inputs.size(); ++i) { - auto input = inputs[i]; - CAFFE_ENFORCE( - PyArray_Check(input.ptr()), - "Input must be of type numpy array."); - PyArrayObject* array = - reinterpret_cast(input.ptr()); - tensors_data.push_back( - TensorFeeder().FeedTensor(DeviceOption(), array)); - } -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - std::vector out; - instance(tensors_data, &out); - std::vector pyout; - for (auto& t : out) { - // NOLINTNEXTLINE(performance-inefficient-vector-operation) - pyout.push_back(TensorFetcher().FetchTensor(t, true).obj); - } - return pyout; - }) - .def( - "run", - [](Predictor& instance, std::map inputs) - -> std::vector { - Predictor::TensorMap tensors_data; -#ifdef USE_NUMPY - for (const auto& pair : inputs) { - const auto& name = pair.first; - const auto& input = pair.second; - CAFFE_ENFORCE( - PyArray_Check(input.ptr()), - "Input must be of type numpy array."); - PyArrayObject* array = - reinterpret_cast(input.ptr()); - tensors_data.emplace( - name, - TensorFeeder().FeedTensor(DeviceOption(), array)); - } -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - Predictor::TensorList out; - instance(tensors_data, &out); - std::vector pyout; - for (auto& t : out) { - pyout.push_back(TensorFetcher().FetchTensor(t, true).obj); - } - return pyout; - }); -} - -void addGlobalMethods(py::module& m) { - m.attr("is_asan") = py::bool_(C10_ASAN_ENABLED); - m.attr("has_fbgemm") = py::bool_( -#ifdef USE_FBGEMM - true -#else - false -#endif - ); - 
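These attributes let Python code probe how the extension was compiled. A small sketch, assuming the extension module is reachable as workspace.C (the alias caffe2.python.workspace uses for the C extension):

    from caffe2.python import workspace

    # Compile-time switches surfaced by addGlobalMethods.
    print(workspace.C.is_asan, workspace.C.has_fbgemm)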
m.def("get_build_options", []() { return GetBuildOptions(); }); - - // The old mkl backend has been removed permanently, but we - // keep this Python attribute for BC - m.attr("has_mkldnn") = py::bool_(false); - - m.attr("use_mkldnn") = py::bool_( -#ifdef USE_MKLDNN - true -#else // USE_MKLDNN - false -#endif // USE_MKLDNN - ); - - // if the binary is built with USE_ROCM, this is a ROCm build - // and therefore we need to ignore dyndep failures (because the module - // may not have a ROCm equivalent yet e.g. nccl) - m.attr("use_rocm") = py::bool_( -#if defined(USE_ROCM) - true -#else // USE_ROCM - false -#endif // USE_ROCM - ); - - m.attr("use_trt") = py::bool_( -#ifdef CAFFE2_USE_TRT - true -#else // CAFFE2_USE_TRT - false -#endif // CAFFE2_USE_TRT - ); - - m.attr("define_caffe2_no_operator_schema") = py::bool_( -#ifdef CAFFE2_NO_OPERATOR_SCHEMA - true -#else // CAFFE2_NO_OPERATOR_SCHEMA - false -#endif // CAFFE2_NO_OPERATOR_SCHEMA - ); - - m.def("set_per_op_engine_pref", [](const PerOpEnginePrefType& pref) -> void { - caffe2::SetPerOpEnginePref(pref); - }); - - m.def("set_global_engine_pref", [](const GlobalEnginePrefType& pref) -> void { - caffe2::SetGlobalEnginePref(pref); - }); - m.def( - "set_engine_pref", - [](const PerOpEnginePrefType& per_op_pref, - const GlobalEnginePrefType& global_pref) -> void { - caffe2::SetEnginePref(per_op_pref, global_pref); - }); - m.def( - "set_op_engine_pref", - [](const std::string& op_type, - const CaffeMap& op_pref) -> void { - caffe2::SetOpEnginePref(op_type, op_pref); - }); - - m.def( - "op_registry_key", - [](const std::string& op_type, - const std::string& engine) -> const std::string { - return caffe2::OpRegistryKey(op_type, engine); - }); - m.def("global_init", [](std::vector args) -> void { - int argc = args.size(); - std::vector argv; - for (auto& arg : args) { - // NOLINTNEXTLINE(performance-inefficient-vector-operation,cppcoreguidelines-pro-type-const-cast) - argv.push_back(const_cast(arg.data())); - } - char** pargv = argv.data(); - CAFFE_ENFORCE(caffe2::GlobalInit(&argc, &pargv)); - }); - - m.def("registered_operators", []() { - std::set all_keys = caffe2::GetRegisteredOperators(); - - // Ensure we are lexicographically ordered. - std::vector keys; - for (const auto& key : all_keys) { - // NOLINTNEXTLINE(performance-inefficient-vector-operation) - keys.push_back(key); - } - return keys; - }); - m.def("on_module_exit", []() { caffe2::python::ClearWorkspaces(); }); - // create_if_missing not used by necessary for pybind to do - // properly do function overloading. 
- m.def( - "switch_workspace", [](Workspace* ws, py::object /*create_if_missing*/) { - // TODO - caffe2::python::SetCurrentWorkspace(ws); - }); - m.def( - "create_child_workspace", - [](const std::string& parent_ws_name, const std::string& child_ws_name) { - auto parent_gws = caffe2::python::GetWorkspaceByName(parent_ws_name); - CAFFE_ENFORCE(parent_gws, "Parent ws does not exist."); - std::unique_ptr child_ws(new Workspace(parent_gws)); - caffe2::python::InsertWorkspace(child_ws_name, std::move(child_ws)); - }, - "Create and register child ws, sharing existing blobs in parent ws.", - py::arg("parent_ws_name"), - py::arg("child_ws_name")); - m.def( - "switch_workspace", - [](const std::string& name, const py::object create_if_missing) { - if (create_if_missing.is_none()) { - return caffe2::python::SwitchWorkspaceInternal(name, false); - } - return caffe2::python::SwitchWorkspaceInternal( - name, create_if_missing.cast()); - }, - "Switch to the specified workspace, creating if necessary", - py::arg("name"), - py::arg("create_if_missing") = py::none()); - m.def( - "reset_workspace", - [](const py::object& root_folder) { - VLOG(1) << "Resetting workspace."; - if (root_folder.is_none()) { - caffe2::python::ResetWorkspace(new Workspace()); - } else { - caffe2::python::ResetWorkspace( - new Workspace(root_folder.cast())); - } - return true; - }, - "Reset the workspace", - py::arg("root_folder") = py::none()); - - m.def("root_folder", []() { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - return workspace->RootFolder(); - }); - m.def("current_workspace", []() { - return caffe2::python::GetCurrentWorkspaceName(); - }); - m.def("workspaces", []() { - std::vector names; - caffe2::python::GetWorkspaceNames(names); - return names; - }); - m.def("nearby_opnames", [](const std::string& name) { - std::vector alternatives; - unsigned editTolerance = 3; - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto it : caffe2::CPUOperatorRegistry()->Keys()) { - if (editDistance(it, name, editTolerance) < editTolerance + 1) { - alternatives.push_back(it); - } - } - return alternatives; - }); - m.def("local_blobs", []() { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - return workspace->LocalBlobs(); - }); - m.def("blobs", []() { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - return workspace->Blobs(); - }); - m.def("has_blob", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - return workspace->HasBlob(name); - }); - m.def( - "fill_random_network_inputs", - [](const py::bytes& net_def, - const std::vector>>& inputDims, - const std::vector>& inputTypes) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - py::gil_scoped_release g; - NetDef net; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def.cast(), &net)); - caffe2::emulator::fillRandomNetworkInputs( - net, inputDims, inputTypes, workspace); - }); - m.def( - "create_net", - [](py::bytes net_def, bool overwrite) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - caffe2::NetDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def.cast(), &proto), - "Can't parse net proto: ", - net_def.cast()); - CAFFE_ENFORCE( - workspace->CreateNet(proto, overwrite), - "Error creating net with proto: ", - net_def.cast()); - return true; - }, - py::arg("net_def"), - 
py::arg("overwrite") = kPyBindFalse); - m.def("run_net", [](const std::string& name, int num_iter, bool allow_fail) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - CAFFE_ENFORCE(workspace->GetNet(name), "Can't find net ", name); - py::gil_scoped_release g; - for (int i = 0; i < num_iter; i++) { - bool success = workspace->RunNet(name); - if (!allow_fail) { - CAFFE_ENFORCE(success, "Error running net ", name); - } else { - if (!success) { - return false; - } - } - } - return true; - }); - m.def( - "add_observer_to_net", - [](const std::string& net_name, const std::string& observer_type) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name); - py::gil_scoped_release g; - - NetBase* net = workspace->GetNet(net_name); - const Observable::Observer* observer = nullptr; - -#define REGISTER_PYTHON_EXPOSED_OBSERVER(ob_type) \ - { \ - if (observer_type.compare(#ob_type) == 0) { \ - unique_ptr net_ob = make_unique(net); \ - observer = net->AttachObserver(std::move(net_ob)); \ - } \ - } - - REGISTER_PYTHON_EXPOSED_OBSERVER(ProfileObserver); - REGISTER_PYTHON_EXPOSED_OBSERVER(TimeObserver); -#undef REGISTER_PYTHON_EXPOSED_OBSERVER - - if (observer_type.compare("RunCountObserver") == 0) { - unique_ptr net_ob = - make_unique(net); - observer = net->AttachObserver(std::move(net_ob)); - } - - CAFFE_ENFORCE(observer != nullptr); - return py::cast(observer); - }); - m.def( - "remove_observer_from_net", - [](const std::string& net_name, const ObserverBase* observer) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name); - py::gil_scoped_release g; - - NetBase* net = workspace->GetNet(net_name); - net->DetachObserver(observer); - }); - m.def("clear_global_net_observer", []() { - py::gil_scoped_release g; - caffe2::ClearGlobalNetObservers(); - }); - m.def("num_observers_on_net", [](const std::string& net_name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - CAFFE_ENFORCE(workspace->GetNet(net_name), "Can't find net ", net_name); - py::gil_scoped_release g; - - NetBase* net = workspace->GetNet(net_name); - return net->NumObservers(); - }); - m.def( - "benchmark_net", - [](const std::string& name, - size_t warmup_runs, - size_t main_runs, - bool run_individual) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* net = workspace->GetNet(name); - CAFFE_ENFORCE(net, "Didn't find net: ", name); - py::gil_scoped_release g; - vector stat = - net->TEST_Benchmark(warmup_runs, main_runs, run_individual); - return stat; - }); - m.def("benchmark_net_once", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* net = workspace->GetNet(name); - CAFFE_ENFORCE(net, "Didn't find net: ", name); - py::gil_scoped_release g; - float stat = net->TEST_Benchmark_One_Run(); - return stat; - }); - - m.def("delete_net", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - workspace->DeleteNet(name); - return true; - }); - m.def("nets", []() { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - return workspace->Nets(); - }); - m.def("run_operator_once", [](const py::bytes& op_def) { - 
Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - OperatorDef def; - CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast(), &def)); - py::gil_scoped_release g; - CAFFE_ENFORCE(workspace->RunOperatorOnce(def)); - return true; - }); - // Run an operator multiple times. - // This is needed for microbenchmarking as we want the benchmark loop to be in - // C++ to minimize overhead. - m.def("run_operator_multiple", [](const py::bytes& op_def, int num_runs) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - OperatorDef def; - CAFFE_ENFORCE(ParseProtoFromLargeString(op_def.cast(), &def)); - py::gil_scoped_release g; - std::unique_ptr op(CreateOperator(def, workspace)); - for (int i = 0; i < num_runs; i++) { - if (!op->Run()) { - return false; - } - } - return true; - }); - m.def( - "get_operator_cost", - [](const py::bytes& op_def, const std::vector& input_blobs) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - OperatorDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(op_def.cast(), &def), - "Couldn't parse operator proto."); - const auto op_type = def.type(); - auto* schema = OpSchemaRegistry::Schema(op_type); - CAFFE_ENFORCE(schema); - vector shapes; - for (const auto& blob_name : input_blobs) { - auto* blob = workspace->GetBlob(blob_name); - shapes.emplace_back(GetTensorShapeOfBlob(blob)); - } - const auto c = schema->InferCost(def, shapes); - return std::make_tuple(c.flops, c.bytes_written, c.bytes_read); - }); - m.def("run_net_once", [](const py::bytes& net_def) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - NetDef def; - CAFFE_ENFORCE(ParseProtoFromLargeString(net_def.cast(), &def)); - py::gil_scoped_release g; - CAFFE_ENFORCE(workspace->RunNetOnce(def)); - return true; - }); - m.def("run_plan", [](const py::bytes& plan_def) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - PlanDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(plan_def.cast(), &def)); - py::gil_scoped_release g; - CAFFE_ENFORCE(workspace->RunPlan(def)); - return true; - }); - m.def("run_plan_in_background", [](const py::bytes& plan_def) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - PlanDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(plan_def.cast(), &def)); - py::gil_scoped_release g; - - auto background_plan = std::make_shared(workspace, def); - background_plan->run(); - return background_plan; - }); - m.def( - "apply_transform", - [](const string& transform_key, const py::bytes& net_def) { - NetDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def.cast(), &def)); - py::gil_scoped_release g; - - auto transformed_net = ApplyTransform(transform_key, def); - - std::string protob; - CAFFE_ENFORCE(transformed_net.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "apply_transform_if_faster", - [](const string& transform_key, - const py::bytes& net_def_bytes, - const py::bytes& init_def_bytes, - int warmup_runs, - int main_runs, - double improvement_threshold) { - NetDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def_bytes.cast(), &def)); - NetDef init_def; - CAFFE_ENFORCE(ParseProtoFromLargeString( - init_def_bytes.cast(), &init_def)); - py::gil_scoped_release g; - - std::string protob; - - auto transformed_net = ApplyTransformIfFaster( - transform_key, - def, - init_def, - warmup_runs, - main_runs, 
- improvement_threshold); - - CAFFE_ENFORCE(transformed_net.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "memonger_compute_blob_recycling_for_dag", - [](const py::bytes& net_def, - const std::vector& input_blobs, - const std::vector& op_indices, - const std::unordered_set& shareable_blob_names, - const string& namescope, - const std::unordered_set& dont_share_blob_names, - const std::unordered_map>& blob_shapes) { - py::gil_scoped_release g; - NetDef net; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def.cast(), &net)); - NetDef optimized_proto = - caffe2::memonger::compute_blob_recycling_for_dag( - net, - input_blobs, - op_indices, - shareable_blob_names, - namescope, - dont_share_blob_names, - blob_shapes); - std::string protob; - CAFFE_ENFORCE(optimized_proto.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "memonger_optimize_inference_net", - [](const py::bytes& net_def, - const std::vector& static_blobs) { - NetDef def; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_def.cast(), &def)); - py::gil_scoped_release g; - - std::set static_blobs_set( - static_blobs.begin(), static_blobs.end()); - NetDef optimized = - caffe2::memonger::optimize_inference_net(def, static_blobs_set); - - std::string protob; - CAFFE_ENFORCE(optimized.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "infer_shapes_and_types_from_workspace", - [](const std::vector& net_protos) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - - // Parse protobuffers to NetDefs - std::vector> nets; - std::vector nets_ptr; - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto proto : net_protos) { - std::unique_ptr def(new NetDef()); - CAFFE_ENFORCE(def->ParseFromString(proto)); - nets_ptr.push_back(def.get()); - nets.push_back(std::move(def)); - } - - auto blob_info = - InferBlobShapesAndTypesFromWorkspace(workspace, nets_ptr); - - std::string protob; - CAFFE_ENFORCE(blob_info.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "infer_shapes_and_types_from_map", - [](const std::vector& net_protos, - const std::map> blob_dimensions) { - // Parse protobuffers to NetDefs - std::vector> nets; - std::vector nets_ptr; - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto proto : net_protos) { - std::unique_ptr def(new NetDef()); - CAFFE_ENFORCE(def->ParseFromString(proto)); - nets_ptr.push_back(def.get()); - nets.push_back(std::move(def)); - } - - auto blob_info = - InferBlobShapesAndTypesFromMap(blob_dimensions, nets_ptr); - - std::string protob; - CAFFE_ENFORCE(blob_info.SerializeToString(&protob)); - return py::bytes(protob); - }); - m.def( - "infer_shapes_and_types_from_map", - [](const std::vector& net_protos, - const std::map> blob_dimensions, - const std::map int_blob_types) { - // Parse protobuffers to NetDefs - std::vector> nets; - std::vector nets_ptr; - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto proto : net_protos) { - std::unique_ptr def(new NetDef()); - CAFFE_ENFORCE(def->ParseFromString(proto)); - nets_ptr.push_back(def.get()); - nets.push_back(std::move(def)); - } - std::map blob_types; - // NOLINTNEXTLINE(performance-for-range-copy) - for (auto blob_type : int_blob_types) { - blob_types[blob_type.first] = - static_cast(blob_type.second); - } - - auto blob_info = InferBlobShapesAndTypesFromMap( - blob_dimensions, blob_types, nets_ptr); - - std::string protob; - CAFFE_ENFORCE(blob_info.SerializeToString(&protob)); - return py::bytes(protob); - }); - 
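The infer_shapes_and_types_from_* bindings back workspace.InferShapesAndTypes on the Python side. A sketch, assuming an FC net with explicitly supplied input dimensions:

    from caffe2.python import core, workspace

    net = core.Net("shapes")
    net.FC(["x", "w", "b"], "y")
    shapes, types = workspace.InferShapesAndTypes(
        [net], blob_dimensions={"x": [4, 8], "w": [16, 8], "b": [16]})
    # FC shape inference gives shapes["y"] == [4, 16] for these dimensions.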
m.def("ssa_rewrite", [](const py::bytes& net_proto) { - auto net_def = std::make_unique(); - CAFFE_ENFORCE(net_def->ParseFromString(net_proto)); - onnx::SsaRewrite(nullptr, net_def.get()); - std::string output_net_proto; - CAFFE_ENFORCE(net_def->SerializeToString(&output_net_proto)); - return py::bytes(output_net_proto); - }); - m.def("create_blob", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - CAFFE_ENFORCE(workspace->CreateBlob(name)); - return true; - }); - m.def("reset_blob", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* b = workspace->GetBlob(name); - CAFFE_ENFORCE(b); - b->Reset(); - }); - m.def("fetch_blob", [](const std::string& name) -> py::object { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - return python_detail::fetchBlob(workspace, name); - }); - m.def( - "feed_blob", - [](const std::string& name, py::object arg, py::object device_option) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - auto* blob = workspace->CreateBlob(name); - return python_detail::feedBlob(blob, arg, device_option); - }, - "", - py::arg("name"), - py::arg("arg"), - py::arg("device_option") = py::none()); - m.def("deserialize_blob", [](const string& content) { - return python_detail::deserializeBlob(content); - }); - m.def("serialize_blob", [](const std::string& name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* blob = workspace->GetBlob(name); - CAFFE_ENFORCE(blob); - return py::bytes(SerializeBlob(*blob, name)); - }); - m.def( - "deserialize_blob", - [](const std::string& name, const py::bytes& serialized) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* blob = workspace->CreateBlob(name); - DeserializeBlob(serialized.cast(), blob); - }); - - // we support 2 possible signatures of python op: (inputs, outputs) or - // (inputs, outputs, workspace) - m.def( - "register_python_op", - [](py::object func, bool pass_workspace, std::string name) { - using namespace python_detail; - CAFFE_ENFORCE(!func.is_none()); - if (!name.empty()) { - name += ":"; - } - name += func.attr("__name__").cast(); - std::string token = name; - for (int i = 1; gRegistry().count(token) > 0; ++i) { - token = name + ":" + to_string(i); - } - gRegistry()[token] = Func{func, pass_workspace}; - return token; - }); - m.def( - "register_python_gradient_op", - [](const std::string& token, py::object func) { - using namespace python_detail; - CAFFE_ENFORCE(!func.is_none()); - CAFFE_ENFORCE(gRegistry().find(token) != gRegistry().end()); - // For global sanity gradient ops shouldn't access workspace - gRegistry()[token + "_gradient"] = Func{func, false}; - }); - m.def("infer_op_input_output_device", [](const py::bytes& op) { - std::unique_ptr def(new caffe2::OperatorDef()); - CAFFE_ENFORCE(def.get()->ParseFromString(op)); - // device_info is a pair of vector of DeviceOption. - // `first` is for inputs, `second` is for outputs. 
- auto device_info = InferOpInputOutputDevice(*def); - - std::vector in_res; - std::vector out_res; - for (auto& in_dev : device_info.first) { - std::string protob; - CAFFE_ENFORCE(in_dev.SerializeToString(&protob)); - // NOLINTNEXTLINE(modernize-use-emplace) - in_res.push_back(py::bytes(protob)); - } - for (auto& out_dev : device_info.second) { - std::string protob; - CAFFE_ENFORCE(out_dev.SerializeToString(&protob)); - // NOLINTNEXTLINE(modernize-use-emplace) - out_res.push_back(py::bytes(protob)); - } - return std::make_pair(in_res, out_res); - }); - m.def("get_stats", []() { - ExportedStatList stats; - StatRegistry::get().publish(stats); - std::unordered_map stats_map; - for (const auto& stat : stats) { - stats_map[stat.key] = stat.value; - } - return stats_map; - }); - m.def("is_numa_enabled", []() { return IsNUMAEnabled(); }); - m.def("get_num_numa_nodes", []() { return GetNumNUMANodes(); }); - m.def("get_blob_numa_node", [](const std::string& blob_name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* blob = workspace->GetBlob(blob_name); - CAFFE_ENFORCE(blob); - const TensorCPU& tensor = blob->Get(); - const void* raw_data = tensor.raw_data(); - CAFFE_ENFORCE(raw_data); - return GetNUMANode(raw_data); - }); - m.def("get_blob_size_bytes", [](const std::string& blob_name) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - auto* blob = workspace->GetBlob(blob_name); - CAFFE_ENFORCE(blob); - return BlobStat::sizeBytes(*blob); - }); - m.def("support_onnx_export", [](const std::string& op) -> bool { - const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(op); - if (!schema) { - return false; - } - return !schema->onnx_schema().empty(); - }); - m.def( - "export_to_onnx", - [](caffe2::onnx::DummyName* dummy, - const py::bytes& c2op, - const std::unordered_map>& shapes) - -> std::pair, std::vector> { - OperatorDef op; - CAFFE_ENFORCE(ParseProtoFromLargeString(c2op.cast(), &op)); - const auto& type = op.type(); - const OpSchema* schema = caffe2::OpSchemaRegistry::Schema(type); - CAFFE_ENFORCE(schema); - std::unordered_map tensor_shapes; - for (const auto& it : shapes) { - tensor_shapes.emplace( - it.first, CreateTensorShape(it.second, TensorProto::FLOAT)); - } - auto results = - onnx::OnnxExporter(dummy).Caffe2OpToOnnxNodes(op, tensor_shapes); - std::pair, std::vector> ret; - auto& nodes_str = ret.first; - auto& tensors_str = ret.second; - for (const auto& node : results.first) { - std::string out; - node.SerializeToString(&out); - nodes_str.emplace_back(py::bytes(out)); - } - for (const auto& tensor : results.second) { - std::string out; - tensor.SerializeToString(&out); - tensors_str.emplace_back(py::bytes(out)); - } - return ret; - }); - -#define CAFFE2_CPU_FEATURE_SUPPORT(feature) \ - m.def("builtin_cpu_supports_" #feature, []() { return GetCpuId().feature(); }) - - CAFFE2_CPU_FEATURE_SUPPORT(avx2); - -#undef CAFFE2_CPU_FEATURE_SUPPORT - m.def("transform_exists", [](const std::string& transform_name) { - return OptimizationPassRegistry()->Has(transform_name); - }); - m.def("workspace_transform_exists", [](const std::string& transform_name) { - return WorkspaceOptimizationPassRegistry()->Has(transform_name); - }); - m.def("run_transform", [](const std::string& transform_name, py::bytes def) { - caffe2::NetDef proto; - CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast(), &proto)); - auto nn = caffe2::convertToNNModule(proto); - auto pass = OptimizationPassRegistry()->Create(transform_name, 
&nn); - - CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name); - pass->run(); - - auto new_proto = caffe2::convertToCaffe2Proto(nn, proto); - std::string out; - new_proto.SerializeToString(&out); - return py::bytes(out); - }); - m.def( - "create_offline_tensor", - [](const std::string& name, - const std::vector& dims, - int datatype) -> bool { - Workspace* curr_ws = GetCurrentWorkspace(); - auto* b = curr_ws->CreateBlob(name); - auto* offline = b->GetMutable(); - CAFFE_ENFORCE(offline); - offline->setShapeAndType( - dims, - CPU, - DataTypeToTypeMeta(static_cast(datatype))); - return true; - }); - m.def( - "onnxifi_set_option", - [](const std::string& optionName, - const std::string& optionValue) -> bool { - OnnxifiOptionHelper ts; - return ts.setOnnxifiOption(optionName, optionValue); - }); - m.def("onnxifi_get_option", [](const std::string& optionName) -> std::string { - OnnxifiOptionHelper ts; - return ts.getOnnxifiOption(optionName); - }); - m.def( - "onnxifi", - [](const py::bytes& pred_net_str, - const py::bytes& shapes_str, - const std::vector& block_list, - const std::vector& weight_names, - int max_batch_size, - int max_seq_size, - int timeout, - bool adjust_batch, - bool debug_builder, - bool merge_fp32_inputs_into_fp16, - bool net_ssa_rewritten, - bool use_onnx) -> py::bytes { - caffe2::NetDef pred_net; - CAFFE_ENFORCE( - ParseProtoFromLargeString( - pred_net_str.cast(), &pred_net), - "broken pred_net protobuf"); - Workspace* curr_ws = GetCurrentWorkspace(); - CAFFE_ENFORCE(curr_ws); - splitSparseLengthsSumSparse(&pred_net, *curr_ws); - caffe2::TensorBoundShapes tbs; - CAFFE_ENFORCE( - ParseProtoFromLargeString(shapes_str.cast(), &tbs), - "broken TensorBoundShapes protobuf"); - ShapeInfoMap shape_map = caffe2::extractShapeInfoFromTensorBoundShapes( - tbs, max_batch_size, max_seq_size); - OnnxifiTransformerOptions opts; - opts.bound_shape_spec.max_batch_size = max_batch_size; - opts.bound_shape_spec.max_seq_size = max_seq_size; - opts.timeout = timeout; - opts.adjust_batch = adjust_batch; - opts.debug = debug_builder; - opts.merge_fp32_inputs_into_fp16 = merge_fp32_inputs_into_fp16; - opts.predictor_net_ssa_rewritten = net_ssa_rewritten; - opts.use_onnx = use_onnx; - OnnxifiTransformer ts(opts); - std::unordered_set blocklist_set( - block_list.begin(), block_list.end()); - std::vector weight_names_overwrite{}; - if (weight_names.size() == 0) { - weight_names_overwrite = curr_ws->Blobs(); - } else { - weight_names_overwrite = weight_names; - } - ts.transform( - curr_ws, - &pred_net, - weight_names_overwrite, - shape_map, - blocklist_set); - std::string pred_net_str2; - pred_net.SerializeToString(&pred_net_str2); - return py::bytes(pred_net_str2); - }); - m.def( - "run_workspace_transform", - [](const std::string& transform_name, py::bytes def) { - Workspace* workspace = caffe2::python::GetCurrentWorkspace(); - CAFFE_ENFORCE(workspace); - caffe2::NetDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - auto nn = caffe2::convertToNNModule(proto); - auto pass = WorkspaceOptimizationPassRegistry()->Create( - transform_name, &nn, workspace); - - CAFFE_ENFORCE(pass, "Pass doesn't exist: ", transform_name); - pass->run(); - - auto new_proto = caffe2::convertToCaffe2Proto(nn, proto); - std::string out; - new_proto.SerializeToString(&out); - return py::bytes(out); - }); - m.def("fakeFp16FuseOps", [](const py::bytes& net_str) { - caffe2::NetDef netDef; - CAFFE_ENFORCE( - ParseProtoFromLargeString(net_str.cast(), &netDef), - "broken pred_net protobuf"); - 
-    opt::fakeFp16FuseOps(&netDef);
-    std::string out_net;
-    netDef.SerializeToString(&out_net);
-    return py::bytes(out_net);
-  });
-
-  // Transformations are exposed as functions here and wrapped
-  // into a python interface in transformations.py
-  // Prefix the transformation with transform_ to avoid clobbering the
-  // function namespace.
-  m.def("transform_optimizeForMKLDNN", [](py::bytes def, bool training_mode) {
-    caffe2::NetDef proto;
-    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
-
-    auto nn = caffe2::convertToNNModule(proto);
-    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
-    opt::OptimizeForMkldnn(&nn, workspace, training_mode);
-    auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
-
-    std::string out;
-    new_proto.SerializeToString(&out);
-    return py::bytes(out);
-  });
-
-  m.def("transform_addNNPACK", [](py::bytes def) {
-    caffe2::NetDef proto;
-    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
-
-    auto nn = caffe2::convertToNNModule(proto);
-    opt::addNNPACK(&nn);
-    auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
-
-    std::string out;
-    new_proto.SerializeToString(&out);
-    return py::bytes(out);
-  });
-
-  m.def("transform_fuseConvBN", [](py::bytes def) {
-    Workspace* workspace = caffe2::python::GetCurrentWorkspace();
-    CAFFE_ENFORCE(workspace);
-    caffe2::NetDef proto;
-    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
-
-    auto nn = caffe2::convertToNNModule(proto);
-    opt::fuseConvBN(&nn, workspace);
-    auto new_proto = caffe2::convertToCaffe2Proto(nn);
-
-    std::string out;
-    new_proto.SerializeToString(&out);
-    return py::bytes(out);
-  });
-
-  m.def("transform_fuseNNPACKConvRelu", [](py::bytes def) {
-    caffe2::NetDef proto;
-    CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast<std::string>(), &proto));
-
-    auto nn = caffe2::convertToNNModule(proto);
-    opt::fuseNNPACKConvRelu(&nn);
-    auto new_proto = caffe2::convertToCaffe2Proto(nn, proto);
-
-    std::string out;
-    new_proto.SerializeToString(&out);
-    return py::bytes(out);
-  });
-
-  auto initialize = [&]() {
-    // Initialization of the module
-#ifdef USE_NUMPY
-    ([]() -> void {
-      // import_array1() forces a void return value.
-      import_array1();
-    })();
-#endif // USE_NUMPY
-    // Single threaded, so safe
-    static bool initialized = false;
-    if (initialized) {
-      return;
-    }
-    // We will create a default workspace for us to run stuff.
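The initialize lambda whose body continues just below combines two idioms: importing the NumPy C API inside an immediately invoked void lambda (so import_array1's early return is legal) and a static run-once guard before creating the "default" workspace. A minimal standalone sketch of the same pattern, assuming pybind11; the module name my_ext and init_once are hypothetical:

    #include <pybind11/pybind11.h>

    namespace py = pybind11;

    static void init_once() {
      // A function-local static runs exactly once; pybind11 module
      // initialization is single threaded, so no locking is needed.
      static bool initialized = false;
      if (initialized) {
        return;
      }
      // ... e.g. NumPy C-API import and default-workspace creation here ...
      initialized = true;
    }

    PYBIND11_MODULE(my_ext, m) {
      m.doc() = "toy module demonstrating run-once initialization";
      init_once();
      m.def("ready", []() { return true; });
    }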
- caffe2::python::SwitchWorkspaceInternal("default", true); - initialized = true; - }; - - initialize(); -}; - -PYBIND11_MODULE(caffe2_pybind11_state, m) { - m.doc() = "pybind11 stateful interface to Caffe2 workspaces"; - - C10_LOG_API_USAGE_ONCE("caffe2.python.import"); - - addGlobalMethods(m); - addObjectMethods(m); - for (const auto& addition : PybindAdditionRegistry()->Keys()) { - PybindAdditionRegistry()->Create(addition, m); - } -} - -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state.h b/caffe2/python/pybind_state.h deleted file mode 100644 index 37292858d5b0..000000000000 --- a/caffe2/python/pybind_state.h +++ /dev/null @@ -1,467 +0,0 @@ -#pragma once - -#include - -#include "caffe2/core/context.h" -#include "caffe2/core/init.h" -#include "caffe2/core/logging.h" -#include "caffe2/core/memonger.h" -#include "caffe2/core/net.h" -#include "caffe2/core/operator.h" -#include "caffe2/core/scope_guard.h" -#include "caffe2/core/tensor.h" -#include "caffe2/core/types.h" -#include "caffe2/core/workspace.h" -#include "caffe2/proto/caffe2_pb.h" -#include "caffe2/python/pybind_state_dlpack.h" -#include "caffe2/python/pybind_workspace.h" - -#include -#include - -#include - -#ifdef USE_NUMPY - -#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION -#define PY_ARRAY_UNIQUE_SYMBOL caffe2_python_ARRAY_API -#include - -// Temporary solution for numpy < 1.7 versions: old macro, no promises. -// You're strongly advised to upgrade to >= 1.7. -#ifndef NPY_ARRAY_C_CONTIGUOUS -#define NPY_ARRAY_C_CONTIGUOUS NPY_C_CONTIGUOUS -#define PyArray_SetBaseObject(arr, x) (PyArray_BASE(arr) = (x)) -#endif - -#else - -struct PyArrayObject; // Forward declaring PyArrayObject for safety - -#endif // USE_NUMPY - -namespace caffe2 { -namespace python { - -namespace py = pybind11; - -// Add methods common to both CPU and GPU mode. -void addGlobalMethods(pybind11::module& m); -// Expose Workspace, Net, Blob -void addObjectMethods(pybind11::module& m); - -// Get current workspace -Workspace* GetCurrentWorkspace(); - -// Get workspace by name. Returns nullptr if none exists by name. -Workspace* GetWorkspaceByName(const std::string& name); - -class BlobFeederBase { - public: - virtual ~BlobFeederBase(); - virtual void Feed( - const DeviceOption& option, - PyArrayObject* array, - Blob* blob, - bool in_place = false) = 0; -}; - -C10_DECLARE_TYPED_REGISTRY( - BlobFeederRegistry, - DeviceType, - BlobFeederBase, - std::unique_ptr); -#define REGISTER_BLOB_FEEDER(device_type, ...) 
\ - C10_REGISTER_TYPED_CLASS(BlobFeederRegistry, device_type, __VA_ARGS__) -inline unique_ptr CreateFeeder(int device_type) { - return BlobFeederRegistry()->Create( - caffe2::ProtoToType(static_cast(device_type))); -} - -static_assert( - sizeof(int) == sizeof(int32_t), - "We make an assumption that int is always int32 for numpy " - "type mapping."); - -int CaffeToNumpyType(const TypeMeta dtype); -const TypeMeta NumpyTypeToCaffe(int numpy_type); - -class TensorFetcher : public BlobFetcherBase { - public: - pybind11::object Fetch(const Blob& blob) override { - return FetchTensor(blob.Get(), true).obj; - } - - // Checks whether the data with type `dtype` needs to be copied in the context - // of `tensor` - bool NeedsCopy(const Tensor* tensor, const TypeMeta dtype) const { -#ifdef USE_NUMPY - return tensor->GetDeviceType() != CPU || - CaffeToNumpyType(dtype) == NPY_OBJECT; -#else - return tensor->GetDeviceType() != CPU; -#endif // USE_NUMPY - } - - FetchedBlob FetchTensor(const Tensor& tensor, bool force_copy) { -#ifdef USE_NUMPY - FetchedBlob result; - CAFFE_ENFORCE_GE(tensor.numel(), 0, "Trying to fetch uninitialized tensor"); - const int numpy_type = CaffeToNumpyType(tensor.dtype()); - CAFFE_ENFORCE( - numpy_type != -1, - "This tensor's data type is not supported: ", - tensor.dtype().name(), - "."); - std::vector npy_dims; - for (const auto dim : tensor.sizes()) { - npy_dims.push_back(dim); - } - result.copied = force_copy || NeedsCopy(&tensor, tensor.dtype()); - void* outPtr; - if (result.copied) { - result.obj = py::reinterpret_steal( - PyArray_SimpleNew(tensor.dim(), npy_dims.data(), numpy_type)); - outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); - } else { - outPtr = const_cast(tensor).raw_mutable_data(); - result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( - tensor.dim(), npy_dims.data(), numpy_type, outPtr)); - } - - if (numpy_type == NPY_OBJECT) { - PyObject** outObj = reinterpret_cast(outPtr); - auto* str = tensor.template data(); - for (const auto i : c10::irange(tensor.numel())) { - outObj[i] = PyBytes_FromStringAndSize(str->data(), str->size()); - str++; - // cleanup on failure - if (outObj[i] == nullptr) { - for (const auto j : c10::irange(i)) { - Py_DECREF(outObj[j]); - } - CAFFE_THROW("Failed to allocate string for ndarray of strings."); - } - } - return result; - } - - if (result.copied) { - // TODO: use CUDAGuard here instead of context and employ explicit sync - // copy - auto context = CreateContext(tensor.GetDeviceType()); - context->CopyBytesToCPU(tensor.nbytes(), tensor.raw_data(), outPtr); - context->FinishDeviceComputation(); - } - return result; -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - } -}; - -template -class TensorFeeder : public BlobFeederBase { - public: - Tensor FeedTensor(const DeviceOption& option, PyArrayObject* original_array) { - Tensor out; - FeedTensor(option, original_array, &out, false); - return out; - } - - void FeedTensor( - const DeviceOption& option, - PyArrayObject* original_array, - Tensor* out, - bool in_place) { -#ifdef USE_NUMPY - PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() { Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta dtype = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE( - dtype != ScalarType::Undefined, - "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); - Context context(option); - context.SwitchToDevice(); - // numpy requires long int as its 
dims. - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); - std::vector dims; - for (const auto i : c10::irange(ndim)) { - dims.push_back(npy_dims[i]); - } - - Tensor& tensor = *out; - if (in_place) { - tensor.Resize(dims); - } - // Now, copy the data to the tensor. - switch (npy_type) { - case NPY_OBJECT: { - PyObject** input = reinterpret_cast(PyArray_DATA(array)); - if (!in_place) { - tensor = caffe2::empty( - dims, at::dtype().device(Context::GetDeviceType())); - } - auto* outPtr = tensor.template mutable_data(); - for (const auto i : c10::irange(tensor.numel())) { - char* str; - Py_ssize_t strSize; - if (PyBytes_Check(input[i])) { - CAFFE_ENFORCE( - PyBytes_AsStringAndSize(input[i], &str, &strSize) != -1, - "Had a PyBytes object but cannot convert it to a string."); - } else if (PyUnicode_Check(input[i])) { // string - str = - const_cast(PyUnicode_AsUTF8AndSize(input[i], &strSize)); - CAFFE_ENFORCE( - str, - "Had a PyUnicode object but cannot convert it to a string."); - } else { - CAFFE_THROW("Unsupported python object type passed into ndarray."); - } - outPtr[i] = std::string(str, strSize); - } - break; - } - case NPY_UNICODE: - CAFFE_THROW( - "You are feeding in a numpy array of unicode. Caffe2 C++ does not " - "support unicode yet. Please ensure that you are passing in bytes " - "instead of unicode strings."); - break; - default: - if (!in_place) { - tensor = caffe2::empty( - dims, at::dtype(dtype).device(Context::GetDeviceType())); - } else { - tensor.raw_mutable_data(dtype); - } - context.CopyBytesFromCPU( - tensor.numel() * dtype.itemsize(), - static_cast(PyArray_DATA(array)), - tensor.raw_mutable_data()); - } - context.FinishDeviceComputation(); -#else - CAFFE_THROW("Caffe2 compiled without NumPy support."); -#endif // USE_NUMPY - } - - virtual void Feed( - const DeviceOption& option, - PyArrayObject* original_array, - Blob* blob, - bool in_place) override { - if (in_place) { - FeedTensor( - option, - original_array, - BlobGetMutableTensor(blob, OptionToDevice(option).type()), - true); - } else { - blob->Reset(new Tensor(FeedTensor(option, original_array))); - } - } -}; - -namespace python_detail { -struct Func { - py::object py_func; - bool needs_workspace; -}; - -const Func& getOpFunc(const std::string& token); - -const Func& getGradientFunc(const std::string& token); - -} // namespace python_detail - -// TODO: Remove template? 
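PythonOpBase, defined below, rebuilds its Python callable from a pickled (function, args, kwargs) triple via the pickle module, retrying with encoding="latin1" for payloads pickled under Python 2. A minimal sketch of that unpickling pattern in isolation; rebuild_callable and the payload are illustrative, not part of the original API:

    #include <pybind11/pybind11.h>
    #include <string>

    namespace py = pybind11;

    py::object rebuild_callable(const std::string& pickled) {
      py::gil_scoped_acquire gil; // all Python API calls require the GIL
      auto loads = py::module_::import("pickle").attr("loads");
      // The real code also retries loads(..., encoding="latin1") on failure.
      py::tuple builder_call = loads(py::bytes(pickled)).cast<py::tuple>();
      py::object func = builder_call[0];
      py::tuple args = builder_call[1].cast<py::tuple>();
      py::dict kwargs = builder_call[2].cast<py::dict>();
      return func(*args, **kwargs); // invoke the builder to get the callable
    }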
-template -class PythonOpBase : public Operator { - public: - USE_OPERATOR_CONTEXT_FUNCTIONS; - - PythonOpBase( - const OperatorDef& operator_def, - Workspace* ws, - const std::string& pickled_builder_arg_name) - : Operator(operator_def, ws), - ws_(ws), - token_(OperatorBase::template GetSingleArgument( - "token", - "")) { - using namespace python_detail; - auto pickled = OperatorBase::template GetSingleArgument( - pickled_builder_arg_name, ""); - CAFFE_ENFORCE( - !pickled.empty() || !token_.empty(), - "PythonOp requires either pickled_builder or token arg."); - if (!pickled.empty()) { - py::gil_scoped_acquire g; - try { - auto pickle = - py::reinterpret_steal(PyImport_ImportModule("pickle")); - - CAFFE_ENFORCE(pickle); - auto loads = pickle.attr("loads").cast(); - CAFFE_ENFORCE(loads); - py::tuple builder_call; - try { - builder_call = loads(py::bytes(pickled)).cast(); - } catch (const py::error_already_set& e) { - LOG(INFO) << "Cannot unpickle python operator: " << e.what(); - LOG(INFO) << "Try latin1 encoding for python3 run"; - // to use the `_a` literal for arguments - using namespace pybind11::literals; - builder_call = loads(py::bytes(pickled), "encoding"_a = "latin1") - .template cast(); - } - CAFFE_ENFORCE(builder_call); - CAFFE_ENFORCE_EQ(py::len(builder_call), 3); - auto func = builder_call[0].cast(); - auto args = builder_call[1].cast(); - auto kwargs = builder_call[2].cast(); - auto built_func = func(*args, **kwargs); - CAFFE_ENFORCE(built_func); - built_func_.reset(new Func{ - built_func, - OperatorBase::template GetSingleArgument( - "pass_workspace", false)}); - } catch (const py::error_already_set& e) { - LOG(ERROR) << "Python exception encountered while creating PythonOp: " - << e.what(); - // Rethrow exception to preserve python exception type. - throw; - } - } - } - - bool RunOnDevice() override final { - auto* pyFunc = built_func_ ? built_func_.get() : &getFunc(token_); - CAFFE_ENFORCE(pyFunc); - { - // Acquire GIL for call to Python runtime. - py::gil_scoped_acquire g; - - DeviceOption cpu_option; - cpu_option.set_device_type(PROTO_CPU); - - std::vector inputs; - inputs.reserve(InputSize()); - for (const auto i : c10::irange(InputSize())) { - const auto* blob = &InputBlob(i); - // Allow CPU tensors in addition to operator context's tensors - py::object py_obj; - CAFFE_ENFORCE( - BlobIsTensorType(*blob, CPU), - "We only allow input blob to be CPU Tensor"); - if (use_dlpack) { - DLPackWrapper wrapper( - const_cast(&(BlobGetTensor(*blob, CPU))), cpu_option); - // copy wrapper - py_obj = py::cast(wrapper, py::return_value_policy::copy); - } else { - py_obj = py::cast( - &(BlobGetTensor(*blob, CPU)), py::return_value_policy::reference); - } - inputs.push_back(py_obj); - } - std::vector outputs; - outputs.reserve(OutputSize()); - for (const auto i : c10::irange(OutputSize())) { - auto* blob = OutputBlob(i); - - // Python op is always used with CPUContext only and treats inputs and - // outputs as CPU tensors, CUDA version of PythonOp is implemented - // through GPUFallbackOp that copies input CUDA blobs to CPU and copies - // outputs from CUDA to CPU. - // GPUFallbackOp also allows keeping some of the output blobs on CPU - // by specifying their indices explicitly in template parameters. - - // PythonDLPack op allows working CPU blobs only through DLPack tensors. - // We don't have use cases of CUDA version yet, but if there is such use - // case, we can use GPUFallbackOp to enable it. 
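The control flow around this point in RunOnDevice (acquire the GIL, wrap each blob as a Python object, invoke the stored callable, rethrow Python errors so their type is preserved) reduces to a few lines. A simplified sketch, where run_python_op and its parameters are stand-ins for the real operator state:

    #include <pybind11/pybind11.h>
    #include <pybind11/stl.h>
    #include <vector>

    namespace py = pybind11;

    bool run_python_op(py::object& py_func,
                       std::vector<py::object>& inputs,
                       std::vector<py::object>& outputs) {
      py::gil_scoped_acquire gil;
      try {
        // pybind11/stl.h converts the vectors to Python lists on the call.
        py_func(inputs, outputs);
      } catch (const py::error_already_set&) {
        // Rethrow to preserve the Python exception type, as the real op does.
        throw;
      }
      return true;
    }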
- - py::object py_obj; - if (use_dlpack) { - DLPackWrapper wrapper( - BlobGetMutableTensor(blob, CPU), cpu_option); - py_obj = py::cast(wrapper, py::return_value_policy::copy); - } else { - py_obj = py::cast( - BlobGetMutableTensor(blob, CPU), - py::return_value_policy::reference); - } - outputs.push_back(py_obj); - } - - try { - if (pyFunc->needs_workspace) { - pyFunc->py_func(inputs, outputs, ws_); - } else { - pyFunc->py_func(inputs, outputs); - } - } catch (const py::error_already_set& e) { - LOG(ERROR) << "Exception encountered running PythonOp function: " - << e.what(); - // Rethrow exception to preserve python exception type. - throw; - } - } - return true; - } - - virtual ~PythonOpBase() { - if (built_func_) { - // since it may trigger python interpreter when refcount reaches zero - py::gil_scoped_acquire g; - built_func_.reset(); - } - } - - protected: - virtual const python_detail::Func& getFunc(const std::string& token) = 0; - Workspace* ws_; - - private: - const std::string token_; - std::unique_ptr built_func_; -}; - -template -class PythonOp : public PythonOpBase { - public: - PythonOp(const OperatorDef& operator_def, Workspace* ws) - : PythonOpBase(operator_def, ws, "pickled_builder") { - } - - protected: - const python_detail::Func& getFunc(const std::string& token) override { - return python_detail::getOpFunc(token); - } -}; - -template -class PythonGradientOp : public PythonOpBase { - public: - PythonGradientOp(const OperatorDef& operator_def, Workspace* ws) - : PythonOpBase( - operator_def, - ws, - "pickled_grad_builder") {} - - protected: - const python_detail::Func& getFunc(const std::string& token) override { - return python_detail::getGradientFunc(token); - } -}; - -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_dlpack.cc b/caffe2/python/pybind_state_dlpack.cc deleted file mode 100644 index 83b856f672a9..000000000000 --- a/caffe2/python/pybind_state_dlpack.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "pybind_state_dlpack.h" - -namespace caffe2 { -namespace python { - -namespace py = pybind11; - -const DLDeviceType* CaffeToDLDeviceType(int device_type) { - static std::map dl_device_type_map{ - {PROTO_CPU, kDLCPU}, - {PROTO_CUDA, kDLCUDA}, - }; - const auto it = dl_device_type_map.find(device_type); - return it == dl_device_type_map.end() ? nullptr : &it->second; -} - -const DLDataType* CaffeToDLType(const TypeMeta meta) { - static std::map dl_type_map{ - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{0, 8, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{0, 16, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{0, 32, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{0, 64, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{1, 8, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{1, 16, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{2, 16, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{2, 32, 1}}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {TypeMeta::Id(), DLDataType{2, 64, 1}}, - }; - const auto it = dl_type_map.find(meta.id()); - return it == dl_type_map.end() ? 
nullptr : &it->second; -} - -const TypeMeta DLTypeToCaffe(const DLDataType& dl_type) { - try { - if (dl_type.lanes != 1) { - throw std::invalid_argument("invalid type"); - } - static std::map> dl_caffe_type_map{ - {0, - std::map{ - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {8, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {16, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {32, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {64, TypeMeta::Make()}, - }}, - {1, - std::map{ - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {8, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {16, TypeMeta::Make()}, - }}, - {2, - std::map{ - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {16, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {32, TypeMeta::Make()}, - // NOLINTNEXTLINE(cppcoreguidelines-avoid-magic-numbers) - {64, TypeMeta::Make()}, - }}, - }; - if (!dl_caffe_type_map.count(dl_type.code)) { - throw std::invalid_argument("invalid type"); - } - const auto& bits_map = dl_caffe_type_map.at(dl_type.code); - if (!bits_map.count(dl_type.bits)) { - throw std::invalid_argument("invalid type"); - } - return bits_map.at(dl_type.bits); - } catch (const std::invalid_argument& e) { - CAFFE_THROW( - "Unsupported DLDataType: ", dl_type.code, dl_type.bits, dl_type.lanes); - } -} - -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_dlpack.h b/caffe2/python/pybind_state_dlpack.h deleted file mode 100644 index ab987a2e4da5..000000000000 --- a/caffe2/python/pybind_state_dlpack.h +++ /dev/null @@ -1,135 +0,0 @@ -#pragma once - -#include "caffe2/core/context.h" -#include "caffe2/core/tensor.h" -#include "caffe2/core/types.h" -#include "caffe2/proto/caffe2_pb.h" -#include "caffe2/python/dlpack.h" - -#include -#include - -namespace caffe2 { -namespace python { - -namespace py = pybind11; - -const DLDeviceType* CaffeToDLDeviceType(int device_type); - -const DLDataType* CaffeToDLType(const TypeMeta meta); - -const TypeMeta DLTypeToCaffe(const DLDataType& dl_type); - -// TODO: remove context -template -class DLPackWrapper { - public: - DLPackWrapper(Tensor* tensor, DeviceOption device_option) - : tensor(tensor), device_option(device_option) {} - - py::object data() { - DLDevice tensor_context; - auto device_type_ptr = CaffeToDLDeviceType(device_option.device_type()); - CAFFE_ENFORCE( - device_type_ptr, - "Unsupported device type: ", - device_option.device_type()); - tensor_context.device_type = *device_type_ptr; - tensor_context.device_id = device_option.device_id(); - - if (tensor->numel() <= 0) { - tensor->Resize(0); - } - if (tensor->dtype() == ScalarType::Undefined) { - // treat uninitialized tensor as float tensor - tensor->template mutable_data(); - } - CAFFE_ENFORCE_GT(tensor->dim(), 0); - - auto type_ptr = CaffeToDLType(tensor->dtype()); - CAFFE_ENFORCE( - type_ptr, - "Tensor type is not supported in DLPack: ", - tensor->dtype().name()); - DLDataType tensor_type = *type_ptr; - - DLTensor dlTensor; - dlTensor.data = const_cast(tensor->raw_data()); - dlTensor.device = tensor_context; - dlTensor.ndim = tensor->dim(); - dlTensor.dtype = tensor_type; - dlTensor.shape = const_cast(&(tensor->sizes()[0])); - dlTensor.strides = nullptr; - dlTensor.byte_offset = 0; - - managed_tensor.dl_tensor = dlTensor; - // C2 Tensor memory is managed by C2 - managed_tensor.manager_ctx = nullptr; - 
-    managed_tensor.deleter = [](DLManagedTensor*) {};
-
-    return py::reinterpret_steal<py::object>(
-        PyCapsule_New(&managed_tensor, "dltensor", nullptr));
-  }
-
-  void feed(py::object obj) {
-    CAFFE_ENFORCE(PyCapsule_CheckExact(obj.ptr()), "Expected DLPack capsule");
-    DLManagedTensor* dlMTensor =
-        (DLManagedTensor*)PyCapsule_GetPointer(obj.ptr(), "dltensor");
-    CAFFE_ENFORCE(dlMTensor, "Invalid DLPack capsule");
-    DLTensor* dlTensor = &dlMTensor->dl_tensor;
-    auto device_type_ptr = CaffeToDLDeviceType(device_option.device_type());
-    CAFFE_ENFORCE(
-        device_type_ptr,
-        "Unsupported device type: ",
-        device_option.device_type());
-    CAFFE_ENFORCE(
-        dlTensor->device.device_type == *device_type_ptr,
-        "DLPack tensor device type mismatch");
-    int dlpack_device_id = dlTensor->device.device_id;
-    CAFFE_ENFORCE_EQ(
-        dlpack_device_id,
-        device_option.device_id(),
-        "Expected same device id for DLPack and C2 tensors");
-
-    std::vector<int64_t> dims;
-    dims.reserve(dlTensor->ndim);
-    for (int idx = 0; idx < dlTensor->ndim; ++idx) {
-      dims.push_back(dlTensor->shape[idx]);
-    }
-
-    if (dlTensor->strides) {
-      int64_t stride = 1;
-      for (int idx = dims.size() - 1; idx >= 0; --idx) {
-        CAFFE_ENFORCE_EQ(
-            stride,
-            dlTensor->strides[idx],
-            "Tensors with non-standard strides are not supported");
-        stride *= dims[idx];
-      }
-    }
-
-    tensor->Resize(dims);
-    caffe2::TypeMeta meta = DLTypeToCaffe(dlTensor->dtype);
-    at::Device device = at::Device(tensor->GetDeviceType());
-    tensor->ShareExternalPointer(
-        at::DataPtr(
-            (void*)(((int8_t*)dlTensor->data) + dlTensor->byte_offset),
-            static_cast<void*>(dlMTensor),
-            [](void* t_ptr) -> void {
-              DLManagedTensor* mt_ptr = static_cast<DLManagedTensor*>(t_ptr);
-              if (mt_ptr->deleter) {
-                mt_ptr->deleter(mt_ptr);
-              }
-            },
-            device),
-        meta,
-        0);
-  }
-
-  Tensor* tensor;
-  DeviceOption device_option;
-  DLManagedTensor managed_tensor;
-};
-
-} // namespace python
-} // namespace caffe2
diff --git a/caffe2/python/pybind_state_gpu.cc b/caffe2/python/pybind_state_gpu.cc
deleted file mode 100644
index f46493d409f8..000000000000
--- a/caffe2/python/pybind_state_gpu.cc
+++ /dev/null
@@ -1,146 +0,0 @@
-// Note(jiayq): the import_array function is done inside
-// caffe2_python.cc. Read
-// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
-// for more details.
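The note above refers to NumPy's split C-API convention: exactly one translation unit in an extension performs the import, and every other file defines NO_IMPORT_ARRAY while sharing the same PY_ARRAY_UNIQUE_SYMBOL so they all use the one imported function table. A sketch of the importing side, with an illustrative symbol and function name:

    #include <Python.h>
    #define PY_ARRAY_UNIQUE_SYMBOL my_ext_ARRAY_API
    // Every other .cc in the same extension additionally defines:
    //   #define NO_IMPORT_ARRAY
    // before including the header below.
    #include <numpy/arrayobject.h>

    static bool import_numpy_api() {
      // import_array1(x) expands to "if (_import_array() < 0) return x;",
      // so it may only appear in a function with a matching return type.
      import_array1(false);
      return true;
    }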
- -#define NO_IMPORT_ARRAY - -#include "pybind_state.h" - -#include -#include - -#ifdef CAFFE2_USE_CUDNN -#include "caffe2/core/common_cudnn.h" -#endif // CAFFE2_USE_CUDNN -#include -#include "caffe2/core/context_gpu.h" -#include "caffe2/operators/operator_fallback_gpu.h" -#include "caffe2/python/pybind_state_registry.h" - -namespace caffe2 { -namespace python { - -REGISTER_CUDA_OPERATOR(Python, GPUFallbackOp); -REGISTER_CUDA_OPERATOR(PythonGradient, GPUFallbackOp); - -REGISTER_CUDA_OPERATOR(PythonDLPack, GPUFallbackOp); -REGISTER_CUDA_OPERATOR(PythonDLPackGradient, GPUFallbackOp); - -REGISTER_BLOB_FEEDER(CUDA, TensorFeeder); - -namespace py = pybind11; - -void addCUDAGlobalMethods(py::module& m) { - m.def("num_cuda_devices", &NumCudaDevices); - m.def("get_cuda_version", &CudaVersion); -#ifdef CAFFE2_USE_CUDNN - m.def("get_cudnn_version", &cudnnCompiledVersion); - m.attr("cudnn_convolution_fwd_algo_count") = - py::int_((int)CUDNN_CONVOLUTION_FWD_ALGO_COUNT); - m.attr("cudnn_convolution_bwd_data_algo_count") = - py::int_((int)CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT); - m.attr("cudnn_convolution_bwd_filter_algo_count") = - py::int_((int)CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT); -#else - m.def("get_cudnn_version", []() { return static_cast(0); }); - m.attr("cudnn_convolution_fwd_algo_count") = py::int_(0); - m.attr("cudnn_convolution_bwd_data_algo_count") = py::int_(0); - m.attr("cudnn_convolution_bwd_filter_algo_count") = py::int_(0); -#endif - m.def("get_gpu_memory_info", [](int device_id) { - CUDAGuard guard(device_id); - size_t device_free, device_total; - CUDA_CHECK(cudaMemGetInfo(&device_free, &device_total)); - return std::pair{device_free, device_total}; - }); - m.def("get_cuda_peer_access_pattern", []() { - std::vector> pattern; - CAFFE_ENFORCE(caffe2::GetCudaPeerAccessPattern(&pattern)); - return pattern; - }); - m.def("get_device_properties", [](int deviceid) { - auto& prop = GetDeviceProperty(deviceid); - std::map obj; - obj["name"] = py::cast(prop.name); - obj["major"] = py::cast(prop.major); - obj["minor"] = py::cast(prop.minor); - obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem); - return obj; - }); - m.def( - "onnx_to_trt_op", - [](const py::bytes& onnx_model_str, - const std::unordered_map>& - output_size_hints, - int max_batch_size, - int max_workspace_size, - int verbosity, - bool debug_builder) -> py::bytes { -#ifdef CAFFE2_USE_TRT -#else - CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1"); -#endif // CAFFE2_USE_TRT - }); - m.def( - "transform_trt", - [](const py::bytes& pred_net_str, - const std::unordered_map>& shapes, - int max_batch_size, - int max_workspace_size, - int verbosity, - bool debug_builder, - bool build_serializable_op) -> py::bytes { -#ifdef CAFFE2_USE_TRT -#else - CAFFE_THROW("Please build Caffe2 with USE_TENSORRT=1"); -#endif // CAFFE2_USE_TRT - }); -}; - -void addCUDAObjectMethods(py::module& m) { - py::class_>(m, "DLPackTensorCUDA") - .def_property_readonly( - "data", - [](DLPackWrapper* t) -> py::object { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_CUDA, - "Expected CUDA device option for CUDA tensor"); - - return t->data(); - }, - "Return DLPack tensor with tensor's data.") - .def( - "feed", - [](DLPackWrapper* t, py::object obj) { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_CUDA, - "Expected CUDA device option for CUDA tensor"); - t->feed(obj); - }, - "Copy data from given DLPack tensor into this tensor.") - .def_property_readonly( - "_shape", - [](const DLPackWrapper& t) { return t.tensor->sizes(); }) - 
.def( - "_reshape", - [](DLPackWrapper* t, std::vector dims) { - t->tensor->Resize(dims); - }); -} - -PYBIND11_MODULE(caffe2_pybind11_state_gpu, m) { - m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition"; - - addGlobalMethods(m); - addCUDAGlobalMethods(m); - addObjectMethods(m); - addCUDAObjectMethods(m); - for (const auto& addition : PybindAdditionRegistry()->Keys()) { - PybindAdditionRegistry()->Create(addition, m); - } -} -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_hip.cc b/caffe2/python/pybind_state_hip.cc deleted file mode 100644 index bc121cb1767c..000000000000 --- a/caffe2/python/pybind_state_hip.cc +++ /dev/null @@ -1,98 +0,0 @@ -#define NO_IMPORT_ARRAY - -#include "pybind_state.h" - -#include -#include - -#include -#include "caffe2/core/hip/common_miopen.h" -#include "caffe2/core/hip/context_gpu.h" -#include "caffe2/operators/hip/operator_fallback_gpu.h" -#include "caffe2/python/pybind_state_registry.h" - -namespace caffe2 { -namespace python { - -REGISTER_HIP_OPERATOR(Python, GPUFallbackOp); -REGISTER_HIP_OPERATOR(PythonGradient, GPUFallbackOp); - -REGISTER_HIP_OPERATOR(PythonDLPack, GPUFallbackOp); -REGISTER_HIP_OPERATOR(PythonDLPackGradient, GPUFallbackOp); - -REGISTER_BLOB_FEEDER(HIP, TensorFeeder); - -namespace py = pybind11; - -void addHIPGlobalMethods(py::module& m) { - m.def("num_hip_devices", &NumHipDevices); - m.def("get_hip_version", &HipVersion); - m.def("get_miopen_version", &miopenCompiledVersion); - m.def("get_gpu_memory_info", [](int device_id) { - HIPGuard guard(device_id); - size_t device_free, device_total; - HIP_CHECK(hipMemGetInfo(&device_free, &device_total)); - return std::pair{device_free, device_total}; - }); - m.def("get_hip_peer_access_pattern", []() { - std::vector> pattern; - CAFFE_ENFORCE(caffe2::GetHipPeerAccessPattern(&pattern)); - return pattern; - }); - m.def("get_device_properties", [](int deviceid) { - auto& prop = GetDeviceProperty(deviceid); - std::map obj; - obj["name"] = py::cast(prop.name); - obj["major"] = py::cast(prop.major); - obj["minor"] = py::cast(prop.minor); - obj["totalGlobalMem"] = py::cast(prop.totalGlobalMem); - return obj; - }); -}; - -void addHIPObjectMethods(py::module& m) { - py::class_>(m, "DLPackTensorHIP") - .def_property_readonly( - "data", - [](DLPackWrapper* t) -> py::object { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_HIP, - "Expected HIP device option for HIP tensor"); - - return t->data(); - }, - "Return DLPack tensor with tensor's data.") - .def( - "feed", - [](DLPackWrapper* t, py::object obj) { - CAFFE_ENFORCE_EQ( - t->device_option.device_type(), - PROTO_HIP, - "Expected HIP device option for HIP tensor"); - t->feed(obj); - }, - "Copy data from given DLPack tensor into this tensor.") - .def_property_readonly( - "_shape", - [](const DLPackWrapper& t) { return t.tensor->sizes(); }) - .def( - "_reshape", - [](DLPackWrapper* t, std::vector dims) { - t->tensor->Resize(dims); - }); -} - -PYBIND11_MODULE(caffe2_pybind11_state_hip, m) { - m.doc() = "pybind11 stateful interface to Caffe2 workspaces - GPU edition"; - - addGlobalMethods(m); - addHIPGlobalMethods(m); - addObjectMethods(m); - addHIPObjectMethods(m); - for (const auto& addition : PybindAdditionRegistry()->Keys()) { - PybindAdditionRegistry()->Create(addition, m); - } -} -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_ideep.cc b/caffe2/python/pybind_state_ideep.cc deleted file mode 100644 index f93524b2f9d3..000000000000 --- 
a/caffe2/python/pybind_state_ideep.cc +++ /dev/null @@ -1,222 +0,0 @@ -// Note(jiayq): the import_array function is done inside -// caffe2_python.cc. Read -// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous -// for more details. -#define NO_IMPORT_ARRAY - -#include "pybind_state.h" - -#include -#include - -#include -#include "caffe2/ideep/operators/operator_fallback_ideep.h" - -namespace caffe2 { -namespace python { - -USE_IDEEP_DEF_ALIASES(); - -class IDeepFetcher; -class IDeepFeeder; - -REGISTER_IDEEP_OPERATOR(Python, IDEEPFallbackOp>); - -REGISTER_BLOB_FETCHER((TypeMeta::Id()), IDeepFetcher); -REGISTER_BLOB_FEEDER(IDEEP, IDeepFeeder); - -class IDeepFetcher : public BlobFetcherBase { - TypeMeta type_transform(const itensor& atensor) { - switch (atensor.get_data_type()) { - case itensor::data_type::f32: - return TypeMeta::Make(); - case itensor::data_type::s32: - return TypeMeta::Make(); - case itensor::data_type::s8: - return TypeMeta::Make(); - case itensor::data_type::u8: - return TypeMeta::Make(); - default: - // Should we throw exception? - return TypeMeta(); - } - } - - public: - pybind11::object Fetch(const Blob& blob) override { - try { - return FetchTensor(blob.Get(), true).obj; - } catch (ideep::error& e) { - LOG(ERROR) << "IDEEP error: " << e.message; - throw; - } - } - - FetchedBlob FetchTensor(const itensor& atensor, bool force_copy) { -#ifdef USE_NUMPY - FetchedBlob result; - CAFFE_ENFORCE( - (atensor.ndims() != 0) && - (atensor.get_nelems() == 0 || atensor.get_data_handle() != nullptr), - "Trying to fetch uninitialized tensor"); - // NOTE: Only support float so far. - const int numpy_type = NPY_FLOAT; - CAFFE_ENFORCE( - numpy_type != -1, - "Unsupported ideep memory data type? This usually should not happen " - "since ideep memory usually only do float and double."); - itensor::dims dims; - bool need_reorder = atensor.need_reorder(); - if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { - // For FP32 path, only support NCHW format input, so if atensor - // has NHWC format, we need reorder it to NCHW format. 
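IDeepFetcher::FetchTensor, continued below, makes the same copy-versus-alias choice as the CPU fetcher: allocate a fresh ndarray and copy when a copy or reorder is forced, otherwise return a zero-copy view of the tensor's buffer. A reduced sketch of that choice using the NumPy C API; to_float_ndarray and must_copy are stand-ins, and a successful NumPy C-API import is assumed:

    #include <Python.h>
    #define NO_IMPORT_ARRAY
    #define PY_ARRAY_UNIQUE_SYMBOL my_ext_ARRAY_API
    #include <numpy/arrayobject.h>
    #include <cstring>
    #include <vector>

    // must_copy mirrors "force_copy || need_reorder" in the fetcher.
    PyObject* to_float_ndarray(const void* data, std::vector<npy_intp>& dims,
                               size_t nbytes, bool must_copy) {
      const int nd = static_cast<int>(dims.size());
      if (!must_copy) {
        // Zero-copy: the ndarray aliases the tensor's buffer, so the tensor
        // must outlive the array (which is why the real code copies by default).
        return PyArray_SimpleNewFromData(nd, dims.data(), NPY_FLOAT,
                                         const_cast<void*>(data));
      }
      PyObject* arr = PyArray_SimpleNew(nd, dims.data(), NPY_FLOAT);
      if (arr != nullptr) {
        std::memcpy(PyArray_DATA(reinterpret_cast<PyArrayObject*>(arr)),
                    data, nbytes);
      }
      return arr;
    }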
- dims = atensor.get_dims(); - need_reorder = need_reorder || atensor.get_desc().is_nhwc(); - } else { - dims = atensor.get_public_format_dims(); - } - std::vector npy_dims(dims.begin(), dims.end()); - - result.copied = force_copy || need_reorder; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - void* outPtr; - if (result.copied) { - result.obj = py::reinterpret_steal( - PyArray_SimpleNew(atensor.ndims(), npy_dims.data(), numpy_type)); - outPtr = static_cast( - PyArray_DATA(reinterpret_cast(result.obj.ptr()))); - } else { - outPtr = atensor.get_data_handle(); - result.obj = py::reinterpret_steal(PyArray_SimpleNewFromData( - atensor.ndims(), npy_dims.data(), numpy_type, outPtr)); - } - - if (numpy_type == NPY_OBJECT) { - CAFFE_THROW("We don't support strings."); - } - - if (result.copied) { - if (atensor.get_data_type() == idtype::f32 && !atensor.has_scale()) { - itensor temp_ten(atensor.get_desc().to_default_format(), outPtr); - atensor.reorder_to(temp_ten); - } else { - atensor.to_public(outPtr); - } - } - - return result; -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - } -}; - -class IDeepFeeder : public BlobFeederBase { - itensor::data_type type_transform(const TypeMeta meta) { - if (meta == TypeMeta::Make()) - return itensor::data_type::f32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s32; - else if (meta == TypeMeta::Make()) - return itensor::data_type::s8; - else if (meta == TypeMeta::Make()) - return itensor::data_type::u8; - else - return itensor::data_type::undef; - } - - public: - void FeedTensor( - const DeviceOption& option, - PyArrayObject* original_array, - itensor* tensor) { -#ifdef USE_NUMPY - PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() { Py_XDECREF(array); }); - const auto npy_type = PyArray_TYPE(array); - const TypeMeta meta = NumpyTypeToCaffe(npy_type); - CAFFE_ENFORCE_NE( - meta, - ScalarType::Undefined, - "This numpy data type is not supported: ", - PyArray_TYPE(array), - "."); - - int ndim = PyArray_NDIM(array); - npy_intp* npy_dims = PyArray_DIMS(array); - - itensor::dims adims; - for (int i = 0; i < ndim; i++) { - adims.push_back(static_cast(npy_dims[i])); - } - - switch (npy_type) { - case NPY_OBJECT: - case NPY_UNICODE: - CAFFE_THROW("IDeep doesn't support string"); - break; - default: - auto type = type_transform(meta); - if (tensor->get_dims() != adims || type != tensor->get_data_type()) { - tensor->resize(adims, type); - } - tensor->feed_from(adims, type, static_cast(PyArray_DATA(array))); - } -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - } - - bool ZeroDim(PyArrayObject* array) { -#ifdef USE_NUMPY - int ndim = PyArray_NDIM(array); - return ndim == 0; -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif - } - - void Feed( - const DeviceOption& option, - PyArrayObject* original_array, - Blob* blob, - bool in_place) override { -#ifdef USE_NUMPY - try { - PyArrayObject* array = PyArray_GETCONTIGUOUS(original_array); - auto g = MakeGuard([&]() { Py_XDECREF(array); }); - - const auto npy_type = PyArray_TYPE(array); - const TypeMeta meta = NumpyTypeToCaffe(npy_type); - - // TODO: if necessary, use dispatcher. 
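The dispatch that follows in IDeepFeeder::Feed routes through IDEEP only for in-place itensor blobs or non-scalar float arrays; everything else falls back to the plain CPU feeder. The decision distilled into a small function, with illustrative names for the blob and array predicates:

    enum class FeedPath { Ideep, CpuFallback };

    FeedPath choose_feed_path(bool in_place_ideep_blob,
                              bool is_float_dtype,
                              bool zero_dim_array) {
      // Mirrors: (in_place && blob->IsType<itensor>()) ||
      //          (meta.Match<float>() && !ZeroDim(original_array))
      if (in_place_ideep_blob || (is_float_dtype && !zero_dim_array)) {
        return FeedPath::Ideep;
      }
      return FeedPath::CpuFallback;
    }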
-      if ((in_place && blob->IsType<itensor>()) ||
-          (meta.Match<float>() && !ZeroDim(original_array))) {
-        FeedTensor(option, original_array, blob->GetMutable<itensor>());
-      } else {
-        DeviceOption cpu_option(option);
-        cpu_option.set_device_type(DeviceTypeProto::PROTO_CPU);
-        TensorFeeder<CPUContext> cpu_tensor_feeder;
-        if (in_place) {
-          cpu_tensor_feeder.FeedTensor(
-              cpu_option,
-              original_array,
-              BlobGetMutableTensor(blob, OptionToDevice(cpu_option).type()),
-              true);
-        } else {
-          blob->Reset(new Tensor(
-              cpu_tensor_feeder.FeedTensor(cpu_option, original_array)));
-        }
-      }
-    } catch (ideep::error& e) {
-      LOG(ERROR) << "IDEEP error: " << e.message;
-      throw;
-    }
-#else
-    CAFFE_THROW("Caffe2 was compiled without NumPy support.");
-#endif
-  }
-};
-
-} // namespace python
-} // namespace caffe2
diff --git a/caffe2/python/pybind_state_int8.cc b/caffe2/python/pybind_state_int8.cc
deleted file mode 100644
index 6dd5ecdf88bf..000000000000
--- a/caffe2/python/pybind_state_int8.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/**
- * Copyright (c) 2016-present, Facebook, Inc.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// Note(jiayq): the import_array function is done inside
-// caffe2_python.cc. Read
-// http://docs.scipy.org/doc/numpy-1.10.1/reference/c-api.array.html#miscellaneous
-// for more details.
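The Int8TensorFetcher defined below returns a (data, scale, zero_point) tuple rather than dequantized values, leaving the affine mapping real_value = scale * (quantized_value - zero_point) to the caller. A sketch of that mapping; this helper is illustrative and not part of the deleted file:

    #include <cstdint>
    #include <vector>

    std::vector<float> dequantize(const std::vector<uint8_t>& q,
                                  float scale, int32_t zero_point) {
      std::vector<float> out;
      out.reserve(q.size());
      for (uint8_t v : q) {
        // real_value = scale * (quantized_value - zero_point)
        out.push_back(scale * (static_cast<int32_t>(v) - zero_point));
      }
      return out;
    }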
-#define NO_IMPORT_ARRAY -#include "caffe2/python/pybind_state.h" - -#include -#include - -#include "caffe2/core/tensor_int8.h" - -namespace caffe2 { -namespace python { - -class Int8TensorFetcher : public BlobFetcherBase { - public: - pybind11::object Fetch(const Blob& blob) override { -#ifdef USE_NUMPY - const caffe2::int8::Int8TensorCPU& src = - blob.template Get(); - const int numpy_type = CaffeToNumpyType(src.t.dtype()); - CAFFE_ENFORCE(numpy_type != -1, "Int8Tensor contains unknown type data"); - std::vector npy_dims; - for (const auto dim : src.t.sizes()) { - npy_dims.push_back(dim); - } - auto data_array = pybind11::reinterpret_steal( - PyArray_SimpleNew(src.t.sizes().size(), npy_dims.data(), numpy_type)); - void* ptr = static_cast( - PyArray_DATA(reinterpret_cast(data_array.ptr()))); - CPUContext context; - context.CopyBytesSameDevice(src.t.nbytes(), src.t.raw_data(), ptr); - context.FinishDeviceComputation(); - - auto result = pybind11::cast( - pybind11::make_tuple(data_array, src.scale, src.zero_point)); - return result; -#else - CAFFE_THROW("Caffe2 was compiled without NumPy support."); -#endif // USE_NUMPY - } -}; - -REGISTER_BLOB_FETCHER( - (TypeMeta::Id()), - caffe2::python::Int8TensorFetcher); -} // namespace python - -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_nomni.cc b/caffe2/python/pybind_state_nomni.cc deleted file mode 100644 index 82a896faefd6..000000000000 --- a/caffe2/python/pybind_state_nomni.cc +++ /dev/null @@ -1,553 +0,0 @@ -#include "caffe2/core/context.h" -#include "caffe2/core/tensor.h" -#include "caffe2/core/types.h" -#include "caffe2/opt/converter.h" -#include "caffe2/opt/distributed.h" -#include "caffe2/proto/caffe2.pb.h" -#include "caffe2/python/dlpack.h" -#include "caffe2/python/pybind_state_registry.h" -#include "caffe2/utils/proto_utils.h" -#include "nomnigraph/Converters/Dot.h" -#include "nomnigraph/Graph/Algorithms.h" -#include "nomnigraph/Representations/NeuralNet.h" - -#include -#include - -using ListCasterBase = pybind11::detail::list_caster< - std::vector, - nom::repr::NNGraph::NodeRef>; -namespace pybind11 { -namespace detail { -template <> -struct type_caster> : ListCasterBase { - static handle cast( - const std::vector& src, - return_value_policy, - handle parent) { - return ListCasterBase::cast(src, return_value_policy::reference, parent); - } - static handle cast( - const std::vector* src, - return_value_policy pol, - handle parent) { - return cast(*src, pol, parent); - } -}; -} // namespace detail -} // namespace pybind11 - -namespace caffe2 { -namespace python { - -using namespace nom::repr; - -namespace { - -std::map NNPrinter( - typename nom::repr::NNGraph::NodeRef node) { - std::map labelMap; - assert(node->data() && "Node doesn't have data, can't render it"); - if (isa(node->data())) { - auto* op = dyn_cast(node->data().get()); - labelMap["label"] = op->getName(); - labelMap["shape"] = "box"; - } else if (isa(node->data())) { - auto tensor = dyn_cast(node->data().get()); - labelMap["label"] = tensor->getName(); - } - return labelMap; -}; - -using Graph = nom::Graph; -std::map GraphPrinter(typename Graph::NodeRef node) { - std::map labelMap; - assert(node->data() && "Node doesn't have data, can't render it"); - labelMap["label"] = py::str(node->data()); - return labelMap; -}; - -} // namespace - -void addNomnigraphMethods(pybind11::module& m) { - // Generic Graph methods - py::class_ graph(m, "Graph"); - py::class_> node(m, "Node"); - py::class_> edge(m, "Edge"); - graph.def(py::init<>()) - .def( - "__repr__", - 
[](Graph* g) { - return nom::converters::convertToDotString(g, GraphPrinter); - }) - .def( - "createEdge", - [](Graph* g, Graph::NodeRef a, Graph::NodeRef b) { - return g->createEdge(a, b); - }, - py::return_value_policy::reference_internal) - .def( - "createNode", - [](Graph* g, py::object obj) { - return g->createNode(std::move(obj)); - }, - py::return_value_policy::reference_internal); - - // NNModule methods - m.def("NNModuleFromProtobuf", [](py::bytes def) { - caffe2::NetDef proto; - CAFFE_ENFORCE(ParseProtoFromLargeString(def.cast(), &proto)); - std::vector ns; - auto nn = caffe2::convertToNNModule(proto, false, &ns); - return std::pair>( - std::move(nn), ns); - }); - - m.def( - "NNModuleFromProtobufDistributed", - [](py::bytes def, std::map blobToDeviceMap) { - std::map m; - for (const auto& el : blobToDeviceMap) { - caffe2::DeviceOption d; - CAFFE_ENFORCE( - ParseProtoFromLargeString(el.second.cast(), &d)); - m[el.first] = d; - } - - caffe2::NetDef proto; - CAFFE_ENFORCE( - ParseProtoFromLargeString(def.cast(), &proto)); - - return caffe2::convertToNNModule(proto, m); - }); - - m.def("replaceProducer", &nn::replaceProducer); - m.def("replaceAllUsesWith", &nn::replaceAllUsesWith); - m.def("replaceAsConsumer", &nn::replaceAsConsumer); - - py::class_ nnmodule(m, "NNModule"); - nnmodule.def(py::init<>()) - .def( - "dataFlow", - [](NNModule* nn) -> NNGraph* { return &nn->dataFlow; }, - py::return_value_policy::reference_internal) - .def( - "createUniqueDataNode", - &NNModule::createUniqueDataNode, - py::return_value_policy::reference_internal) - .def( - "convertToCaffe2Proto", - [](NNModule& nn, py::object def) { - CAFFE_ENFORCE( - pybind11::hasattr(def, "SerializeToString"), - "convertToCaffe2Proto takes either no args", - "a NetDef"); - auto str = def.attr("SerializeToString")(); - caffe2::NetDef proto; - proto.ParseFromString(py::bytes(str)); - auto new_proto = caffe2::convertToCaffe2Proto(nn, proto); - std::string out; - new_proto.SerializeToString(&out); - return py::bytes(out); - }) - .def( - "getExecutionOrder", - [](NNModule& nn) { - nn::coalesceInsertedDataDependencies(&nn); - std::vector out; - auto sccs = nom::algorithm::tarjans(&nn.controlFlow); - for (const auto& scc : sccs) { - for (const auto& bb : scc.getNodes()) { - for (const auto& instr : bb->data().getInstructions()) { - out.emplace_back(instr); - } - } - } - return out; - }, - py::return_value_policy::reference_internal) - .def("replaceSubgraph", &NNModule::replaceSubgraph) - .def("deleteSubgraph", &NNModule::deleteSubgraph); - - auto getTensors = [](NNGraph* g) { - return nn::nodeIterator(*g); - }; - auto getOperators = [](NNGraph* g) { - return nn::nodeIterator(*g); - }; - // NNGraph methods - py::class_ nngraph(m, "NNGraph"); - nngraph - .def( - "__repr__", - [](NNGraph* g) { - return nom::converters::convertToDotString(g, NNPrinter); - }) - .def( - "createEdge", - [](NNGraph* g, NNGraph::NodeRef a, NNGraph::NodeRef b) { - CAFFE_ENFORCE( - (nn::is(a) && nn::is(b)) || - (nn::is(b) && nn::is(a)), - "Edges must exist between NeuralNetOperator and NeuralNetData"); - g->createEdge(a, b); - }) - .def("deleteEdge", &NNGraph::deleteEdge) - .def( - "deleteEdge", - [](NNGraph* g, NNGraph::NodeRef a, NNGraph::NodeRef b) { - auto edge = g->getEdgeIfExists(a, b); - if (edge) { - g->deleteEdge(edge); - } - }) - .def( - "createNode", - [](NNGraph* g, GenericOperator& op) { - return g->createNode( - std::make_unique(op.getName())); - }, - py::return_value_policy::reference_internal) - .def( - "createNode", - [](NNGraph* g, 
nom::repr::Tensor& tensor) { - return g->createNode( - std::make_unique(tensor.getName())); - }, - py::return_value_policy::reference_internal) - .def( - "createNode", - [](NNGraph* g, py::object op_def) { - CAFFE_ENFORCE( - pybind11::hasattr(op_def, "SerializeToString"), - "createNode takes either OperatorDef", - "or ng.NeuralNetOperator"); - auto str = op_def.attr("SerializeToString")(); - OperatorDef op; - op.ParseFromString(py::bytes(str)); - if (op.input().size() || op.output().size()) { - LOG(WARNING) - << "Input and output specifications are " - << "dropped when converting a single operator to nomnigraph. " - << "Use ng.NNModule(NetDef&) to preserve these."; - } - return g->createNode(convertToNeuralNetOperator(op)); - }, - py::return_value_policy::reference_internal) - .def("deleteNode", &NNGraph::deleteNode) - .def( - "replaceNode", - [](NNGraph* g, NNGraph::NodeRef old_node, NNGraph::NodeRef new_node) { - g->replaceNode(old_node, new_node); - }) - .def( - "getMutableNodes", - &NNGraph::getMutableNodes, - py::return_value_policy::reference_internal) - .def_property_readonly( - "nodes", - &NNGraph::getMutableNodes, - py::return_value_policy::reference_internal) - .def_property_readonly( - "operators", - getOperators, - py::return_value_policy::reference_internal) - .def_property_readonly( - "tensors", getTensors, py::return_value_policy::reference_internal); - - // Node level methods - using NodeType = nom::Node>; - py::class_ noderef(m, "NodeRef"); - auto getName = [](NNGraph::NodeRef n) { - if (nn::is(n)) { - return nn::get(n)->getName(); - } else if (nn::is(n)) { - return nn::get(n)->getName(); - } - return std::string("Unknown"); - }; - auto getType = [](NNGraph::NodeRef n) { - if (nn::is(n)) { - return "Tensor"; - } else if (nn::is(n)) { - return "Operator"; - } - return "Unknown"; - }; - auto getOperator = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::get(n); - }; - auto getTensor = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::get(n); - }; - auto getInputs = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::getInputs(n); - }; - auto getOutputs = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::getOutputs(n); - }; - auto getProducer = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::getProducer(n); - }; - auto getConsumers = [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - return nn::getConsumers(n); - }; - auto setAnnotation = [](NNGraph::NodeRef n, Caffe2Annotation& annot) { - auto* nnOp = nn::get(n); - nnOp->setAnnotation(std::make_unique(annot)); - }; - auto getAnnotation = [](NNGraph::NodeRef n) { - return getOrAddCaffe2Annotation(n); - }; - - noderef - .def( - "isOperator", - [](NNGraph::NodeRef n) { return nn::is(n); }) - .def( - "isTensor", - [](NNGraph::NodeRef n) { return nn::is(n); }) - .def("getType", getType) - .def_property_readonly("type", getType) - .def("getName", getName) - .def_property_readonly("name", getName) - .def( - "getOperator", - getOperator, - py::return_value_policy::reference_internal) - .def("getTensor", getTensor, py::return_value_policy::reference_internal) - .def_property_readonly( - "operator", getOperator, py::return_value_policy::reference) - .def_property_readonly( - "tensor", getTensor, py::return_value_policy::reference) - .def("getInputs", getInputs, py::return_value_policy::reference) - .def("getOutputs", getOutputs, py::return_value_policy::reference) - .def("hasProducer", [](NNGraph::NodeRef n) { return nn::hasProducer(n); }) - 
.def("getProducer", getProducer, py::return_value_policy::reference) - .def("getConsumers", getConsumers, py::return_value_policy::reference) - .def_property_readonly( - "inputs", getInputs, py::return_value_policy::reference) - .def_property_readonly( - "outputs", getOutputs, py::return_value_policy::reference) - .def_property_readonly( - "producer", getProducer, py::return_value_policy::reference) - .def_property_readonly( - "consumers", getConsumers, py::return_value_policy::reference) - .def("getAnnotation", getAnnotation, py::return_value_policy::reference) - .def("setAnnotation", setAnnotation) - .def_property( - "annotation", - getAnnotation, - setAnnotation, - py::return_value_policy::reference) - .def( - "getOperatorPredecessors", - [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - std::vector pred; - for (const auto& inEdge : n->getInEdges()) { - auto data = inEdge->tail(); - if (nn::hasProducer(data)) { - pred.emplace_back(nn::getProducer(data)); - } - } - return pred; - }, - py::return_value_policy::reference) - .def( - "getOperatorSuccessors", - [](NNGraph::NodeRef n) { - CAFFE_ENFORCE(nn::is(n)); - std::vector succ; - for (const auto& outEdge : n->getOutEdges()) { - auto data = outEdge->head(); - for (const auto& consumer : nn::getConsumers(data)) { - succ.emplace_back(consumer); - } - } - return succ; - }, - py::return_value_policy::reference); - - py::class_ nnop(m, "NeuralNetOperator"); - py::class_ nndata(m, "NeuralNetData"); - - nnop.def(py::init()).def("getName", &NeuralNetOperator::getName); - nndata.def(py::init()).def("getName", &NeuralNetData::getName); - - // Subgraph matching API - py::class_ nnsubgraph(m, "NNSubgraph"); - nnsubgraph.def(py::init<>()) - .def("__len__", [](NNSubgraph& s) { return s.getNodes().size(); }) - .def( - "__repr__", - [](NNSubgraph* g) { - return nom::converters::convertToDotString(*g, NNPrinter); - }) - .def( - "addNode", - [](NNSubgraph* sg, NNGraph::NodeRef node) { sg->addNode(node); }) - .def( - "induceEdges", - [](NNSubgraph* sg) { nom::algorithm::induceEdges(sg); }) - .def_property_readonly( - "nodes", - [](NNSubgraph& s) { - std::vector out; - for (auto n : s.getNodes()) { - out.emplace_back(n); - } - return out; - }, - py::return_value_policy::reference) - .def("hasNode", [](NNSubgraph& s, NNGraph::NodeRef n) { - return s.hasNode(n); - }); - - py::class_ nnMatchGraph(m, "NNMatchGraph"); - nnMatchGraph.def(py::init<>()); - - using MatchPredicateType = nom::Node; - py::class_ nnMatchPredicate(m, "MatchPredicateRef"); - - nnMatchGraph - .def( - "createEdge", - [](nn::NNMatchGraph* g, - nn::NNMatchGraph::NodeRef a, - nn::NNMatchGraph::NodeRef b) { g->createEdge(a, b); }) - .def( - "createNode", - [](nn::NNMatchGraph* g, GenericOperator& op, bool strict) { - auto opName = op.getName(); - auto match = [opName](NNGraph::NodeRef node) { - NOM_REQUIRE_OR_RET_FALSE(nn::is(node)); - auto nnOp = nn::get(node); - return opName == nnOp->getName(); - }; - auto node = nn::NNMatchPredicate(match); - if (!strict) { - node.nonTerminal(); - } - return g->createNode(std::move(node)); - }, - py::return_value_policy::reference_internal, - py::arg("node"), - py::arg("strict") = false) - .def( - "createNode", - [](nn::NNMatchGraph* g, nom::repr::Tensor& tensor, bool strict) { - auto node = nn::NNMatchPredicate(nn::is); - if (!strict) { - node.nonTerminal(); - } - return g->createNode(std::move(node)); - }, - py::return_value_policy::reference_internal, - py::arg("tensor"), - py::arg("strict") = false) - .def( - "createNode", - [](nn::NNMatchGraph* g, 
bool strict) { - auto match = [](NNGraph::NodeRef node) { return true; }; - auto node = nn::NNMatchPredicate(match); - if (!strict) { - node.nonTerminal(); - } - return g->createNode(std::move(node)); - }, - py::return_value_policy::reference_internal, - py::arg("strict") = false) - .def( - "getMutableNodes", - [](nn::NNMatchGraph* g) { return g->getMutableNodes(); }, - py::return_value_policy::reference_internal); - - m.def("matchSubgraph", [](NNGraph::NodeRef node, nn::NNMatchGraph* mg) { - // Get root node or node in root cycle - auto match_node = *nom::algorithm::tarjans(mg).back().getNodes().begin(); - auto result = mg->isSubgraphMatch(node, match_node, false); - if (result.isMatch()) { - return *result.getMatchedSubgraph(); - } - return NNSubgraph(); - }); - - // Annotation API - py::class_ annotation(m, "Annotation"); - annotation.def(py::init<>()) - .def("setDevice", &Caffe2Annotation::setDevice) - .def("getDevice", &Caffe2Annotation::getDevice) - .def("setDeviceType", &Caffe2Annotation::setDeviceType) - .def("getDeviceType", &Caffe2Annotation::getDeviceType) - .def("setKeyNode", &Caffe2Annotation::setKeyNode) - .def( - "getKeyNode", - &Caffe2Annotation::getKeyNode, - py::return_value_policy::reference) - .def("setLengthNode", &Caffe2Annotation::setLengthNode) - .def( - "getLengthNode", - &Caffe2Annotation::getLengthNode, - py::return_value_policy::reference) - .def("setComponentLevels", &Caffe2Annotation::setComponentLevels) - .def("getComponentLevels", &Caffe2Annotation::getComponentLevels) - .def("hasDeviceOption", &Caffe2Annotation::hasDeviceOption) - .def_property( - "device_option", - [](Caffe2Annotation& annot) { - auto DeviceOption = py::module::import("caffe2.proto.caffe2_pb2") - .attr("DeviceOption"); - // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) - auto proto = annot.getDeviceOption(); - std::string serialized_proto; - proto.SerializeToString(&serialized_proto); - auto py_device_opt = DeviceOption(); - py_device_opt.attr("ParseFromString")(py::bytes(serialized_proto)); - return py_device_opt; - }, - [](Caffe2Annotation& annot, py::object& def) { - CAFFE_ENFORCE( - pybind11::hasattr(def, "SerializeToString"), - "device_option can only be set to a DeviceOption"); - auto str = def.attr("SerializeToString")(); - caffe2::DeviceOption proto; - proto.ParseFromString(py::bytes(str)); - annot.setDeviceOption(proto); - }, - py::return_value_policy::reference) - .def_property( - "operator_def", - [](Caffe2Annotation& annot) { - auto opDef = py::module::import("caffe2.proto.caffe2_pb2") - .attr("OperatorDef"); - // NOLINTNEXTLINE(performance-unnecessary-copy-initialization) - auto proto = annot.getOperatorDef(); - std::string serialized_proto; - proto.SerializeToString(&serialized_proto); - auto py_op_def = opDef(); - py_op_def.attr("ParseFromString")(py::bytes(serialized_proto)); - return py_op_def; - }, - [](Caffe2Annotation& annot, py::object& def) { - CAFFE_ENFORCE( - pybind11::hasattr(def, "SerializeToString"), - "operator_def can only be set to an OperatorDef"); - auto str = def.attr("SerializeToString")(); - caffe2::OperatorDef proto; - proto.ParseFromString(py::bytes(str)); - annot.setOperatorDef(proto); - }, - py::return_value_policy::reference); -} - -REGISTER_PYBIND_ADDITION(addNomnigraphMethods); - -} // namespace python -} // namespace caffe2 diff --git a/caffe2/python/pybind_state_registry.cc b/caffe2/python/pybind_state_registry.cc deleted file mode 100644 index 77fabf342564..000000000000 --- a/caffe2/python/pybind_state_registry.cc +++ /dev/null @@ 
diff --git a/caffe2/python/pybind_state_registry.cc b/caffe2/python/pybind_state_registry.cc
deleted file mode 100644
index 77fabf342564..000000000000
--- a/caffe2/python/pybind_state_registry.cc
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "caffe2/python/pybind_state_registry.h"
-
-namespace caffe2 {
-namespace python {
-
-namespace py = pybind11;
-
-C10_DEFINE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);
-
-} // namespace python
-} // namespace caffe2
diff --git a/caffe2/python/pybind_state_registry.h b/caffe2/python/pybind_state_registry.h
deleted file mode 100644
index 18bb0a3dbaa0..000000000000
--- a/caffe2/python/pybind_state_registry.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include "c10/util/Registry.h"
-
-namespace caffe2 {
-namespace python {
-
-namespace py = pybind11;
-
-struct PybindAddition {
-  PybindAddition() {}
-  PybindAddition(py::module&) {}
-  virtual ~PybindAddition(){};
-};
-
-C10_DECLARE_REGISTRY(PybindAdditionRegistry, PybindAddition, py::module&);
-
-#define REGISTER_PYBIND_ADDITION(funcname)                                    \
-  namespace {                                                                 \
-  struct funcname##Impl : public PybindAddition {                             \
-    funcname##Impl(py::module& m) {                                           \
-      funcname(m);                                                            \
-    }                                                                         \
-  };                                                                          \
-  C10_REGISTER_CLASS(PybindAdditionRegistry, funcname##Impl, funcname##Impl); \
-  }
-
-} // namespace python
-} // namespace caffe2
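REGISTER_PYBIND_ADDITION lets each translation unit contribute extra module-initialization functions without a central list; the registry is walked once when the extension module is built. A rough Python analogue of the same plugin-registry idea (illustrative only, not the C10 macro machinery):

    _additions = []

    def register_pybind_addition(fn):
        # plays the role of REGISTER_PYBIND_ADDITION(fn)
        _additions.append(fn)
        return fn

    @register_pybind_addition
    def add_nomnigraph_methods(module):
        ...  # attach classes/functions, as addNomnigraphMethods(m) did

    def build_module(module):
        # plays the role of iterating PybindAdditionRegistry at module init
        for fn in _additions:
            fn(module)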
diff --git a/caffe2/python/pybind_workspace.cc b/caffe2/python/pybind_workspace.cc
deleted file mode 100644
index 2962e3b297be..000000000000
--- a/caffe2/python/pybind_workspace.cc
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "caffe2/core/workspace.h"
-#include "caffe2/python/pybind_workspace.h"
-
-namespace caffe2 {
-namespace python {
-
-// NOLINTNEXTLINE(modernize-use-equals-default)
-BlobFetcherBase::~BlobFetcherBase() {}
-
-C10_DEFINE_TYPED_REGISTRY(
-    BlobFetcherRegistry,
-    TypeIdentifier,
-    BlobFetcherBase,
-    std::unique_ptr);
-
-// gWorkspace is the pointer to the current workspace. The ownership is kept
-// by the gWorkspaces map.
-static Workspace* gWorkspace = nullptr;
-static std::string gCurrentWorkspaceName;
-// gWorkspaces allows us to define and switch between multiple workspaces in
-// Python.
-static std::map<std::string, std::unique_ptr<Workspace>> gWorkspaces;
-
-Workspace* GetCurrentWorkspace() {
-  return gWorkspace;
-}
-
-void SetCurrentWorkspace(Workspace* workspace) {
-  gWorkspace = workspace;
-}
-
-Workspace* NewWorkspace() {
-  std::unique_ptr<Workspace> new_workspace(new Workspace());
-  gWorkspace = new_workspace.get();
-  return gWorkspace;
-}
-
-Workspace* GetWorkspaceByName(const std::string& name) {
-  if (gWorkspaces.count(name)) {
-    return gWorkspaces[name].get();
-  }
-  return nullptr;
-}
-
-std::string GetCurrentWorkspaceName() {
-  return gCurrentWorkspaceName;
-}
-void InsertWorkspace(const std::string& name, std::unique_ptr<Workspace> ws) {
-  gWorkspaces.insert(std::make_pair(name, std::move(ws)));
-}
-
-void SwitchWorkspaceInternal(const std::string& name, bool create_if_missing) {
-  if (gWorkspaces.count(name)) {
-    gCurrentWorkspaceName = name;
-    gWorkspace = gWorkspaces[name].get();
-    return;
-  }
-
-  CAFFE_ENFORCE(create_if_missing);
-  std::unique_ptr<Workspace> new_workspace(new Workspace());
-  gWorkspace = new_workspace.get();
-  gWorkspaces.insert(std::make_pair(name, std::move(new_workspace)));
-  gCurrentWorkspaceName = name;
-}
-
-void ResetWorkspace(Workspace* workspace) {
-  gWorkspaces[gCurrentWorkspaceName].reset(workspace);
-  gWorkspace = gWorkspaces[gCurrentWorkspaceName].get();
-}
-
-void GetWorkspaceNames(std::vector<std::string>& names) {
-  for (const auto& kv : gWorkspaces) {
-    // NOLINTNEXTLINE(performance-inefficient-vector-operation)
-    names.emplace_back(kv.first);
-  }
-}
-
-void ClearWorkspaces() {
-  gWorkspaces.clear();
-}
-} // namespace python
-} // namespace caffe2
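These globals back the workspace-switching API that caffe2 exposes through caffe2.python.workspace. A short usage sketch against that public wrapper (blob name and values are illustrative; "default" is the conventional name of the initial workspace):

    import numpy as np
    from caffe2.python import workspace

    # SwitchWorkspace(name, True) reaches SwitchWorkspaceInternal with
    # create_if_missing=True, creating the gWorkspaces entry on demand.
    workspace.SwitchWorkspace("scratch", True)
    workspace.FeedBlob("x", np.ones(3, dtype=np.float32))
    workspace.SwitchWorkspace("default")
    assert not workspace.HasBlob("x")  # blobs are per-workspace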
diff --git a/caffe2/python/pybind_workspace.h b/caffe2/python/pybind_workspace.h
deleted file mode 100644
index ac43992b6416..000000000000
--- a/caffe2/python/pybind_workspace.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#pragma once
-
-#include <pybind11/pybind11.h>
-#include <caffe2/core/blob.h>
-
-//#include <caffe2/core/workspace.h>
-
-namespace caffe2 {
-namespace python {
-class C10_EXPORT BlobFetcherBase {
- public:
-  struct FetchedBlob {
-    pybind11::object obj;
-    bool copied;
-  };
-  virtual ~BlobFetcherBase();
-  virtual pybind11::object Fetch(const Blob& blob) = 0;
-};
-
-C10_DECLARE_TYPED_REGISTRY(
-    BlobFetcherRegistry,
-    TypeIdentifier,
-    BlobFetcherBase,
-    std::unique_ptr);
-#define REGISTER_BLOB_FETCHER(id, ...) \
-  C10_REGISTER_TYPED_CLASS(BlobFetcherRegistry, id, __VA_ARGS__)
-inline unique_ptr<BlobFetcherBase> CreateFetcher(TypeIdentifier id) {
-  return BlobFetcherRegistry()->Create(id);
-}
-
-Workspace* GetCurrentWorkspace();
-void SetCurrentWorkspace(Workspace* workspace);
-Workspace* NewWorkspace();
-Workspace* GetWorkspaceByName(const std::string& name);
-std::string GetCurrentWorkspaceName();
-void InsertWorkspace(const std::string& name, std::unique_ptr<Workspace> ws);
-void SwitchWorkspaceInternal(const std::string& name, bool create_if_missing);
-void ResetWorkspace(Workspace* workspace);
-void GetWorkspaceNames(std::vector<std::string>& names);
-void ClearWorkspaces();
-} // namespace python
-} // namespace caffe2
diff --git a/caffe2/python/python_op_test.py b/caffe2/python/python_op_test.py
deleted file mode 100644
index 4b39adc3f36a..000000000000
--- a/caffe2/python/python_op_test.py
+++ /dev/null
@@ -1,245 +0,0 @@
-
-
-
-
-from caffe2.python import core, workspace
-from caffe2.python.core import CreatePythonOperator
-import caffe2.python.hypothesis_test_util as hu
-from hypothesis import given, settings
-import hypothesis.strategies as st
-import numpy as np
-
-
-class CustomError(Exception):
-    pass
-
-
-def SubFunctionThatThrowsCustomError():
-    raise CustomError("This is an intentional exception.")
-
-
-def MainOpFunctionThatThrowsCustomError(inputs, _):
-    return SubFunctionThatThrowsCustomError()
-
-def MainOpFunctionThatThrowsCustomErrorInBuilder(inputs, _):
-    raise CustomError("This is an intentional exception in builder.")
-
-def op_builder(name, index, extra):
-    iterations = [0]
-    assert name == 'name'
-    assert index == 5
-    assert extra - 4.2 < 0.0001
-
-    def my_op(inputs, outputs):
-        assert inputs[0].data[0] == iterations[0]
-        assert name == 'name'
-        assert index == 5
-        assert extra - 4.2 < 0.0001
-        iterations[0] += 1
-
-    return my_op
-
-
-class PythonOpTest(hu.HypothesisTestCase):
-    @given(x=hu.tensor())
-    def test_feed(self, x):
-        def f(inputs, _):
-            self.assertEqual(x.shape, inputs[0].shape)
-            self.assertEqual(type(inputs[0].shape), tuple)
-            self.assertEqual(type(inputs[0].data), np.ndarray)
-            np.testing.assert_almost_equal(x, inputs[0].data)
-        op = CreatePythonOperator(f, ["x"], [])
-        workspace.FeedBlob("x", x)
-        workspace.RunOperatorOnce(op)
-
-    def test_exception(self):
-        op = CreatePythonOperator(MainOpFunctionThatThrowsCustomError, [], [])
-        with self.assertRaisesRegex(CustomError, "This is an intentional exception."):
-            workspace.RunOperatorOnce(op)
-
-    def test_exception_builder(self):
-        op = CreatePythonOperator(MainOpFunctionThatThrowsCustomErrorInBuilder, [], [])
-        with self.assertRaisesRegex(CustomError, "This is an intentional exception in builder."):
-            workspace.RunOperatorOnce(op)
-
-    @given(x=hu.tensor())
-    def test_feed_with_helper_function(self, x):
-        def f(inputs, _):
-            self.assertEqual(x.shape, inputs[0].shape)
-            self.assertEqual(type(inputs[0].shape), tuple)
-            self.assertEqual(type(inputs[0].data), np.ndarray)
-            np.testing.assert_almost_equal(x, inputs[0].data)
-        net = core.Net("test")
-        net.Python(f)(["x"], [])
-        workspace.FeedBlob("x", x)
-        workspace.RunNetOnce(net)
-
-    def test_builder_tuple(self):
-        net = core.Net("builder_template")
-        iter_blob = 'iter'
-        net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], [])
-        net.Python((op_builder, ['name', 5], {'extra': 4.2}))([iter_blob], [])
-        for repeat in range(2):
-            # check that the builder will be called exactly once for each
-            # PythonOp constructor.
Cloning the net will also trigger a call - # to the builder when the net is created. - cloned_net = net.Clone('builder_%d' % repeat) - workspace.FeedBlob(iter_blob, np.array([0])) - # Builder gets called once per python op in the line below - workspace.CreateNet(cloned_net) - for i in range(10): - workspace.FeedBlob(iter_blob, np.array([i])) - workspace.RunNet(cloned_net) - - @given(x=hu.tensor()) - def test_feed_with_gc(self, x): - def f(inputs, _): - self.assertEqual(x.shape, inputs[0].shape) - np.testing.assert_almost_equal(x, inputs[0].data) - op = CreatePythonOperator(f, ["x"], []) - workspace.FeedBlob("x", x) - workspace.RunOperatorOnce(op) - del f - workspace.FeedBlob("x", x) - workspace.RunOperatorOnce(op) - - @given(x=hu.tensor()) - def test_reshape(self, x): - def f(inputs, outputs): - outputs[0].reshape(inputs[0].shape) - self.assertEqual(x.shape, inputs[0].shape) - self.assertEqual(x.shape, outputs[0].shape) - outputs[0].data[...] = inputs[0].data - - op = CreatePythonOperator(f, ["x"], ["y"]) - workspace.FeedBlob("x", x) - workspace.RunOperatorOnce(op) - y = workspace.FetchBlob("y") - np.testing.assert_almost_equal(x, y) - - @given(x=hu.tensor()) - def test_workspace_manipulation(self, x): - """ - Verify that python op can manipulate workspace directly - """ - def f(inputs, outputs, ws): - fetched = ws.blobs['internal'].fetch() - np.testing.assert_almost_equal(fetched, x) - - ws = workspace.C.Workspace() - net = core.Net("test") - net.GivenTensorFill([], ['internal'], values=x, shape=x.shape) - net.Python(f, pass_workspace=True)([], []) - ws.run(net) - - @given(x=hu.tensor()) - def test_caught_exception_doesnt_terminate(self, x): - def f(inputs, outputs): - try: - raise Exception("Exception in handler") - except Exception: - pass - - op = CreatePythonOperator(f, ["x"], ["y"]) - workspace.FeedBlob("x", x) - workspace.RunOperatorOnce(op) - - @given(x=hu.tensor(), - n=st.integers(min_value=1, max_value=20), - w=st.integers(min_value=1, max_value=20)) - @settings(deadline=1000) - def test_multithreaded_evaluation(self, x, n, w): - def f(inputs, outputs): - outputs[0].reshape(inputs[0].shape) - outputs[0].data[...] = inputs[0].data - ops = [CreatePythonOperator(f, ["x"], [str(i)]) for i in range(n)] - net = core.Net("net") - net.Proto().op.extend(ops) - net.Proto().type = "dag" - net.Proto().num_workers = w - iters = 100 - plan = core.Plan("plan") - plan.AddStep(core.ExecutionStep("test-step", net, iters)) - workspace.FeedBlob("x", x) - workspace.RunPlan(plan.Proto().SerializeToString()) - for i in range(n): - y = workspace.FetchBlob(str(i)) - np.testing.assert_almost_equal(x, y) - - @given(x=hu.tensor(), in_place=st.booleans(), **hu.gcs) - @settings(deadline=10000) - def test_gradient(self, x, in_place, gc, dc): - def f(inputs, outputs): - outputs[0].reshape(inputs[0].shape) - outputs[0].data[...] = inputs[0].data * 2 - - def grad_f(inputs, outputs): - # Ordering is [inputs, outputs, grad_outputs] - grad_output = inputs[2] - - grad_input = outputs[0] - grad_input.reshape(grad_output.shape) - grad_input.data[...] 
= grad_output.data * 2 - - op = CreatePythonOperator( - f, ["x"], ["x" if in_place else "y"], grad_f=grad_f) - self.assertGradientChecks(gc, op, [x], 0, [0]) - self.assertDeviceChecks(dc, op, [x], [0]) - - @given(inputs=hu.tensors(n=2), **hu.gcs) - @settings(deadline=10000) - def test_gradient_multiple(self, inputs, gc, dc): - (x1, x2) = inputs - - def f(inputs, outputs): - for idx in [0, 1]: - self.assertEqual(type(inputs[idx].shape), tuple) - outputs[idx].reshape(inputs[idx].shape) - outputs[idx].data[...] = inputs[idx].data * 2 - - def grad_f(inputs, outputs): - # Ordering is [inputs, outputs, grad_outputs] - self.assertEqual(len(inputs), 6) - self.assertEqual(len(outputs), 2) - for (grad_output_idx, grad_input_idx) in [(4, 0), (5, 1)]: - grad_output = inputs[grad_output_idx] - grad_input = outputs[grad_input_idx] - grad_input.reshape(grad_output.shape) - grad_input.data[...] = grad_output.data * 2 - - op = CreatePythonOperator(f, ["x1", "x2"], ["y1", "y2"], grad_f=grad_f) - - for idx in [0, 1]: - self.assertGradientChecks(gc, op, [x1, x2], idx, [0, 1]) - self.assertDeviceChecks(dc, op, [x1, x2], [0, 1]) - - @given(inputs=hu.tensors(n=3), **hu.gcs) - @settings(deadline=10000) - def test_gradient_multiple_with_indices(self, inputs, gc, dc): - (x1, x2, x3) = inputs - - def f(inputs, outputs): - for idx in [0, 1, 2]: - self.assertEqual(type(inputs[idx].shape), tuple) - outputs[idx].reshape(inputs[idx].shape) - outputs[idx].data[...] = inputs[idx].data * 2 - - def grad_f(inputs, outputs): - # Ordering is [inputs, outputs, grad_outputs] - self.assertEqual(len(inputs), 8) - self.assertEqual(len(outputs), 1) - for (grad_output_idx, grad_input_idx) in [(6, 0)]: - grad_output = inputs[grad_output_idx] - grad_input = outputs[grad_input_idx] - grad_input.reshape(grad_output.shape) - grad_input.data[...] 
= grad_output.data * 2 - - op = CreatePythonOperator( - f, ["x1", "x2", "x3"], ["y1", "y2", "y3"], - grad_f=grad_f, - grad_output_indices=[0, 2], # Receive grad outputs for y1 and y3 - grad_input_indices=[0] # Produce grad inputs for x1 - ) - - self.assertGradientChecks(gc, op, [x1, x2, x3], 0, [0, 2]) - self.assertDeviceChecks(dc, op, [x1, x2, x3], [0, 1, 2]) diff --git a/caffe2/python/queue_util.py b/caffe2/python/queue_util.py deleted file mode 100644 index c9a91fc27d17..000000000000 --- a/caffe2/python/queue_util.py +++ /dev/null @@ -1,136 +0,0 @@ -## @package queue_util -# Module caffe2.python.queue_util - - - - - -from caffe2.python import core, dataio -from caffe2.python.task import TaskGroup - -import logging - - -logger = logging.getLogger(__name__) - - -class _QueueReader(dataio.Reader): - def __init__(self, wrapper, num_dequeue_records=1): - assert wrapper.schema is not None, ( - 'Queue needs a schema in order to be read from.') - dataio.Reader.__init__(self, wrapper.schema()) - self._wrapper = wrapper - self._num_dequeue_records = num_dequeue_records - - def setup_ex(self, init_net, exit_net): - exit_net.CloseBlobsQueue([self._wrapper.queue()], 0) - - def read_ex(self, local_init_net, local_finish_net): - self._wrapper._new_reader(local_init_net) - dequeue_net = core.Net('dequeue') - fields, status_blob = dequeue( - dequeue_net, - self._wrapper.queue(), - len(self.schema().field_names()), - field_names=self.schema().field_names(), - num_records=self._num_dequeue_records) - return [dequeue_net], status_blob, fields - - def read(self, net): - net, _, fields = self.read_ex(net, None) - return net, fields - - -class _QueueWriter(dataio.Writer): - def __init__(self, wrapper): - self._wrapper = wrapper - - def setup_ex(self, init_net, exit_net): - exit_net.CloseBlobsQueue([self._wrapper.queue()], 0) - - def write_ex(self, fields, local_init_net, local_finish_net, status): - self._wrapper._new_writer(self.schema(), local_init_net) - enqueue_net = core.Net('enqueue') - enqueue(enqueue_net, self._wrapper.queue(), fields, status) - return [enqueue_net] - - -class QueueWrapper(dataio.Pipe): - def __init__(self, handler, schema=None, num_dequeue_records=1): - dataio.Pipe.__init__(self, schema, TaskGroup.LOCAL_SETUP) - self._queue = handler - self._num_dequeue_records = num_dequeue_records - - def reader(self): - return _QueueReader( - self, num_dequeue_records=self._num_dequeue_records) - - def writer(self): - return _QueueWriter(self) - - def queue(self): - return self._queue - - -class Queue(QueueWrapper): - def __init__(self, capacity, schema=None, name='queue', - num_dequeue_records=1): - # find a unique blob name for the queue - net = core.Net(name) - queue_blob = net.AddExternalInput(net.NextName('handler')) - QueueWrapper.__init__( - self, queue_blob, schema, num_dequeue_records=num_dequeue_records) - self.capacity = capacity - self._setup_done = False - - def setup(self, global_init_net): - assert self._schema, 'This queue does not have a schema.' 
-        self._setup_done = True
-        global_init_net.CreateBlobsQueue(
-            [],
-            [self._queue],
-            capacity=self.capacity,
-            num_blobs=len(self._schema.field_names()),
-            field_names=self._schema.field_names())
-
-
-def enqueue(net, queue, data_blobs, status=None):
-    if status is None:
-        status = net.NextName('status')
-    # Enqueueing moves the data into the queue;
-    # duplication will result in data corruption
-    queue_blobs = []
-    for blob in data_blobs:
-        if blob not in queue_blobs:
-            queue_blobs.append(blob)
-        else:
-            logger.warning("Need to copy blob {} to enqueue".format(blob))
-            queue_blobs.append(net.Copy(blob))
-    results = net.SafeEnqueueBlobs([queue] + queue_blobs, queue_blobs + [status])
-    return results[-1]
-
-
-def dequeue(net, queue, num_blobs, status=None, field_names=None,
-            num_records=1):
-    if field_names is not None:
-        assert len(field_names) == num_blobs
-        data_names = [net.NextName(name) for name in field_names]
-    else:
-        data_names = [net.NextName('data', i) for i in range(num_blobs)]
-    if status is None:
-        status = net.NextName('status')
-    results = net.SafeDequeueBlobs(
-        queue, data_names + [status], num_records=num_records)
-    results = list(results)
-    status_blob = results.pop(-1)
-    return results, status_blob
-
-
-def close_queue(step, *queues):
-    close_net = core.Net("close_queue_net")
-    for queue in queues:
-        close_net.CloseBlobsQueue([queue], 0)
-    close_step = core.execution_step("%s_step" % str(close_net), close_net)
-    return core.execution_step(
-        "%s_wrapper_step" % str(close_net),
-        [step, close_step])
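For reference, a sketch of how the raw enqueue/dequeue helpers above were typically wired together; it mirrors the CreateBlobsQueue call used by Queue.setup, and assumes a blob named "data" has already been fed into the workspace:

    from caffe2.python import core, workspace

    init_net = core.Net("init")
    queue_blob = init_net.CreateBlobsQueue([], 1, capacity=8, num_blobs=1)
    workspace.RunNetOnce(init_net)

    io_net = core.Net("io")
    status = enqueue(io_net, queue_blob, ["data"])        # returns the status blob
    blobs, dq_status = dequeue(io_net, queue_blob, num_blobs=1)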
- """ - def __init__(self, fields, name=None, capacity=1, - enforce_unique_name=False, num_threads=1): - assert isinstance(fields, list) or isinstance(fields, Struct), ( - 'fields must be either a Struct or a list of raw field names.') - if isinstance(fields, list): - fields = from_column_list(fields) - self.schema = fields - self.name = name or 'queue' - self.num_threads = num_threads - num_blobs = len(self.schema.field_names()) - init_net = core.Net(self.name + '/init_net') - self.blobs_queue = init_net.CreateBlobsQueue( - [], 1, - capacity=capacity, - num_blobs=num_blobs, - enforce_unique_name=enforce_unique_name) - core.workspace.RunNetOnce(init_net) - - self.writer = _QueueWriter(self.blobs_queue, self.schema) - reader_name = self.name + '_reader' - self.reader = _QueueReader(self.blobs_queue, self.schema, reader_name) - - exit_net = core.Net(self.name + '/exit_net') - exit_net.CloseBlobsQueue(self.blobs_queue, 0) - self.exit_step = core.execution_step( - '{}_close_step'.format(str(exit_net)), - exit_net) - - def build(self, reader, process=None): - """ - Build the producer_step to feed data from reader into the queue, and - return the reader interface. - Inputs: - reader: read data which will be stored in the queue. - process: preprocess data before enqueue. - Outputs: - reader: reader to fetch the data from the queue. - producer_step: the step insert the data into the queue. Should be - run with comsume_step together. - exit_step: the step to close queue - schema: the schema for the reader. - """ - producer_steps = [] - for i in range(self.num_threads): - name = 'reader_' + str(i) - net_reader = core.Net(name) - should_stop, fields = reader.read_record(net_reader) - step_read = core.execution_step(name, net_reader) - - name = 'queue_writer' + str(i) - net_prod = core.Net(name) - field_blobs = fields.field_blobs() - if process: - field_blobs = process(net_prod, fields).field_blobs() - - self.writer.write(net_prod, field_blobs) - step_prod = core.execution_step(name, net_prod) - step = core.execution_step( - 'producer_' + str(i), - [step_read, step_prod], - should_stop_blob=should_stop) - producer_steps.append(step) - producer_step = core.execution_step( - 'producers', - producer_steps, - concurrent_substeps=True) - return self.reader, producer_step, self.exit_step, self.schema diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py deleted file mode 100644 index 8a983e56891c..000000000000 --- a/caffe2/python/recurrent.py +++ /dev/null @@ -1,332 +0,0 @@ -## @package recurrent -# Module caffe2.python.recurrent - - - - - -from caffe2.python import core, workspace - -def recurrent_net( - net, cell_net, inputs, initial_cell_inputs, - links, timestep=None, scope=None, outputs_with_grads=(0,), - recompute_blobs_on_backward=None, forward_only=False, -): - ''' - net: the main net operator should be added to - - cell_net: cell_net which is executed in a recurrent fasion - - inputs: sequences to be fed into the recurrent net. Currently only one input - is supported. It has to be in a format T x N x (D1...Dk) where T is lengths - of the sequence. N is a batch size and (D1...Dk) are the rest of dimentions - - initial_cell_inputs: inputs of the cell_net for the 0 timestamp. - Format for each input is: - (cell_net_input_name, external_blob_with_data) - - links: a dictionary from cell_net input names in moment t+1 and - output names of moment t. Currently we assume that each output becomes - an input for the next timestep. - - timestep: name of the timestep blob to be used. 
diff --git a/caffe2/python/recurrent.py b/caffe2/python/recurrent.py
deleted file mode 100644
index 8a983e56891c..000000000000
--- a/caffe2/python/recurrent.py
+++ /dev/null
@@ -1,332 +0,0 @@
-## @package recurrent
-# Module caffe2.python.recurrent
-
-
-
-
-
-from caffe2.python import core, workspace
-
-def recurrent_net(
-        net, cell_net, inputs, initial_cell_inputs,
-        links, timestep=None, scope=None, outputs_with_grads=(0,),
-        recompute_blobs_on_backward=None, forward_only=False,
-):
-    '''
-    net: the main net the operator should be added to
-
-    cell_net: cell_net which is executed in a recurrent fashion
-
-    inputs: sequences to be fed into the recurrent net. Currently only one
-    input is supported. It has to be in a format T x N x (D1...Dk) where T is
-    the length of the sequence, N is the batch size and (D1...Dk) are the
-    rest of the dimensions
-
-    initial_cell_inputs: inputs of the cell_net for timestep 0.
-    Format for each input is:
-        (cell_net_input_name, external_blob_with_data)
-
-    links: a dictionary mapping cell_net input names at moment t+1 to
-    output names at moment t. Currently we assume that each output becomes
-    an input for the next timestep.
-
-    timestep: name of the timestep blob to be used. If not provided "timestep"
-    is used.
-
-    scope: Internal blobs are going to be scoped in a format
-    <scope_name>/<blob_name>
-    If not provided we generate a scope name automatically
-
-    outputs_with_grads: position indices of output blobs which will receive
-    error gradient (from outside the recurrent network) during backpropagation
-
-    recompute_blobs_on_backward: specify a list of blobs that will be
-                 recomputed for the backward pass, and thus need not be
-                 stored for each forward timestep.
-
-    forward_only: if True, only forward steps are executed
-    '''
-    assert len(inputs) == 1, "Only one input blob is supported so far"
-
-    input_blobs = [str(i[0]) for i in inputs]
-    initial_input_blobs = [str(x[1]) for x in initial_cell_inputs]
-    op_name = net.NextName('recurrent')
-
-    def s(name):
-        # We have to manually scope due to our internal/external blob
-        # relationships.
-        scope_name = op_name if scope is None else scope
-        return "{}/{}".format(str(scope_name), str(name))
-
-    # determine inputs that are considered to be references
-    # it is those that are not referred to in inputs or initial_cell_inputs
-    known_inputs = [str(b) for b in input_blobs + initial_input_blobs]
-    known_inputs += [str(x[0]) for x in initial_cell_inputs]
-    if timestep is not None:
-        known_inputs.append(str(timestep))
-    references = [
-        core.BlobReference(b) for b in cell_net.Proto().external_input
-        if b not in known_inputs]
-
-    inner_outputs = list(cell_net.Proto().external_output)
-    # These gradients are expected to be available during the backward pass
-    inner_outputs_map = {o: o + '_grad' for o in inner_outputs}
-
-    # compute the backward pass of the cell net
-    if not forward_only:
-        backward_ops, backward_mapping = core.GradientRegistry.GetBackwardPass(
-            cell_net.Proto().op, inner_outputs_map)
-        backward_mapping = {str(k): v for k, v in backward_mapping.items()}
-
-        backward_cell_net = core.Net("RecurrentBackwardStep")
-        del backward_cell_net.Proto().op[:]
-
-        if recompute_blobs_on_backward is not None:
-            # Insert operators to re-compute the specified blobs.
-            # They are added in the same order as for the forward pass, thus
-            # the order is correct.
-            recompute_blobs_on_backward = {str(b) for b in
-                                           recompute_blobs_on_backward}
-
-            for op in cell_net.Proto().op:
-                if not recompute_blobs_on_backward.isdisjoint(set(op.output)):
-                    backward_cell_net.Proto().op.extend([op])
-                    # This fires if other outputs than the declared
-                    # are computed by the ops that are recomputed
-                    assert set(op.output).issubset(recompute_blobs_on_backward)
-
-        backward_cell_net.Proto().op.extend(backward_ops)
-        # compute blobs used but not defined in the backward pass
-        backward_ssa, backward_blob_versions = core.get_ssa(
-            backward_cell_net.Proto())
-        undefined = core.get_undefined_blobs(backward_ssa)
-
-        # also add to the output list the intermediate outputs of fwd_step that
-        # are used by backward.
-        ssa, blob_versions = core.get_ssa(cell_net.Proto())
-        scratches = [
-            blob
-            for blob, ver in blob_versions.items()
-            if (ver > 0 and
-                blob in undefined and
-                blob not in cell_net.Proto().external_output)
-        ]
-        backward_cell_net.Proto().external_input.extend(scratches)
-        backward_cell_net.Proto().type = 'simple'
-    else:
-        backward_cell_net = None
-
-    all_inputs = [i[1] for i in inputs] + [
-        x[1] for x in initial_cell_inputs] + references
-    all_outputs = []
-
-    cell_net.Proto().type = 'simple'
-
-    # Internal arguments used by RecurrentNetwork operator
-
-    # Links are in the format blob_name, recurrent_states, offset.
-    # At moment t we know that the corresponding data block is at
-    # position t + offset in the recurrent_states tensor
-    forward_links = []
-    backward_links = []
-
-    # Aliases are used to expose outputs to the external world
-    # Format (internal_blob, external_blob, offset)
-    # Negative offset stands for going from the end,
-    # positive - from the beginning
-    aliases = []
-
-    # States hold inputs to the cell net
-    recurrent_states = []
-
-    for cell_input, _ in initial_cell_inputs:
-        cell_input = str(cell_input)
-        # Recurrent_states is going to be (T + 1) x ...
-        # It stores all inputs and outputs of the cell net over time.
-        # Or their gradients in the case of the backward pass.
-        state = s(cell_input + "_states")
-        states_grad = state + "_grad"
-        cell_output = links[str(cell_input)]
-        forward_links.append((cell_input, state, 0))
-        forward_links.append((cell_output, state, 1))
-
-        aliases.append((state, cell_output + "_all", 1))
-        aliases.append((state, cell_output + "_last", -1))
-        all_outputs.extend([cell_output + "_all", cell_output + "_last"])
-
-        recurrent_states.append(state)
-
-        if backward_cell_net is not None:
-            backward_links.append((cell_output + "_grad", states_grad, 1))
-            backward_cell_net.Proto().external_input.append(
-                str(cell_output) + "_grad")
-
-            recurrent_input_grad = cell_input + "_grad"
-            if not backward_blob_versions.get(recurrent_input_grad, 0):
-                # If nobody writes to this recurrent input gradient, we need
-                # to make sure it gets to the states grad blob after all.
-                # We do this by using backward_links which triggers an alias
-                # This logic is being used for example in a SumOp case
-                backward_links.append(
-                    (backward_mapping[cell_input], states_grad, 0))
-            else:
-                backward_links.append((recurrent_input_grad, states_grad, 0))
-
-
-    for input_t, input_blob in inputs:
-        forward_links.append((str(input_t), str(input_blob), 0))
-
-    if backward_cell_net is not None:
-        for input_t, input_blob in inputs:
-            backward_links.append((
-                backward_mapping[str(input_t)], str(input_blob) + "_grad", 0
-            ))
-        backward_cell_net.Proto().external_input.extend(
-            cell_net.Proto().external_input)
-        backward_cell_net.Proto().external_input.extend(
-            cell_net.Proto().external_output)
-
-    def unpack_triple(x):
-        if x:
-            a, b, c = zip(*x)
-            return a, b, c
-        return [], [], []
-
-    # Splitting into separate lists so we can pass them to c++,
-    # where we assemble them back
-    link_internal, link_external, link_offset = unpack_triple(forward_links)
-    alias_src, alias_dst, alias_offset = unpack_triple(aliases)
-
-    recurrent_inputs = [str(x[1]) for x in initial_cell_inputs]
-
-    # Make sure that recurrent gradients accumulate with internal gradients
-    # (if a blob in the backward_cell_net receives gradient from both an
-    # external connection as well as from within the backward_cell_net,
-    # those gradients need to be added together, rather than one overwriting
-    # the other)
-    if backward_cell_net is not None:
-        proto = backward_cell_net.Proto()
-        operators = []
-        while len(proto.op) > 0:
-            op = proto.op[-1]
-            proto.op.remove(op)
-            operators.append(op)
-        for op in operators[::-1]:
-            proto.op.extend([op])
-            for j, output_blob in enumerate(op.output):
-                if output_blob in proto.external_input:
-                    # In place operation won't cause issues because it takes
-                    # existing value of a blob into account
-                    if output_blob in op.input:
-                        continue
-                    output_blob = core.BlobReference(output_blob)
-                    accum_blob = output_blob + "_accum"
-                    proto.op[-1].output[j] = str(accum_blob)
-                    backward_cell_net.Sum(
-                        [output_blob,
accum_blob], - [output_blob], - ) - - def map_to_dual_list(m): - return [str(x) for x in list(m.keys())] + \ - [str(x) for x in list(m.values())] - - backward_args = {} - if backward_cell_net is not None: - backward_mapping_keys = set(backward_mapping.keys()) - backward_link_internal, backward_link_external, backward_link_offset = \ - unpack_triple(backward_links) - params = [x for x in references if x in backward_mapping_keys] - param_grads = [ - str(backward_mapping[x]) - for x in references - if x in backward_mapping_keys - ] - if recompute_blobs_on_backward is None: - recompute_blobs_on_backward = set() - backward_args = { - 'param': [all_inputs.index(p) for p in params], - 'backward_link_internal': [str(l) for l in backward_link_internal], - 'backward_link_external': [str(l) for l in backward_link_external], - 'backward_link_offset': backward_link_offset, - 'outputs_with_grads': outputs_with_grads, - 'recompute_blobs_on_backward': [ - str(b) for b in recompute_blobs_on_backward - ], - 'param_grads': param_grads, - } - if len(backward_cell_net.Proto().op) != 0: - backward_args['backward_step_net'] = backward_cell_net.Proto() - - - results = net.RecurrentNetwork( - all_inputs, - all_outputs + [s("step_workspaces")], - alias_src=alias_src, - alias_dst=[str(a) for a in alias_dst], - alias_offset=alias_offset, - recurrent_states=recurrent_states, - initial_recurrent_state_ids=[ - all_inputs.index(i) for i in recurrent_inputs - ], - link_internal=[str(l) for l in link_internal], - link_external=[str(l) for l in link_external], - link_offset=link_offset, - enable_rnn_executor=1, - step_net=cell_net.Proto(), - timestep="timestep" if timestep is None else str(timestep), - **backward_args - ) - - # Restore net type since 'rnn' is not recognized outside RNNs - cell_net.Proto().type = 'simple' - - # The last output is a list of step workspaces, - # which is only needed internally for gradient propagation - return results[:-1] - - -def set_rnn_executor_config(rnn_op, num_threads=None, max_cuda_streams=None): - from caffe2.proto import caffe2_pb2 - assert rnn_op.type in {'RecurrentNetwork', 'RecurrentNetworkGradient'} - - def add_arg(s, v): - a = caffe2_pb2.Argument() - a.name = "rnn_executor." + s - a.i = v - rnn_op.arg.extend([a]) - - if num_threads is not None: - add_arg('num_threads', num_threads) - if max_cuda_streams is not None: - add_arg('max_cuda_streams', max_cuda_streams) - - -def retrieve_step_blobs(net, prefix='rnn'): - ''' - Retrieves blobs from step workspaces (which contain intermediate recurrent - network computation for each timestep) and puts them in the global - workspace. This allows access to the contents of this intermediate - computation in python. Returns the list of extracted blob names. 
-
-    net: the net from which the step workspace blobs should be extracted
-
-    prefix: prefix to append to extracted blob names when placing them in the
-    global workspace
-    '''
-    count = 1
-    output_list = []
-    for op in net.Proto().op:
-        if op.type == "RecurrentNetwork":
-            blob_name = prefix + "_" + str(count)
-            count = count + 1
-            scratch_workspaces_blob_name = op.output[-1]
-            workspace.RunOperatorOnce(
-                core.CreateOperator(
-                    "RecurrentNetworkBlobFetcher",
-                    [scratch_workspaces_blob_name],
-                    [blob_name],
-                    prefix=prefix
-                )
-            )
-            output_list += workspace.FetchBlob(blob_name).tolist()
-    return output_list
diff --git a/caffe2/python/regularizer.py b/caffe2/python/regularizer.py
deleted file mode 100644
index 4236647ed198..000000000000
--- a/caffe2/python/regularizer.py
+++ /dev/null
@@ -1,549 +0,0 @@
-# @package regularizer
-# Module caffe2.python.regularizer
-
-
-from caffe2.python import core, utils
-import numpy as np
-
-
-class RegularizationBy:
-    AFTER_OPTIMIZER = "after_optimizer"
-    ON_LOSS = "on_loss"
-
-
-class Regularizer:
-    def __init__(self):
-        self.kEpsilon = 1e-9
-
-    """
-    Adds regularization to train_net for the given parameter. The
-    regularization coefficient is given at initialization.
-    The param should be a BlobReference.
-    """
-
-    def __call__(self, net, param_init_net, param, grad=None, by=None):
-        assert isinstance(param, core.BlobReference)
-        by_enum = utils.EnumClassKeyVals(RegularizationBy)
-        assert by in by_enum.values(), (
-            "Regularizer of type {} is called with invalid by={}, "
-            "not in {}".format(self.__class__, by, by_enum.values())
-        )
-        run_func = "_run_" + by
-        assert hasattr(
-            self, run_func
-        ), "Regularizer of type {} does not implement function {}".format(
-            self.__class__, run_func
-        )
-        return getattr(self, run_func)(net, param_init_net, param, grad)
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        return None
-
-    def _run_after_optimizer(self, net, param_init_net, param, grad):
-        return None
-
-    def _feature_grouping(self, param, net):
-        # Possible alternative grouping method via summing over absolute values
-        # Compute l2norm over feature weights
-        # pow( sum_i { pow(theta_i, 2) } , 0.5)
-        param_mul = net.Mul([param, param], [net.NextScopedBlob("param_mul")])
-        param_reduced = net.ReduceFrontSum(
-            [param_mul], [net.NextScopedBlob("param_reduced")]
-        )
-        grouped_feature_weight_vec = net.Pow(
-            [param_reduced],
-            [net.NextScopedBlob("grouped_feature_weight_vec")],
-            exponent=0.5,
-        )
-
-        return grouped_feature_weight_vec
-
-    def _ensure_clipped(
-        self,
-        net,
-        param,
-        grad=None,
-        min=None,
-        max=None,
-        open_range=False,
-        left_open=False,
-        right_open=False,
-    ):
-        min = (
-            min + self.kEpsilon
-            if min is not None and (open_range or left_open)
-            else min
-        )
-        max = (
-            max - self.kEpsilon
-            if max is not None and (open_range or right_open)
-            else max
-        )
-        input_blobs = (
-            [param, grad.indices, grad.values]
-            if isinstance(grad, core.GradientSlice)
-            else [param]
-        )
-        net.EnsureClipped(input_blobs, [param], min=min, max=max)
-
-
-class L1Norm(Regularizer):
-    def __init__(self, reg_lambda):
-        super().__init__()
-        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
-
-        self.reg_lambda = reg_lambda
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        output_blob = net.NextScopedBlob(param + "_l1_regularization")
-        net.LpNorm([param], [output_blob], p=1)
-        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
-        return output_blob
-
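In numpy terms, the on-loss term built by L1Norm is just a scaled l1 norm; a reference sketch of the computation (not the Caffe2 ops themselves):

    import numpy as np

    def l1_penalty(param, reg_lambda):
        # LpNorm(p=1) followed by Scale(scale=reg_lambda)
        return reg_lambda * np.abs(param).sum()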
-class LpNorm(Regularizer):
-    def __init__(self, reg_lambda, p_value=0.5):
-        """
-        reg_lambda: parameter to scale regularization by
-
-        p_value: determines what type of Lp norm to calculate. If p > 0,
-        we will calculate the Lp norm with the formula:
-            pow( sum_i { pow(theta_i, p) } , 1/p)
-        """
-        super().__init__()
-        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
-        assert p_value > 0, "p_value factor should be greater than 0"
-        self.p_value = p_value
-        self.reg_lambda = reg_lambda
-
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        # TODO: the second dim (num of input nodes) of param is after feature preproc,
-        # and does not correspond to the original num of dense features.
-        # In the future, will want to create a util to reduce the input dim of param to
-        # match the num of dense features.
-
-        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
-        grouped_feature_weight_vec = self._feature_grouping(param, net)
-
-        # Compute the Lp norm:
-        # pow( sum_i { pow(theta_i, p) } , 1/p)
-        lp_vec_raised = net.Pow(
-            [grouped_feature_weight_vec],
-            [net.NextScopedBlob("lp_vec_raised")],
-            exponent=self.p_value,
-        )
-        lp_vec_summed = net.ReduceFrontSum(
-            [lp_vec_raised], [net.NextScopedBlob("lp_vec_summed")]
-        )
-        lp_norm = net.Pow(
-            [lp_vec_summed],
-            [net.NextScopedBlob("lp_vec")],
-            exponent=(1 / self.p_value),
-        )
-        net.Scale([lp_norm], [output_blob], scale=self.reg_lambda)
-        return output_blob
-
-
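LpNorm applies the p-norm formula from its docstring to the per-feature l2 norms produced by _feature_grouping. A numpy reference sketch of the same computation:

    import numpy as np

    def lp_penalty(param, p, reg_lambda):
        # _feature_grouping: per-column l2 norm, pow(sum_i theta_i^2, 0.5)
        grouped = np.sqrt((param * param).sum(axis=0))
        # pow(sum_i grouped_i^p, 1/p), scaled by reg_lambda
        return reg_lambda * (grouped ** p).sum() ** (1.0 / p)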
-class L0ApproxNorm(Regularizer):
-    def __init__(self, reg_lambda, alpha=0.01, budget=0):
-        """
-        reg_lambda: parameter to scale regularization by
-
-        alpha: hyperparameter to tune that is only used in the calculation
-        of the approximate L0 norm
-
-        budget: desired number of features. If the number of features is greater
-        than the budget amount, then the least important features will
-        be penalized. If there are fewer features than the desired
-        budget, no penalization will be applied. Optional parameter, if
-        0, then no budget is used
-        """
-        super().__init__()
-        assert reg_lambda > 0, "factor ahead of regularization should be greater than 0"
-        assert alpha > 0, "alpha factor must be a positive value greater than 0"
-        assert budget >= 0, "budget factor must be greater than or equal to 0"
-        self.reg_lambda = reg_lambda
-        self.alpha = alpha
-        self.budget = float(budget)  # budget must be float for future calculations
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        # TODO: the second dim (num of input nodes) of param is after feature preproc,
-        # and does not correspond to the original num of dense features.
-        # In the future, will want to create a util to reduce the input dim of param to
-        # match the num of dense features.
-
-        output_blob = net.NextScopedBlob(param + "_dense_feature_regularization")
-        grouped_feature_weight_vec = self._feature_grouping(param, net)
-
-        # compute the approximate L0 norm
-        # sum_i min(abs(theta_i), alpha) / alpha
-        l0_abs = net.Abs([grouped_feature_weight_vec], [net.NextScopedBlob("l0_abs")])
-        l0_min = net.Clip([l0_abs], [net.NextScopedBlob("l0_min")], max=self.alpha)
-        l0_summed = net.ReduceFrontSum([l0_min], [net.NextScopedBlob("l0_summed")])
-        l0_norm = net.Scale(
-            [l0_summed], [net.NextScopedBlob("l0_norm")], scale=(1 / self.alpha)
-        )
-
-        # incorporate budget factor
-        # regularization = reg_lambda * max(0, l0_norm - budget)
-        if self.budget:
-            budget_blob = net.ConstantFill([], "budget", shape=[1], value=self.budget)
-            l0_sub_budget = net.Sub(
-                [l0_norm, budget_blob], [net.NextScopedBlob("l0_budget")]
-            )
-            relu_l0_sub_budget = net.Relu(
-                [l0_sub_budget], [net.NextScopedBlob("relu_l0_sub_budget")]
-            )
-            net.Scale([relu_l0_sub_budget], [output_blob], scale=self.reg_lambda)
-        else:
-            net.Scale([l0_norm], [output_blob], scale=self.reg_lambda)
-        return output_blob
-
-class L1NormTrimmed(Regularizer):
-    """
-    The Trimmed Lasso: Sparsity and Robustness. https://arxiv.org/abs/1708.04527
-    """
-    def __init__(self, reg_lambda, k):
-        super().__init__()
-        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
-        assert isinstance(k, int), "k should be an integer, the expected number of weights kept after selection"
-        assert k >= 1, "k should be at least 1"
-
-        self.reg_lambda = reg_lambda
-        self.k = k
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        output_blob = net.NextScopedBlob(param + "_l1_trimmed_regularization")
-        abs = net.Abs([param], [net.NextScopedBlob("abs")])
-        sum_abs = net.SumElements([abs], [net.NextScopedBlob("sum_abs")], average=False)
-        topk, _, _ = net.TopK([abs], [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")], k=self.k)
-        topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False)
-        net.Sub([sum_abs, topk_sum], [output_blob])
-        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
-        return output_blob
-
-
-class L2Norm(Regularizer):
-    def __init__(self, reg_lambda):
-        super().__init__()
-        assert reg_lambda >= 0, "factor ahead of regularization should be 0 or positive"
-
-        self.reg_lambda = reg_lambda
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        output_blob = net.NextScopedBlob(param + "_l2_regularization")
-        net.LpNorm([param], [output_blob], p=2)
-        net.Scale([output_blob], [output_blob], scale=self.reg_lambda)
-        return output_blob
-
-
-class ElasticNet(Regularizer):
-    def __init__(self, l1, l2):
-        super().__init__()
-        self.l1 = l1
-        self.l2 = l2
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        output_blob = net.NextScopedBlob(param + "_elastic_net_regularization")
-        l2_blob = net.NextScopedBlob(param + "_l2_blob")
-        l1_blob = net.NextScopedBlob(param + "_l1_blob")
-        net.LpNorm([param], [l2_blob], p=2)
-        net.LpNorm([param], [l1_blob], p=1)
-        net.Scale([l2_blob], [l2_blob], scale=self.l2)
-        net.Scale([l1_blob], [l1_blob], scale=self.l1)
-        net.Add([l1_blob, l2_blob], [output_blob])
-        return output_blob
-
-
-class ElasticNetL1NormTrimmed(Regularizer):
-    def __init__(self, l1, l2, k):
-        super().__init__()
-        self.l1 = l1
-        self.l2 = l2
-        self.k = k
-
-    def _run_on_loss(self, net, param_init_net, param, grad=None):
-        output_blob = net.NextScopedBlob(param + "_elastic_net_l1_trimmed_regularization")
-        l2_blob =
net.NextScopedBlob(param + "_l2_blob") - net.LpNorm([param], [l2_blob], p=2) - net.Scale([l2_blob], [l2_blob], scale=self.l2) - - l1_blob = net.NextScopedBlob(param + "_l1_blob") - abs = net.Abs([param], [net.NextScopedBlob("abs")]) - sum_abs = net.SumElements([abs], [net.NextScopedBlob("sum_abs")], average=False) - topk, _, _ = net.TopK([abs], [net.NextScopedBlob("topk"), net.NextScopedBlob("id"), net.NextScopedBlob("flat_id")], k=self.k) - topk_sum = net.SumElements([topk], [net.NextScopedBlob("topk_sum")], average=False) - net.Sub([sum_abs, topk_sum], [l1_blob]) - net.Scale([l1_blob], [l1_blob], scale=self.l1) - - net.Add([l1_blob, l2_blob], [output_blob]) - return output_blob - - -class MaxNorm(Regularizer): - def __init__(self, norm=1.0, dtype=None): - super().__init__() - self.norm = norm - self.dtype = dtype - - def _run_after_optimizer(self, net, param_init_net, param, grad): - assert self.norm > 0, "norm should be bigger than 0." - if isinstance(grad, core.GradientSlice): - if self.dtype and self.dtype == 'fp16': - net.Float16SparseNormalize( - [param, grad.indices], - [param], - use_max_norm=True, - norm=self.norm, - ) - else: - net.SparseNormalize( - [param, grad.indices], - [param], - use_max_norm=True, - norm=self.norm, - ) - else: - raise NotImplementedError("MaxNorm is not supported for dense parameters") - - -class ConstantNorm(Regularizer): - def __init__(self, norm=1.0): - super().__init__() - self.norm = norm - - def _run_after_optimizer(self, net, param_init_net, param, grad): - assert self.norm > 0, "norm should be bigger than 0." - if isinstance(grad, core.GradientSlice): - net.SparseNormalize( - [param, grad.indices], - [param], - use_max_norm=False, - norm=self.norm, - ) - else: - raise NotImplementedError( - "ConstantNorm is not supported for dense parameters" - ) - - -class SparseLpNorm(Regularizer): - def __init__(self, p, reg_lambda): - super().__init__() - assert p in (1.0, 2.0), "Sparse Lp regularization only implemented for p = 1.0 and p = 2.0." - assert reg_lambda > 0, "factor ahead of regularization should be greater than 0." - self.p = p - self.reg_lambda = reg_lambda - - def _run_after_optimizer(self, net, param_init_net, param, grad): - if isinstance(grad, core.GradientSlice): - net.SparseLpRegularizer( - [param, grad.indices], - [param], - p=self.p, - reg_lambda=self.reg_lambda, - ) - else: - raise NotImplementedError("SparseLpNorm is not supported for dense parameters") - - -class SparseL1Norm(SparseLpNorm): - def __init__(self, reg_lambda): - super().__init__(p=1.0, reg_lambda=reg_lambda) - - -class SparseL2Norm(SparseLpNorm): - def __init__(self, reg_lambda): - super().__init__(p=2.0, reg_lambda=reg_lambda) - - -class LogBarrier(Regularizer): - """ - Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science, - 35(67-68), 7. Chapter 19 - """ - - def __init__(self, reg_lambda, discount_policy="inv", discount_options=None): - """ - discount is a positive weight that is decreasing, and here it is implemented - similar to the learning rate. 
It is specified by a learning rate policy and - corresponding options - """ - super().__init__() - assert reg_lambda > 0, "factor ahead of regularization should be 0 or positive" - self.reg_lambda = reg_lambda - self.discount_policy = discount_policy - self.discount_options = discount_options or {"gamma": 1.0, "power": 1.0} - - def _run_on_loss(self, net, param_init_net, param, grad=None): - iteration = utils.BuildUniqueMutexIter(param_init_net, net) - # Since we are most likely to do a minimization - discount = net.NextScopedBlob(param + "_log_barrier_discount") - net.LearningRate( - [iteration], - [discount], - base_lr=-self.reg_lambda, - policy=self.discount_policy, - **self.discount_options - ) - # TODO(xlwang): param might still be negative at the initialization time or - # slightly negative due to the distributed training. Enforce it's non-negativity - # for now (at least above machine epsilon) - param_non_neg = net.NextScopedBlob(param + "_non_neg") - net.Clip([param], [param_non_neg], min=self.kEpsilon) - param_log = net.NextScopedBlob(param + "_log") - net.Log([param_non_neg], [param_log]) - param_log_sum = net.NextScopedBlob(param + "_log_sum") - net.SumElements([param_log], [param_log_sum]) - output_blob = net.NextScopedBlob(param + "_log_barrier") - net.Mul([param_log_sum, discount], [output_blob], broadcast=1) - return output_blob - - def _run_after_optimizer(self, net, param_init_net, param, grad): - self._ensure_clipped(net, param, grad, min=0, open_range=True) - - -class BoundedGradientProjection(Regularizer): - """ - Wright, S., & Nocedal, J. (1999). Numerical optimization. Springer Science, - 35(67-68), 7. Chapter 16 - """ - - def __init__( - self, lb=None, ub=None, left_open=False, right_open=False, epsilon=None - ): - super().__init__() - lb = float(lb) if lb is not None else None - ub = float(ub) if ub is not None else None - epsilon = float(epsilon) if epsilon is not None else self.kEpsilon - assert epsilon > 0, "Bounded Gradient Projection with invalid eps={eps}".format( - eps=epsilon - ) - assert ( - (lb is None) - or (ub is None) - or ( - lb + (epsilon if left_open else 0.) - <= ub - (epsilon if right_open else 0.) - ) - ), ( - "Bounded Gradient Projection with invalid " - "{lp}ub={ub}, lb={lb}{rp}, eps={eps}".format( - lb=lb, - ub=ub, - lp="(" if left_open else "[", - rp=")" if right_open else "]", - eps=epsilon, - ) - ) - self.left_open = left_open - self.right_open = right_open - self.kEpsilon = epsilon - self.lb = lb - self.ub = ub - - def _run_after_optimizer(self, net, param_init_net, param, grad): - self._ensure_clipped( - net, - param, - grad, - min=self.lb, - max=self.ub, - left_open=self.left_open, - right_open=self.right_open, - ) - - -class GroupL1Norm(Regularizer): - """ - Scardapane, Simone, et al. "Group sparse regularization for deep neural networks." - Neurocomputing 241 (2017): 81-89. - - This regularizer computes l1 norm of a weight matrix based on groups. - There are essentially three stages in the computation: - 1. Compute the l2 norm on all the members of each group - 2. Scale each l2 norm by the size of each group - 3. Compute the l1 norm of the scaled l2 norms - """ - def __init__(self, reg_lambda, groups, stabilizing_val=0): - """ - Args: - reg_lambda: The weight of the regularization term. - groups: A list of integers describing the size of each group. - The length of the list is the number of groups. - - Optional Args: - stabilizing_val: The computation of GroupL1Norm involves the Sqrt - operator. 
When values are small, its gradient can be numerically - unstable and causing gradient explosion. Adding this term to - stabilize gradient calculation. Recommended value of this term is - 1e-8, but it depends on the specific scenarios. If the implementation - of the gradient operator of Sqrt has taken into stability into - consideration, this term won't be necessary. - """ - super().__init__() - assert ( - (reg_lambda) >= 0 - ), "regularization weight should be 0 or positive" - assert isinstance(groups, list), "groups needs to be a list" - - self.reg_lambda = (reg_lambda) - self.groups = groups - self.stabilizing_val = stabilizing_val - - def _run_on_loss(self, net, param_init_net, param, grad=None): - """ - Args: - param: The input blob to regularize. It should be a weight matrix - blob with shape (output_dim, input_dim). input_dim should be - equal to the sum of self.groups. - - Returns: - group_l1_norm: The output blob after applying regularization. - - These are the steps of computation: - 1. square all elements - 2. sum by row - 3. lengthssum by group - 4. square_root all elements - 5. normalize each group based on group size - 6. compute l1 norm of each group - 7. scale the result with the regularization lambda - """ - squared = net.Sqr(param) - reduced_sum = net.ReduceSum(squared, axes=[0], keepdims=0) - lengths_sum = net.LengthsSum( - [ - reduced_sum, - net.GivenTensorIntFill( - [], 1, shape=[len(self.groups)], values=self.groups - ), - ] - ) - - if self.stabilizing_val: - net.Add( - [lengths_sum, net.ConstantFill([], 1, value=self.stabilizing_val)], - [lengths_sum], - broadcast=1, - ) - - sqrt = net.Sqrt(lengths_sum) - - # Here we combine step 5 and step 7 into one operator call to - # improve efficiency: values = np.sqrt(self.groups) * self.reg_lambda - l2_scaled = net.Mul( - [ - sqrt, - net.GivenTensorFill( - [], - shape=[len(self.groups)], - values=np.sqrt(self.groups) * self.reg_lambda - ) - ], - ['normalized_l2_norm_scaled'] - ) - - group_l1_norm = net.LpNorm(l2_scaled, ['group_l1_nrom'], p=1) - - return group_l1_norm diff --git a/caffe2/python/regularizer_context.py b/caffe2/python/regularizer_context.py deleted file mode 100644 index 27dc37818961..000000000000 --- a/caffe2/python/regularizer_context.py +++ /dev/null @@ -1,37 +0,0 @@ -# @package regularizer_context -# Module caffe2.python.regularizer_context - - - - - -from caffe2.python import context -from caffe2.python.modifier_context import ( - ModifierContext, UseModifierBase) - - -class RegularizerContext(ModifierContext, context.DefaultManaged): - """ - provide context to allow param_info to have different regularizers - """ - - def has_regularizer(self, name): - return self._has_modifier(name) - - def get_regularizer(self, name): - assert self.has_regularizer(name), ( - "{} regularizer is not provided!".format(name)) - return self._get_modifier(name) - - -class UseRegularizer(UseModifierBase): - ''' - context class to allow setting the current context. 
- Example usage with layer: - regularizers = {'reg1': reg1, 'reg2': reg2} - with UseRegularizer(regularizers): - reg = RegularizerContext.current().get_regularizer('reg1') - layer(reg=reg) - ''' - def _context_class(self): - return RegularizerContext diff --git a/caffe2/python/regularizer_test.py b/caffe2/python/regularizer_test.py deleted file mode 100644 index ac46746c096f..000000000000 --- a/caffe2/python/regularizer_test.py +++ /dev/null @@ -1,258 +0,0 @@ - - -import caffe2.python.hypothesis_test_util as hu -import hypothesis.strategies as st -import numpy as np -import numpy.testing as npt -from caffe2.python import core, layer_model_instantiator, regularizer, schema, workspace -from caffe2.python.layer_test_util import LayersTestCase -from caffe2.python.optimizer import SgdOptimizer -from caffe2.python.regularizer import L1Norm, RegularizationBy -from caffe2.python.regularizer_context import RegularizerContext, UseRegularizer -from hypothesis import given - - -class TestRegularizerContext(LayersTestCase): - @given(X=hu.arrays(dims=[2, 5])) - def test_regularizer_context(self, X): - weight_reg_out = L1Norm(0.2) - bias_reg_out = L1Norm(0) - regularizers = {"WEIGHT": weight_reg_out, "BIAS": bias_reg_out} - - output_dims = 2 - input_record = self.new_record(schema.Scalar((np.float32, (5,)))) - schema.FeedRecord(input_record, [X]) - - with UseRegularizer(regularizers): - weight_reg = RegularizerContext.current().get_regularizer("WEIGHT") - bias_reg = RegularizerContext.current().get_regularizer("BIAS") - optim = SgdOptimizer(0.15) - - assert ( - weight_reg == weight_reg_out - ), "fail to get correct weight reg from context" - assert bias_reg == bias_reg_out, "fail to get correct bias reg from context" - fc_output = self.model.FC( - input_record, - output_dims, - weight_optim=optim, - bias_optim=optim, - weight_reg=weight_reg, - bias_reg=bias_reg, - ) - # model.output_schema has to a struct - self.model.output_schema = schema.Struct(("fc_output", fc_output)) - - self.assertEqual(schema.Scalar((np.float32, (output_dims,))), fc_output) - - _, train_net = layer_model_instantiator.generate_training_nets(self.model) - ops = train_net.Proto().op - ops_type_list = [ops[i].type for i in range(len(ops))] - assert ops_type_list.count("LpNorm") == 2 - assert ops_type_list.count("Scale") == 4 - assert ops_type_list.count("LpNormGradient") == 2 - - -class TestRegularizer(LayersTestCase): - @given(X=hu.arrays(dims=[2, 5], elements=hu.floats(min_value=-1.0, max_value=1.0))) - def test_log_barrier(self, X): - param = core.BlobReference("X") - workspace.FeedBlob(param, X) - train_init_net, train_net = self.get_training_nets() - reg = regularizer.LogBarrier(1.0) - output = reg(train_net, train_init_net, param, by=RegularizationBy.ON_LOSS) - reg( - train_net, - train_init_net, - param, - grad=None, - by=RegularizationBy.AFTER_OPTIMIZER, - ) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - - def ref(X): - return ( - np.array(np.sum(-np.log(np.clip(X, 1e-9, None))) * 0.5).astype( - np.float32 - ), - np.clip(X, 1e-9, None), - ) - - for x, y in zip(workspace.FetchBlobs([output, param]), ref(X)): - npt.assert_allclose(x, y, rtol=1e-3) - - @given( - X=hu.arrays(dims=[2, 5], elements=hu.floats(min_value=-1.0, max_value=1.0)), - left_open=st.booleans(), - right_open=st.booleans(), - eps=hu.floats(min_value=1e-6, max_value=1e-4), - ub=hu.floats(min_value=-1.0, max_value=1.0), - lb=hu.floats(min_value=-1.0, max_value=1.0), - **hu.gcs_cpu_only - ) - def test_bounded_grad_proj(self, X, 
left_open, right_open, eps, ub, lb, gc, dc): - if ub - (eps if right_open else 0.) < lb + (eps if left_open else 0.): - return - param = core.BlobReference("X") - workspace.FeedBlob(param, X) - train_init_net, train_net = self.get_training_nets() - reg = regularizer.BoundedGradientProjection( - lb=lb, ub=ub, left_open=left_open, right_open=right_open, epsilon=eps - ) - output = reg(train_net, train_init_net, param, by=RegularizationBy.ON_LOSS) - reg( - train_net, - train_init_net, - param, - grad=None, - by=RegularizationBy.AFTER_OPTIMIZER, - ) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - - def ref(X): - return np.clip( - X, lb + (eps if left_open else 0.), ub - (eps if right_open else 0.) - ) - - assert output is None - npt.assert_allclose(workspace.blobs[param], ref(X), atol=1e-7) - - @given( - output_dim=st.integers(1, 10), - input_num=st.integers(3, 30), - reg_weight=st.integers(0, 10) - ) - def test_group_l1_norm(self, output_dim, input_num, reg_weight): - """ - 1. create a weight blob - 2. create random group splits - 3. run group_l1_nrom with the weight blob - 4. run equivalent np operations to calculate group l1 norm - 5. compare if the results from 3 and 4 are equal - """ - def compare_reference(weight, group_boundaries, reg_lambda, output): - group_splits = np.hsplit(weight, group_boundaries[1:-1]) - l2_reg = np.sqrt([np.sum(np.square(g)) for g in group_splits]) - l2_normalized = np.multiply(l2_reg, - np.array([np.sqrt(g.shape[1]) for g in group_splits])) - result = np.multiply(np.sum(l2_normalized), reg_lambda) - npt.assert_almost_equal(result, workspace.blobs[output], decimal=2) - - weight = np.random.rand(output_dim, input_num).astype(np.float32) - - feature_num = np.random.randint(low=1, high=input_num - 1) - group_boundaries = [0] - group_boundaries = np.append( - group_boundaries, - np.sort( - np.random.choice(range(1, input_num - 1), feature_num, replace=False) - ), - ) - group_boundaries = np.append(group_boundaries, [input_num]) - split_info = np.diff(group_boundaries) - - weight_blob = core.BlobReference("weight_blob") - workspace.FeedBlob(weight_blob, weight) - - train_init_net, train_net = self.get_training_nets() - reg = regularizer.GroupL1Norm(reg_weight * 0.1, split_info.tolist()) - output = reg( - train_net, train_init_net, weight_blob, by=RegularizationBy.ON_LOSS - ) - - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - compare_reference(weight, group_boundaries, reg_weight * 0.1, output) - - @given( - param_dim=st.integers(10, 30), - k=st.integers(5, 9), - reg_weight=st.integers(0, 10) - ) - def test_l1_norm_trimmed(self, param_dim, k, reg_weight): - weight = np.random.rand(param_dim).astype(np.float32) - weight_blob = core.BlobReference("weight_blob") - workspace.FeedBlob(weight_blob, weight) - - train_init_net, train_net = self.get_training_nets() - reg = regularizer.L1NormTrimmed(reg_weight * 0.1, k) - output = reg( - train_net, train_init_net, weight_blob, by=RegularizationBy.ON_LOSS - ) - - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - result = np.sum(np.sort(np.absolute(weight))[:(param_dim - k)]) * reg_weight * 0.1 - npt.assert_almost_equal(result, workspace.blobs[output], decimal=2) - - @given( - param_dim=st.integers(10, 30), - k=st.integers(5, 9), - l1=st.integers(0, 10), - l2=st.integers(0, 10) - ) - def test_elastic_l1_norm_trimmed(self, param_dim, k, l1, l2): - weight = np.random.rand(param_dim).astype(np.float32) - weight_blob = core.BlobReference("weight_blob") - 
workspace.FeedBlob(weight_blob, weight) - - train_init_net, train_net = self.get_training_nets() - reg = regularizer.ElasticNetL1NormTrimmed(l1 * 0.1, l2 * 0.1, k) - output = reg( - train_net, train_init_net, weight_blob, by=RegularizationBy.ON_LOSS - ) - - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - l1_norm = np.sum(np.sort(np.absolute(weight))[:(param_dim - k)]) - l2_norm = np.sum(np.square(weight)) - result = l1_norm * l1 * 0.1 + l2_norm * l2 * 0.1 - npt.assert_almost_equal(result, workspace.blobs[output], decimal=2) - - @given( - row_dim=st.integers(5, 10), - norm=st.floats(min_value=1.0, max_value=4.0), - data_strategy=st.data(), - ) - def test_fp16_max_norm(self, row_dim, norm, data_strategy): - weight = np.random.rand(row_dim, 5).astype(np.float16) - grad = np.random.rand(row_dim, 5).astype(np.float16) - - # generate indices that will be updated - indices = data_strategy.draw( - hu.tensor( - dtype=np.int64, - min_dim=1, - max_dim=1, - elements=st.sampled_from(np.arange(weight.shape[0])), - ) - ) - indices = np.unique(indices) - - # compute expected result - result = weight.copy() - # prevent dived by zero - eps = 1e-12 - norms = np.sqrt(np.sum(result[indices, ] ** 2, axis=1, keepdims=True)) - # if the norms are smaller than max_norm, then it doesn't need update - desired = np.clip(norms, 0, norm) - # apply max norm - result[indices, ] *= desired / (eps + norms) - - weight_blob = core.BlobReference("weight_blob") - workspace.FeedBlob(weight_blob, weight) - grad_blob = core.BlobReference("grad_blob") - workspace.FeedBlob(grad_blob, grad) - indices_blob = core.BlobReference("indices") - workspace.FeedBlob(indices_blob, indices) - grad_blob_slice = core.GradientSlice(indices=indices_blob, values=grad_blob) - train_init_net, train_net = self.get_training_nets() - reg = regularizer.MaxNorm(norm, dtype='fp16') - reg( - train_net, train_init_net, weight_blob, grad_blob_slice, by=RegularizationBy.AFTER_OPTIMIZER - ) - workspace.RunNetOnce(train_init_net) - workspace.RunNetOnce(train_net) - npt.assert_almost_equal(result, workspace.FetchBlob('weight_blob'), decimal=2) diff --git a/caffe2/python/rnn/__init__.py b/caffe2/python/rnn/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/rnn/lstm_comparison.py b/caffe2/python/rnn/lstm_comparison.py deleted file mode 100644 index 34fddbc1a66e..000000000000 --- a/caffe2/python/rnn/lstm_comparison.py +++ /dev/null @@ -1,58 +0,0 @@ - - - - -from caffe2.python import workspace, core, lstm_benchmark, utils -from copy import copy - -@utils.debug -def Compare(args): - results = [] - num_iters = 1000 - args.gpu = True - with core.DeviceScope(core.DeviceOption(workspace.GpuDeviceType, 0)): - for batch_size in [64, 128, 256]: - for seq_length in [20, 100]: - for hidden_dim in [40, 100, 400, 800]: - args.batch_size = batch_size - args.seq_length = seq_length - args.hidden_dim = hidden_dim - args.data_size = batch_size * seq_length * num_iters - args.iters_to_report = num_iters // 3 - - args.implementation = 'own' - t_own = lstm_benchmark.Benchmark(args) - workspace.ResetWorkspace() - args.implementation = 'cudnn' - t_cudnn = lstm_benchmark.Benchmark(args) - workspace.ResetWorkspace() - results.append((copy(args), float(t_own), float(t_cudnn))) - print(args) - print("t_cudnn / t_own: {}".format(t_cudnn / t_own)) - - for args, t_own, t_cudnn in results: - print("{}: cudnn time: {}, own time: {}, ratio: {}".format( - str(args), t_cudnn, t_own, t_cudnn / t_own)) - - ratio_sum = 0 - for args, 
t_own, t_cudnn in results: - ratio = float(t_cudnn) / t_own - ratio_sum += ratio - print("hidden_dim: {}, seq_lengths: {}, batch_size: {}, num_layers: {}:" - " cudnn time: {}, own time: {}, ratio: {}".format( - args.hidden_dim, args.seq_length, args.batch_size, - args.num_layers, t_cudnn, t_own, ratio)) - - print("Ratio average: {}".format(ratio_sum / len(results))) - - -if __name__ == '__main__': - args = lstm_benchmark.GetArgumentParser().parse_args() - - workspace.GlobalInit([ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_print_blob_sizes_at_exit=0', - '--caffe2_gpu_memory_tracking=1']) - - Compare(args) diff --git a/caffe2/python/rnn/rnn_cell_test_util.py b/caffe2/python/rnn/rnn_cell_test_util.py deleted file mode 100644 index 95728d682bfa..000000000000 --- a/caffe2/python/rnn/rnn_cell_test_util.py +++ /dev/null @@ -1,75 +0,0 @@ - - - - - -from caffe2.python import workspace, scope -from caffe2.python.model_helper import ModelHelper - -import numpy as np - - -def sigmoid(x): - return 1.0 / (1.0 + np.exp(-x)) - - -def tanh(x): - return 2.0 * sigmoid(2.0 * x) - 1 - - -def _prepare_rnn( - t, n, dim_in, create_rnn, outputs_with_grads, - forget_bias, memory_optim=False, - forward_only=False, drop_states=False, T=None, - two_d_initial_states=None, dim_out=None, - num_states=2, - **kwargs -): - if dim_out is None: - dim_out = [dim_in] - print("Dims: ", t, n, dim_in, dim_out) - - model = ModelHelper(name='external') - - if two_d_initial_states is None: - two_d_initial_states = np.random.randint(2) - - def generate_input_state(n, d): - if two_d_initial_states: - return np.random.randn(n, d).astype(np.float32) - else: - return np.random.randn(1, n, d).astype(np.float32) - - states = [] - for layer_id, d in enumerate(dim_out): - for i in range(num_states): - state_name = "state_{}/layer_{}".format(i, layer_id) - states.append(model.net.AddExternalInput(state_name)) - workspace.FeedBlob( - states[-1], generate_input_state(n, d).astype(np.float32)) - - # Due to convoluted RNN scoping logic we make sure that things - # work from a namescope - with scope.NameScope("test_name_scope"): - input_blob, seq_lengths = model.net.AddScopedExternalInputs( - 'input_blob', 'seq_lengths') - - outputs = create_rnn( - model, input_blob, seq_lengths, states, - dim_in=dim_in, dim_out=dim_out, scope="external/recurrent", - outputs_with_grads=outputs_with_grads, - memory_optimization=memory_optim, - forget_bias=forget_bias, - forward_only=forward_only, - drop_states=drop_states, - static_rnn_unroll_size=T, - **kwargs - ) - - workspace.RunNetOnce(model.param_init_net) - - workspace.FeedBlob( - seq_lengths, - np.random.randint(1, t + 1, size=(n,)).astype(np.int32) - ) - return outputs, model.net, states + [input_blob] diff --git a/caffe2/python/rnn_cell.py b/caffe2/python/rnn_cell.py deleted file mode 100644 index 2a9bdae2b614..000000000000 --- a/caffe2/python/rnn_cell.py +++ /dev/null @@ -1,1978 +0,0 @@ -## @package rnn_cell -# Module caffe2.python.rnn_cell - - - - - -import functools -import inspect -import logging -import numpy as np -import random - -from caffe2.proto import caffe2_pb2 -from caffe2.python.attention import ( - apply_dot_attention, - apply_recurrent_attention, - apply_regular_attention, - apply_soft_coverage_attention, - AttentionType, -) -from caffe2.python import core, recurrent, workspace, brew, scope, utils -from caffe2.python.modeling.parameter_sharing import ParameterSharing -from caffe2.python.modeling.parameter_info import ParameterTags -from caffe2.python.modeling.initializers import 
Initializer -from caffe2.python.model_helper import ModelHelper - - -def _RectifyName(blob_reference_or_name): - if blob_reference_or_name is None: - return None - if isinstance(blob_reference_or_name, str): - return core.ScopedBlobReference(blob_reference_or_name) - if not isinstance(blob_reference_or_name, core.BlobReference): - raise Exception("Unknown blob reference type") - return blob_reference_or_name - - -def _RectifyNames(blob_references_or_names): - if blob_references_or_names is None: - return None - return [_RectifyName(i) for i in blob_references_or_names] - - -class RNNCell: - ''' - Base class for writing recurrent / stateful operations. - - One needs to implement 2 methods: apply_override - and get_state_names_override. - - As a result base class will provice apply_over_sequence method, which - allows you to apply recurrent operations over a sequence of any length. - - As optional you could add input and output preparation steps by overriding - corresponding methods. - ''' - def __init__(self, name=None, forward_only=False, initializer=None): - self.name = name - self.recompute_blobs = [] - self.forward_only = forward_only - self._initializer = initializer - - @property - def initializer(self): - return self._initializer - - @initializer.setter - def initializer(self, value): - self._initializer = value - - def scope(self, name): - return self.name + '/' + name if self.name is not None else name - - def apply_over_sequence( - self, - model, - inputs, - seq_lengths=None, - initial_states=None, - outputs_with_grads=None, - ): - if initial_states is None: - with scope.NameScope(self.name): - if self.initializer is None: - raise Exception("Either initial states " - "or initializer have to be set") - initial_states = self.initializer.create_states(model) - - preprocessed_inputs = self.prepare_input(model, inputs) - step_model = ModelHelper(name=self.name, param_model=model) - input_t, timestep = step_model.net.AddScopedExternalInputs( - 'input_t', - 'timestep', - ) - utils.raiseIfNotEqual( - len(initial_states), len(self.get_state_names()), - "Number of initial state values provided doesn't match the number " - "of states" - ) - states_prev = step_model.net.AddScopedExternalInputs(*[ - s + '_prev' for s in self.get_state_names() - ]) - states = self._apply( - model=step_model, - input_t=input_t, - seq_lengths=seq_lengths, - states=states_prev, - timestep=timestep, - ) - - external_outputs = set(step_model.net.Proto().external_output) - for state in states: - if state not in external_outputs: - step_model.net.AddExternalOutput(state) - - if outputs_with_grads is None: - outputs_with_grads = [self.get_output_state_index() * 2] - - # states_for_all_steps consists of combination of - # states gather for all steps and final states. It looks like this: - # (state_1_all, state_1_final, state_2_all, state_2_final, ...) 
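[Editor's sketch, not part of the diff] A note on the interleaved layout described in the comment above: each recurrent state contributes an (all-steps, final) pair to the outputs of recurrent_net, so the full-sequence blob for state i sits at index 2 * i, which is why apply_over_sequence defaults outputs_with_grads to [get_output_state_index() * 2]. A minimal, caffe2-free sketch of that indexing, with plain strings standing in for blobs:

    state_names = ["hidden_t", "cell_t"]
    states_for_all_steps = []
    for name in state_names:
        states_for_all_steps += [name + "_all", name + "_final"]
    # -> ['hidden_t_all', 'hidden_t_final', 'cell_t_all', 'cell_t_final']

    output_state_index = 0  # e.g. an LSTM cell's primary output is hidden_t
    # The full sequence of the primary state sits at twice its state index:
    assert states_for_all_steps[2 * output_state_index] == "hidden_t_all"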
- states_for_all_steps = recurrent.recurrent_net( - net=model.net, - cell_net=step_model.net, - inputs=[(input_t, preprocessed_inputs)], - initial_cell_inputs=list(zip(states_prev, initial_states)), - links=dict(zip(states_prev, states)), - timestep=timestep, - scope=self.name, - forward_only=self.forward_only, - outputs_with_grads=outputs_with_grads, - recompute_blobs_on_backward=self.recompute_blobs, - ) - - output = self._prepare_output_sequence( - model, - states_for_all_steps, - ) - return output, states_for_all_steps - - def apply(self, model, input_t, seq_lengths, states, timestep): - input_t = self.prepare_input(model, input_t) - states = self._apply( - model, input_t, seq_lengths, states, timestep) - output = self._prepare_output(model, states) - return output, states - - def _apply( - self, - model, input_t, seq_lengths, states, timestep, extra_inputs=None - ): - ''' - This method uses apply_override provided by a custom cell. - On the top it takes care of applying self.scope() to all the outputs. - While all the inputs stay within the scope this function was called - from. - ''' - args = self._rectify_apply_inputs( - input_t, seq_lengths, states, timestep, extra_inputs) - with core.NameScope(self.name): - return self.apply_override(model, *args) - - def _rectify_apply_inputs( - self, input_t, seq_lengths, states, timestep, extra_inputs): - ''' - Before applying a scope we make sure that all external blob names - are converted to blob reference. So further scoping doesn't affect them - ''' - - input_t, seq_lengths, timestep = _RectifyNames( - [input_t, seq_lengths, timestep]) - states = _RectifyNames(states) - if extra_inputs: - extra_input_names, extra_input_sizes = zip(*extra_inputs) - extra_inputs = _RectifyNames(extra_input_names) - extra_inputs = zip(extra_input_names, extra_input_sizes) - - arg_names = inspect.getargspec(self.apply_override).args - rectified = [input_t, seq_lengths, states, timestep] - if 'extra_inputs' in arg_names: - rectified.append(extra_inputs) - return rectified - - - def apply_override( - self, - model, input_t, seq_lengths, timestep, extra_inputs=None, - ): - ''' - A single step of a recurrent network to be implemented by each custom - RNNCell. - - model: ModelHelper object new operators would be added to - - input_t: singlse input with shape (1, batch_size, input_dim) - - seq_lengths: blob containing sequence lengths which would be passed to - LSTMUnit operator - - states: previous recurrent states - - timestep: current recurrent iteration. Could be used together with - seq_lengths in order to determine, if some shorter sequences - in the batch have already ended. - - extra_inputs: list of tuples (input, dim). specifies additional input - which is not subject to prepare_input(). (useful when a cell is a - component of a larger recurrent structure, e.g., attention) - ''' - raise NotImplementedError('Abstract method') - - def prepare_input(self, model, input_blob): - ''' - If some operations in _apply method depend only on the input, - not on recurrent states, they could be computed in advance. - - model: ModelHelper object new operators would be added to - - input_blob: either the whole input sequence with shape - (sequence_length, batch_size, input_dim) or a single input with shape - (1, batch_size, input_dim). - ''' - return input_blob - - def get_output_state_index(self): - ''' - Return index into state list of the "primary" step-wise output. 
- ''' - return 0 - - def get_state_names(self): - ''' - Returns recurrent state names with self.name scoping applied - ''' - return [self.scope(name) for name in self.get_state_names_override()] - - def get_state_names_override(self): - ''' - Override this function in your custom cell. - It should return the names of the recurrent states. - - It's required by apply_over_sequence method in order to allocate - recurrent states for all steps with meaningful names. - ''' - raise NotImplementedError('Abstract method') - - def get_output_dim(self): - ''' - Specifies the dimension (number of units) of stepwise output. - ''' - raise NotImplementedError('Abstract method') - - def _prepare_output(self, model, states): - ''' - Allows arbitrary post-processing of primary output. - ''' - return states[self.get_output_state_index()] - - def _prepare_output_sequence(self, model, state_outputs): - ''' - Allows arbitrary post-processing of primary sequence output. - - (Note that state_outputs alternates between full-sequence and final - output for each state, thus the index multiplier 2.) - ''' - output_sequence_index = 2 * self.get_output_state_index() - return state_outputs[output_sequence_index] - - -class LSTMInitializer: - def __init__(self, hidden_size): - self.hidden_size = hidden_size - - def create_states(self, model): - return [ - model.create_param( - param_name='initial_hidden_state', - initializer=Initializer(operator_name='ConstantFill', - value=0.0), - shape=[self.hidden_size], - ), - model.create_param( - param_name='initial_cell_state', - initializer=Initializer(operator_name='ConstantFill', - value=0.0), - shape=[self.hidden_size], - ) - ] - - -# based on https://pytorch.org/docs/main/nn.html#torch.nn.RNNCell -class BasicRNNCell(RNNCell): - def __init__( - self, - input_size, - hidden_size, - forget_bias, - memory_optimization, - drop_states=False, - initializer=None, - activation=None, - **kwargs - ): - super().__init__(**kwargs) - self.drop_states = drop_states - self.input_size = input_size - self.hidden_size = hidden_size - self.activation = activation - - if self.activation not in ['relu', 'tanh']: - raise RuntimeError( - 'BasicRNNCell with unknown activation function (%s)' - % self.activation) - - def apply_override( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev = states[0] - - gates_t = brew.fc( - model, - hidden_t_prev, - 'gates_t', - dim_in=self.hidden_size, - dim_out=self.hidden_size, - axis=2, - ) - - brew.sum(model, [gates_t, input_t], gates_t) - if self.activation == 'tanh': - hidden_t = model.net.Tanh(gates_t, 'hidden_t') - elif self.activation == 'relu': - hidden_t = model.net.Relu(gates_t, 'hidden_t') - else: - raise RuntimeError( - 'BasicRNNCell with unknown activation function (%s)' - % self.activation) - - if seq_lengths is not None: - # TODO If this codepath becomes popular, it may be worth - # taking a look at optimizing it - for now a simple - # implementation is used to round out compatibility with - # ONNX. 
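[Editor's sketch, not part of the diff] For reference, the sequence-length masking that follows the comment above (GT/LE against the current timestep, cast to float, then a blend of new and previous hidden states) is equivalent to this numpy sketch; all names and shapes here are illustrative, not part of the deleted API:

    import numpy as np

    def basic_rnn_step(x_proj, h_prev, W_hh, b, seq_lengths, t, drop_states=False):
        # x_proj: (N, H) input after the i2h projection; h_prev: (N, H)
        gates = h_prev @ W_hh + b + x_proj
        h_new = np.tanh(gates)  # BasicRNNCell also supports relu here
        # 1.0 while a sequence is still running, 0.0 once it has ended
        valid = (seq_lengths > t).astype(np.float32)[:, None]
        if drop_states:
            return h_new * valid                       # zero out finished rows
        return h_new * valid + h_prev * (1.0 - valid)  # else carry h_prev forward

    # batch of 2, hidden size 3; the second sequence ended before t=1
    h = basic_rnn_step(np.zeros((2, 3), np.float32), np.ones((2, 3), np.float32),
                       np.eye(3, dtype=np.float32), 0.0, np.array([5, 1]), t=1)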
- timestep = model.net.CopyFromCPUInput( - timestep, 'timestep_gpu') - valid_b = model.net.GT( - [seq_lengths, timestep], 'valid_b', broadcast=1) - invalid_b = model.net.LE( - [seq_lengths, timestep], 'invalid_b', broadcast=1) - valid = model.net.Cast(valid_b, 'valid', to='float') - invalid = model.net.Cast(invalid_b, 'invalid', to='float') - - hidden_valid = model.net.Mul( - [hidden_t, valid], - 'hidden_valid', - broadcast=1, - axis=1, - ) - if self.drop_states: - hidden_t = hidden_valid - else: - hidden_invalid = model.net.Mul( - [hidden_t_prev, invalid], - 'hidden_invalid', - broadcast=1, axis=1) - hidden_t = model.net.Add( - [hidden_valid, hidden_invalid], hidden_t) - return (hidden_t,) - - def prepare_input(self, model, input_blob): - return brew.fc( - model, - input_blob, - self.scope('i2h'), - dim_in=self.input_size, - dim_out=self.hidden_size, - axis=2, - ) - - def get_state_names(self): - return (self.scope('hidden_t'),) - - def get_output_dim(self): - return self.hidden_size - - -class LSTMCell(RNNCell): - - def __init__( - self, - input_size, - hidden_size, - forget_bias, - memory_optimization, - drop_states=False, - initializer=None, - **kwargs - ): - super().__init__(initializer=initializer, **kwargs) - self.initializer = initializer or LSTMInitializer( - hidden_size=hidden_size) - - self.input_size = input_size - self.hidden_size = hidden_size - self.forget_bias = float(forget_bias) - self.memory_optimization = memory_optimization - self.drop_states = drop_states - self.gates_size = 4 * self.hidden_size - - def apply_override( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev, cell_t_prev = states - - fc_input = hidden_t_prev - fc_input_dim = self.hidden_size - - if extra_inputs is not None: - extra_input_blobs, extra_input_sizes = zip(*extra_inputs) - fc_input = brew.concat( - model, - [hidden_t_prev] + list(extra_input_blobs), - 'gates_concatenated_input_t', - axis=2, - ) - fc_input_dim += sum(extra_input_sizes) - - gates_t = brew.fc( - model, - fc_input, - 'gates_t', - dim_in=fc_input_dim, - dim_out=self.gates_size, - axis=2, - ) - brew.sum(model, [gates_t, input_t], gates_t) - - if seq_lengths is not None: - inputs = [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep] - else: - inputs = [hidden_t_prev, cell_t_prev, gates_t, timestep] - - hidden_t, cell_t = model.net.LSTMUnit( - inputs, - ['hidden_state', 'cell_state'], - forget_bias=self.forget_bias, - drop_states=self.drop_states, - sequence_lengths=(seq_lengths is not None), - ) - model.net.AddExternalOutputs(hidden_t, cell_t) - if self.memory_optimization: - self.recompute_blobs = [gates_t] - - return hidden_t, cell_t - - def get_input_params(self): - return { - 'weights': self.scope('i2h') + '_w', - 'biases': self.scope('i2h') + '_b', - } - - def get_recurrent_params(self): - return { - 'weights': self.scope('gates_t') + '_w', - 'biases': self.scope('gates_t') + '_b', - } - - def prepare_input(self, model, input_blob): - return brew.fc( - model, - input_blob, - self.scope('i2h'), - dim_in=self.input_size, - dim_out=self.gates_size, - axis=2, - ) - - def get_state_names_override(self): - return ['hidden_t', 'cell_t'] - - def get_output_dim(self): - return self.hidden_size - - -class LayerNormLSTMCell(RNNCell): - - def __init__( - self, - input_size, - hidden_size, - forget_bias, - memory_optimization, - drop_states=False, - initializer=None, - **kwargs - ): - super().__init__(initializer=initializer, **kwargs) - self.initializer = initializer or 
LSTMInitializer( - hidden_size=hidden_size - ) - - self.input_size = input_size - self.hidden_size = hidden_size - self.forget_bias = float(forget_bias) - self.memory_optimization = memory_optimization - self.drop_states = drop_states - self.gates_size = 4 * self.hidden_size - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev, cell_t_prev = states - - fc_input = hidden_t_prev - fc_input_dim = self.hidden_size - - if extra_inputs is not None: - extra_input_blobs, extra_input_sizes = zip(*extra_inputs) - fc_input = brew.concat( - model, - [hidden_t_prev] + list(extra_input_blobs), - self.scope('gates_concatenated_input_t'), - axis=2, - ) - fc_input_dim += sum(extra_input_sizes) - - gates_t = brew.fc( - model, - fc_input, - self.scope('gates_t'), - dim_in=fc_input_dim, - dim_out=self.gates_size, - axis=2, - ) - brew.sum(model, [gates_t, input_t], gates_t) - - # brew.layer_norm call is only difference from LSTMCell - gates_t, _, _ = brew.layer_norm( - model, - self.scope('gates_t'), - self.scope('gates_t_norm'), - dim_in=self.gates_size, - axis=-1, - ) - - hidden_t, cell_t = model.net.LSTMUnit( - [ - hidden_t_prev, - cell_t_prev, - gates_t, - seq_lengths, - timestep, - ], - self.get_state_names(), - forget_bias=self.forget_bias, - drop_states=self.drop_states, - ) - model.net.AddExternalOutputs(hidden_t, cell_t) - if self.memory_optimization: - self.recompute_blobs = [gates_t] - - return hidden_t, cell_t - - def get_input_params(self): - return { - 'weights': self.scope('i2h') + '_w', - 'biases': self.scope('i2h') + '_b', - } - - def prepare_input(self, model, input_blob): - return brew.fc( - model, - input_blob, - self.scope('i2h'), - dim_in=self.input_size, - dim_out=self.gates_size, - axis=2, - ) - - def get_state_names(self): - return (self.scope('hidden_t'), self.scope('cell_t')) - - -class MILSTMCell(LSTMCell): - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev, cell_t_prev = states - - fc_input = hidden_t_prev - fc_input_dim = self.hidden_size - - if extra_inputs is not None: - extra_input_blobs, extra_input_sizes = zip(*extra_inputs) - fc_input = brew.concat( - model, - [hidden_t_prev] + list(extra_input_blobs), - self.scope('gates_concatenated_input_t'), - axis=2, - ) - fc_input_dim += sum(extra_input_sizes) - - prev_t = brew.fc( - model, - fc_input, - self.scope('prev_t'), - dim_in=fc_input_dim, - dim_out=self.gates_size, - axis=2, - ) - - # defining initializers for MI parameters - alpha = model.create_param( - self.scope('alpha'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - beta_h = model.create_param( - self.scope('beta1'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - beta_i = model.create_param( - self.scope('beta2'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - b = model.create_param( - self.scope('b'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=0.0), - ) - - # alpha * input_t + beta_h - # Shape: [1, batch_size, 4 * hidden_size] - alpha_by_input_t_plus_beta_h = model.net.ElementwiseLinear( - [input_t, alpha, beta_h], - self.scope('alpha_by_input_t_plus_beta_h'), - axis=2, - ) - # (alpha * input_t + beta_h) * prev_t = - # alpha * input_t * prev_t + beta_h * prev_t - # Shape: [1, batch_size, 4 * hidden_size] - alpha_by_input_t_plus_beta_h_by_prev_t = model.net.Mul( - 
[alpha_by_input_t_plus_beta_h, prev_t], - self.scope('alpha_by_input_t_plus_beta_h_by_prev_t') - ) - # beta_i * input_t + b - # Shape: [1, batch_size, 4 * hidden_size] - beta_i_by_input_t_plus_b = model.net.ElementwiseLinear( - [input_t, beta_i, b], - self.scope('beta_i_by_input_t_plus_b'), - axis=2, - ) - # alpha * input_t * prev_t + beta_h * prev_t + beta_i * input_t + b - # Shape: [1, batch_size, 4 * hidden_size] - gates_t = brew.sum( - model, - [alpha_by_input_t_plus_beta_h_by_prev_t, beta_i_by_input_t_plus_b], - self.scope('gates_t') - ) - hidden_t, cell_t = model.net.LSTMUnit( - [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep], - [self.scope('hidden_t_intermediate'), self.scope('cell_t')], - forget_bias=self.forget_bias, - drop_states=self.drop_states, - ) - model.net.AddExternalOutputs( - cell_t, - hidden_t, - ) - if self.memory_optimization: - self.recompute_blobs = [gates_t] - return hidden_t, cell_t - - -class LayerNormMILSTMCell(LSTMCell): - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - hidden_t_prev, cell_t_prev = states - - fc_input = hidden_t_prev - fc_input_dim = self.hidden_size - - if extra_inputs is not None: - extra_input_blobs, extra_input_sizes = zip(*extra_inputs) - fc_input = brew.concat( - model, - [hidden_t_prev] + list(extra_input_blobs), - self.scope('gates_concatenated_input_t'), - axis=2, - ) - fc_input_dim += sum(extra_input_sizes) - - prev_t = brew.fc( - model, - fc_input, - self.scope('prev_t'), - dim_in=fc_input_dim, - dim_out=self.gates_size, - axis=2, - ) - - # defining initializers for MI parameters - alpha = model.create_param( - self.scope('alpha'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - beta_h = model.create_param( - self.scope('beta1'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - beta_i = model.create_param( - self.scope('beta2'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=1.0), - ) - b = model.create_param( - self.scope('b'), - shape=[self.gates_size], - initializer=Initializer('ConstantFill', value=0.0), - ) - - # alpha * input_t + beta_h - # Shape: [1, batch_size, 4 * hidden_size] - alpha_by_input_t_plus_beta_h = model.net.ElementwiseLinear( - [input_t, alpha, beta_h], - self.scope('alpha_by_input_t_plus_beta_h'), - axis=2, - ) - # (alpha * input_t + beta_h) * prev_t = - # alpha * input_t * prev_t + beta_h * prev_t - # Shape: [1, batch_size, 4 * hidden_size] - alpha_by_input_t_plus_beta_h_by_prev_t = model.net.Mul( - [alpha_by_input_t_plus_beta_h, prev_t], - self.scope('alpha_by_input_t_plus_beta_h_by_prev_t') - ) - # beta_i * input_t + b - # Shape: [1, batch_size, 4 * hidden_size] - beta_i_by_input_t_plus_b = model.net.ElementwiseLinear( - [input_t, beta_i, b], - self.scope('beta_i_by_input_t_plus_b'), - axis=2, - ) - # alpha * input_t * prev_t + beta_h * prev_t + beta_i * input_t + b - # Shape: [1, batch_size, 4 * hidden_size] - gates_t = brew.sum( - model, - [alpha_by_input_t_plus_beta_h_by_prev_t, beta_i_by_input_t_plus_b], - self.scope('gates_t') - ) - # brew.layer_norm call is only difference from MILSTMCell._apply - gates_t, _, _ = brew.layer_norm( - model, - self.scope('gates_t'), - self.scope('gates_t_norm'), - dim_in=self.gates_size, - axis=-1, - ) - hidden_t, cell_t = model.net.LSTMUnit( - [hidden_t_prev, cell_t_prev, gates_t, seq_lengths, timestep], - [self.scope('hidden_t_intermediate'), self.scope('cell_t')], - forget_bias=self.forget_bias, 
- drop_states=self.drop_states, - ) - model.net.AddExternalOutputs( - cell_t, - hidden_t, - ) - if self.memory_optimization: - self.recompute_blobs = [gates_t] - return hidden_t, cell_t - - -class DropoutCell(RNNCell): - ''' - Wraps arbitrary RNNCell, applying dropout to its output (but not to the - recurrent connection for the corresponding state). - ''' - - def __init__( - self, - internal_cell, - dropout_ratio=None, - use_cudnn=False, - **kwargs - ): - self.internal_cell = internal_cell - self.dropout_ratio = dropout_ratio - assert 'is_test' in kwargs, "Argument 'is_test' is required" - self.is_test = kwargs.pop('is_test') - self.use_cudnn = use_cudnn - super().__init__(**kwargs) - - self.prepare_input = internal_cell.prepare_input - self.get_output_state_index = internal_cell.get_output_state_index - self.get_state_names = internal_cell.get_state_names - self.get_output_dim = internal_cell.get_output_dim - - self.mask = 0 - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - return self.internal_cell._apply( - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs, - ) - - def _prepare_output(self, model, states): - output = self.internal_cell._prepare_output( - model, - states, - ) - if self.dropout_ratio is not None: - output = self._apply_dropout(model, output) - return output - - def _prepare_output_sequence(self, model, state_outputs): - output = self.internal_cell._prepare_output_sequence( - model, - state_outputs, - ) - if self.dropout_ratio is not None: - output = self._apply_dropout(model, output) - return output - - def _apply_dropout(self, model, output): - if self.dropout_ratio and not self.forward_only: - with core.NameScope(self.name or ''): - output = brew.dropout( - model, - output, - str(output) + '_with_dropout_mask{}'.format(self.mask), - ratio=float(self.dropout_ratio), - is_test=self.is_test, - use_cudnn=self.use_cudnn, - ) - self.mask += 1 - return output - - -class MultiRNNCellInitializer: - def __init__(self, cells): - self.cells = cells - - def create_states(self, model): - states = [] - for i, cell in enumerate(self.cells): - if cell.initializer is None: - raise Exception("Either initial states " - "or initializer have to be set") - - with core.NameScope("layer_{}".format(i)),\ - core.NameScope(cell.name): - states.extend(cell.initializer.create_states(model)) - return states - - -class MultiRNNCell(RNNCell): - ''' - Multilayer RNN via the composition of RNNCell instance. - - It is the responsibility of calling code to ensure the compatibility - of the successive layers in terms of input/output dimensiality, etc., - and to ensure that their blobs do not have name conflicts, typically by - creating the cells with names that specify layer number. - - Assumes first state (recurrent output) for each layer should be the input - to the next layer. - ''' - - def __init__(self, cells, residual_output_layers=None, **kwargs): - ''' - cells: list of RNNCell instances, from input to output side. - - name: string designating network component (for scoping) - - residual_output_layers: list of indices of layers whose input will - be added elementwise to their output elementwise. (It is the - responsibility of the client code to ensure shape compatibility.) - Note that layer 0 (zero) cannot have residual output because of the - timing of prepare_input(). - - forward_only: used to construct inference-only network. 
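[Editor's sketch, not part of the diff] The multiplicative-integration gate algebra spelled out in the comments inside the MI cells above (alpha * input_t * prev_t + beta_h * prev_t + beta_i * input_t + b) reduces to a few elementwise ops. A caffe2-free numpy sketch, with every name illustrative:

    import numpy as np

    def mi_gates(x_proj, h_proj, alpha, beta_h, beta_i, b):
        # x_proj = i2h(input_t), h_proj = prev_t in the cells above; both (N, 4H).
        # ElementwiseLinear([x, a, c]) computes a * x + c per channel, so the two
        # ElementwiseLinear calls plus the Mul and Sum expand to:
        return alpha * x_proj * h_proj + beta_h * h_proj + beta_i * x_proj + b

    N, H = 2, 3
    rng = np.random.default_rng(0)
    gates = mi_gates(rng.standard_normal((N, 4 * H)),
                     rng.standard_normal((N, 4 * H)),
                     alpha=np.ones(4 * H), beta_h=np.ones(4 * H),
                     beta_i=np.ones(4 * H), b=np.zeros(4 * H))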
- ''' - super().__init__(**kwargs) - self.cells = cells - - if residual_output_layers is None: - self.residual_output_layers = [] - else: - self.residual_output_layers = residual_output_layers - - output_index_per_layer = [] - base_index = 0 - for cell in self.cells: - output_index_per_layer.append( - base_index + cell.get_output_state_index(), - ) - base_index += len(cell.get_state_names()) - - self.output_connected_layers = [] - self.output_indices = [] - for i in range(len(self.cells) - 1): - if (i + 1) in self.residual_output_layers: - self.output_connected_layers.append(i) - self.output_indices.append(output_index_per_layer[i]) - else: - self.output_connected_layers = [] - self.output_indices = [] - self.output_connected_layers.append(len(self.cells) - 1) - self.output_indices.append(output_index_per_layer[-1]) - - self.state_names = [] - for i, cell in enumerate(self.cells): - self.state_names.extend( - map(self.layer_scoper(i), cell.get_state_names()) - ) - - self.initializer = MultiRNNCellInitializer(cells) - - def layer_scoper(self, layer_id): - def helper(name): - return "{}/layer_{}/{}".format(self.name, layer_id, name) - return helper - - def prepare_input(self, model, input_blob): - input_blob = _RectifyName(input_blob) - with core.NameScope(self.name or ''): - return self.cells[0].prepare_input(model, input_blob) - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - ''' - Because below we will do scoping across layers, we need - to make sure that string blob names are convereted to BlobReference - objects. - ''' - - input_t, seq_lengths, states, timestep, extra_inputs = \ - self._rectify_apply_inputs( - input_t, seq_lengths, states, timestep, extra_inputs) - - states_per_layer = [len(cell.get_state_names()) for cell in self.cells] - assert len(states) == sum(states_per_layer) - - next_states = [] - states_index = 0 - - layer_input = input_t - for i, layer_cell in enumerate(self.cells): - # # If cells don't have different names we still - # take care of scoping - with core.NameScope(self.name), core.NameScope("layer_{}".format(i)): - num_states = states_per_layer[i] - layer_states = states[states_index:(states_index + num_states)] - states_index += num_states - - if i > 0: - prepared_input = layer_cell.prepare_input( - model, layer_input) - else: - prepared_input = layer_input - - layer_next_states = layer_cell._apply( - model, - prepared_input, - seq_lengths, - layer_states, - timestep, - extra_inputs=(None if i > 0 else extra_inputs), - ) - # Since we're using here non-public method _apply, - # instead of apply, we have to manually extract output - # from states - if i != len(self.cells) - 1: - layer_output = layer_cell._prepare_output( - model, - layer_next_states, - ) - if i > 0 and i in self.residual_output_layers: - layer_input = brew.sum( - model, - [layer_output, layer_input], - self.scope('residual_output_{}'.format(i)), - ) - else: - layer_input = layer_output - - next_states.extend(layer_next_states) - return next_states - - def get_state_names(self): - return self.state_names - - def get_output_state_index(self): - index = 0 - for cell in self.cells[:-1]: - index += len(cell.get_state_names()) - index += self.cells[-1].get_output_state_index() - return index - - def _prepare_output(self, model, states): - connected_outputs = [] - state_index = 0 - for i, cell in enumerate(self.cells): - num_states = len(cell.get_state_names()) - if i in self.output_connected_layers: - layer_states = 
states[state_index:state_index + num_states] - layer_output = cell._prepare_output( - model, - layer_states - ) - connected_outputs.append(layer_output) - state_index += num_states - if len(connected_outputs) > 1: - output = brew.sum( - model, - connected_outputs, - self.scope('residual_output'), - ) - else: - output = connected_outputs[0] - return output - - def _prepare_output_sequence(self, model, states): - connected_outputs = [] - state_index = 0 - for i, cell in enumerate(self.cells): - num_states = 2 * len(cell.get_state_names()) - if i in self.output_connected_layers: - layer_states = states[state_index:state_index + num_states] - layer_output = cell._prepare_output_sequence( - model, - layer_states - ) - connected_outputs.append(layer_output) - state_index += num_states - if len(connected_outputs) > 1: - output = brew.sum( - model, - connected_outputs, - self.scope('residual_output_sequence'), - ) - else: - output = connected_outputs[0] - return output - - -class AttentionCell(RNNCell): - - def __init__( - self, - encoder_output_dim, - encoder_outputs, - encoder_lengths, - decoder_cell, - decoder_state_dim, - attention_type, - weighted_encoder_outputs, - attention_memory_optimization, - **kwargs - ): - super().__init__(**kwargs) - self.encoder_output_dim = encoder_output_dim - self.encoder_outputs = encoder_outputs - self.encoder_lengths = encoder_lengths - self.decoder_cell = decoder_cell - self.decoder_state_dim = decoder_state_dim - self.weighted_encoder_outputs = weighted_encoder_outputs - self.encoder_outputs_transposed = None - assert attention_type in [ - AttentionType.Regular, - AttentionType.Recurrent, - AttentionType.Dot, - AttentionType.SoftCoverage, - ] - self.attention_type = attention_type - self.attention_memory_optimization = attention_memory_optimization - - def _apply( - self, - model, - input_t, - seq_lengths, - states, - timestep, - extra_inputs=None, - ): - if self.attention_type == AttentionType.SoftCoverage: - decoder_prev_states = states[:-2] - attention_weighted_encoder_context_t_prev = states[-2] - coverage_t_prev = states[-1] - else: - decoder_prev_states = states[:-1] - attention_weighted_encoder_context_t_prev = states[-1] - - assert extra_inputs is None - - decoder_states = self.decoder_cell._apply( - model, - input_t, - seq_lengths, - decoder_prev_states, - timestep, - extra_inputs=[( - attention_weighted_encoder_context_t_prev, - self.encoder_output_dim, - )], - ) - - self.hidden_t_intermediate = self.decoder_cell._prepare_output( - model, - decoder_states, - ) - - if self.attention_type == AttentionType.Recurrent: - ( - attention_weighted_encoder_context_t, - self.attention_weights_3d, - attention_blobs, - ) = apply_recurrent_attention( - model=model, - encoder_output_dim=self.encoder_output_dim, - encoder_outputs_transposed=self.encoder_outputs_transposed, - weighted_encoder_outputs=self.weighted_encoder_outputs, - decoder_hidden_state_t=self.hidden_t_intermediate, - decoder_hidden_state_dim=self.decoder_state_dim, - scope=self.name, - attention_weighted_encoder_context_t_prev=( - attention_weighted_encoder_context_t_prev - ), - encoder_lengths=self.encoder_lengths, - ) - elif self.attention_type == AttentionType.Regular: - ( - attention_weighted_encoder_context_t, - self.attention_weights_3d, - attention_blobs, - ) = apply_regular_attention( - model=model, - encoder_output_dim=self.encoder_output_dim, - encoder_outputs_transposed=self.encoder_outputs_transposed, - weighted_encoder_outputs=self.weighted_encoder_outputs, - 
decoder_hidden_state_t=self.hidden_t_intermediate, - decoder_hidden_state_dim=self.decoder_state_dim, - scope=self.name, - encoder_lengths=self.encoder_lengths, - ) - elif self.attention_type == AttentionType.Dot: - ( - attention_weighted_encoder_context_t, - self.attention_weights_3d, - attention_blobs, - ) = apply_dot_attention( - model=model, - encoder_output_dim=self.encoder_output_dim, - encoder_outputs_transposed=self.encoder_outputs_transposed, - decoder_hidden_state_t=self.hidden_t_intermediate, - decoder_hidden_state_dim=self.decoder_state_dim, - scope=self.name, - encoder_lengths=self.encoder_lengths, - ) - elif self.attention_type == AttentionType.SoftCoverage: - ( - attention_weighted_encoder_context_t, - self.attention_weights_3d, - attention_blobs, - coverage_t, - ) = apply_soft_coverage_attention( - model=model, - encoder_output_dim=self.encoder_output_dim, - encoder_outputs_transposed=self.encoder_outputs_transposed, - weighted_encoder_outputs=self.weighted_encoder_outputs, - decoder_hidden_state_t=self.hidden_t_intermediate, - decoder_hidden_state_dim=self.decoder_state_dim, - scope=self.name, - encoder_lengths=self.encoder_lengths, - coverage_t_prev=coverage_t_prev, - coverage_weights=self.coverage_weights, - ) - else: - raise Exception('Attention type {} not implemented'.format( - self.attention_type - )) - - if self.attention_memory_optimization: - self.recompute_blobs.extend(attention_blobs) - - output = list(decoder_states) + [attention_weighted_encoder_context_t] - if self.attention_type == AttentionType.SoftCoverage: - output.append(coverage_t) - - output[self.decoder_cell.get_output_state_index()] = model.Copy( - output[self.decoder_cell.get_output_state_index()], - self.scope('hidden_t_external'), - ) - model.net.AddExternalOutputs(*output) - - return output - - def get_attention_weights(self): - # [batch_size, encoder_length, 1] - return self.attention_weights_3d - - def prepare_input(self, model, input_blob): - if self.encoder_outputs_transposed is None: - self.encoder_outputs_transposed = brew.transpose( - model, - self.encoder_outputs, - self.scope('encoder_outputs_transposed'), - axes=[1, 2, 0], - ) - if ( - self.weighted_encoder_outputs is None and - self.attention_type != AttentionType.Dot - ): - self.weighted_encoder_outputs = brew.fc( - model, - self.encoder_outputs, - self.scope('weighted_encoder_outputs'), - dim_in=self.encoder_output_dim, - dim_out=self.encoder_output_dim, - axis=2, - ) - - return self.decoder_cell.prepare_input(model, input_blob) - - def build_initial_coverage(self, model): - """ - initial_coverage is always zeros of shape [encoder_length], - which shape must be determined programmatically dureing network - computation. - - This method also sets self.coverage_weights, a separate transform - of encoder_outputs which is used to determine coverage contribution - tp attention. 
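[Editor's sketch, not part of the diff] As a caffe2-free reference for the dispatch above, the simplest branch (AttentionType.Dot) reduces to a softmax over encoder/decoder inner products followed by a weighted sum. A minimal numpy sketch, assuming (as dot attention requires) that the decoder state dim matches the encoder output dim; names and shapes are illustrative:

    import numpy as np

    def dot_attention(decoder_h, encoder_outputs):
        # decoder_h: (N, D); encoder_outputs: (T_enc, N, D)
        scores = np.einsum("nd,tnd->tn", decoder_h, encoder_outputs)   # (T_enc, N)
        scores -= scores.max(axis=0, keepdims=True)                    # numerical stability
        weights = np.exp(scores) / np.exp(scores).sum(axis=0, keepdims=True)
        context = np.einsum("tn,tnd->nd", weights, encoder_outputs)    # (N, D)
        return context, weights

    ctx, w = dot_attention(np.random.randn(2, 4), np.random.randn(5, 2, 4))
    assert np.allclose(w.sum(axis=0), 1.0)  # weights normalize over encoder steps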
- """ - assert self.attention_type == AttentionType.SoftCoverage - - # [encoder_length, batch_size, encoder_output_dim] - self.coverage_weights = brew.fc( - model, - self.encoder_outputs, - self.scope('coverage_weights'), - dim_in=self.encoder_output_dim, - dim_out=self.encoder_output_dim, - axis=2, - ) - - encoder_length = model.net.Slice( - model.net.Shape(self.encoder_outputs), - starts=[0], - ends=[1], - ) - if ( - scope.CurrentDeviceScope() is not None and - core.IsGPUDeviceType(scope.CurrentDeviceScope().device_type) - ): - encoder_length = model.net.CopyGPUToCPU( - encoder_length, - 'encoder_length_cpu', - ) - # total attention weight applied across decoding steps_per_checkpoint - # shape: [encoder_length] - initial_coverage = model.net.ConstantFill( - encoder_length, - self.scope('initial_coverage'), - value=0.0, - input_as_shape=1, - ) - return initial_coverage - - def get_state_names(self): - state_names = list(self.decoder_cell.get_state_names()) - state_names[self.get_output_state_index()] = self.scope( - 'hidden_t_external', - ) - state_names.append(self.scope('attention_weighted_encoder_context_t')) - if self.attention_type == AttentionType.SoftCoverage: - state_names.append(self.scope('coverage_t')) - return state_names - - def get_output_dim(self): - return self.decoder_state_dim + self.encoder_output_dim - - def get_output_state_index(self): - return self.decoder_cell.get_output_state_index() - - def _prepare_output(self, model, states): - if self.attention_type == AttentionType.SoftCoverage: - attention_context = states[-2] - else: - attention_context = states[-1] - - with core.NameScope(self.name or ''): - output = brew.concat( - model, - [self.hidden_t_intermediate, attention_context], - 'states_and_context_combination', - axis=2, - ) - - return output - - def _prepare_output_sequence(self, model, state_outputs): - if self.attention_type == AttentionType.SoftCoverage: - decoder_state_outputs = state_outputs[:-4] - else: - decoder_state_outputs = state_outputs[:-2] - - decoder_output = self.decoder_cell._prepare_output_sequence( - model, - decoder_state_outputs, - ) - - if self.attention_type == AttentionType.SoftCoverage: - attention_context_index = 2 * (len(self.get_state_names()) - 2) - else: - attention_context_index = 2 * (len(self.get_state_names()) - 1) - - with core.NameScope(self.name or ''): - output = brew.concat( - model, - [ - decoder_output, - state_outputs[attention_context_index], - ], - 'states_and_context_combination', - axis=2, - ) - return output - - -class LSTMWithAttentionCell(AttentionCell): - - def __init__( - self, - encoder_output_dim, - encoder_outputs, - encoder_lengths, - decoder_input_dim, - decoder_state_dim, - name, - attention_type, - weighted_encoder_outputs, - forget_bias, - lstm_memory_optimization, - attention_memory_optimization, - forward_only=False, - ): - decoder_cell = LSTMCell( - input_size=decoder_input_dim, - hidden_size=decoder_state_dim, - forget_bias=forget_bias, - memory_optimization=lstm_memory_optimization, - name='{}/decoder'.format(name), - forward_only=False, - drop_states=False, - ) - super().__init__( - encoder_output_dim=encoder_output_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=encoder_lengths, - decoder_cell=decoder_cell, - decoder_state_dim=decoder_state_dim, - name=name, - attention_type=attention_type, - weighted_encoder_outputs=weighted_encoder_outputs, - attention_memory_optimization=attention_memory_optimization, - forward_only=forward_only, - ) - - -class 
MILSTMWithAttentionCell(AttentionCell): - - def __init__( - self, - encoder_output_dim, - encoder_outputs, - decoder_input_dim, - decoder_state_dim, - name, - attention_type, - weighted_encoder_outputs, - forget_bias, - lstm_memory_optimization, - attention_memory_optimization, - forward_only=False, - ): - decoder_cell = MILSTMCell( - input_size=decoder_input_dim, - hidden_size=decoder_state_dim, - forget_bias=forget_bias, - memory_optimization=lstm_memory_optimization, - name='{}/decoder'.format(name), - forward_only=False, - drop_states=False, - ) - super().__init__( - encoder_output_dim=encoder_output_dim, - encoder_outputs=encoder_outputs, - decoder_cell=decoder_cell, - decoder_state_dim=decoder_state_dim, - name=name, - attention_type=attention_type, - weighted_encoder_outputs=weighted_encoder_outputs, - attention_memory_optimization=attention_memory_optimization, - forward_only=forward_only, - ) - - -def _LSTM( - cell_class, - model, - input_blob, - seq_lengths, - initial_states, - dim_in, - dim_out, - scope=None, - outputs_with_grads=(0,), - return_params=False, - memory_optimization=False, - forget_bias=0.0, - forward_only=False, - drop_states=False, - return_last_layer_only=True, - static_rnn_unroll_size=None, - **cell_kwargs -): - ''' - Adds a standard LSTM recurrent network operator to a model. - - cell_class: LSTMCell or compatible subclass - - model: ModelHelper object new operators would be added to - - input_blob: the input sequence in a format T x N x D - where T is sequence size, N - batch size and D - input dimension - - seq_lengths: blob containing sequence lengths which would be passed to - LSTMUnit operator - - initial_states: a list of (2 * num_layers) blobs representing the initial - hidden and cell states of each layer. If this argument is None, - these states will be added to the model as network parameters. - - dim_in: input dimension - - dim_out: number of units per LSTM layer - (use int for single-layer LSTM, list of ints for multi-layer) - - outputs_with_grads : position indices of output blobs for LAST LAYER which - will receive external error gradient during backpropagation. - These outputs are: (h_all, h_last, c_all, c_last) - - return_params: if True, will return a dictionary of parameters of the LSTM - - memory_optimization: if enabled, the LSTM step is recomputed on backward - step so that we don't need to store forward activations for each - timestep. Saves memory with cost of computation. - - forget_bias: forget gate bias (default 0.0) - - forward_only: whether to create a backward pass - - drop_states: drop invalid states, passed through to LSTMUnit operator - - return_last_layer_only: only return outputs from final layer - (so that length of results does depend on number of layers) - - static_rnn_unroll_size: if not None, we will use static RNN which is - unrolled into Caffe2 graph. The size of the unroll is the value of - this parameter. 
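[Editor's sketch, not part of the diff] A hedged usage sketch of the _LSTM-based helper documented above, assuming a pre-removal caffe2 build where caffe2.python.rnn_cell still exists; blob names and dimensions are illustrative, and the input/seq_lengths blobs would be fed via workspace.FeedBlob before running:

    from caffe2.python import model_helper, rnn_cell

    model = model_helper.ModelHelper(name="lstm_example")
    # Input is T x N x D (here D=8); passing dim_out as a list stacks layers.
    h_all, h_last, c_all, c_last = rnn_cell.LSTM(
        model,
        input_blob="input",          # T x N x D sequence blob
        seq_lengths="seq_lengths",   # per-example lengths, shape (N,)
        initial_states=None,         # None -> learnable initial states are created
        dim_in=8,
        dim_out=[16, 16],            # two stacked layers of 16 units
        scope="lstm",
        outputs_with_grads=(0,),     # backprop through h_all of the last layer
    )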
- ''' - if type(dim_out) is not list and type(dim_out) is not tuple: - dim_out = [dim_out] - num_layers = len(dim_out) - - cells = [] - for i in range(num_layers): - cell = cell_class( - input_size=(dim_in if i == 0 else dim_out[i - 1]), - hidden_size=dim_out[i], - forget_bias=forget_bias, - memory_optimization=memory_optimization, - name=scope if num_layers == 1 else None, - forward_only=forward_only, - drop_states=drop_states, - **cell_kwargs - ) - cells.append(cell) - - cell = MultiRNNCell( - cells, - name=scope, - forward_only=forward_only, - ) if num_layers > 1 else cells[0] - - cell = ( - cell if static_rnn_unroll_size is None - else UnrolledCell(cell, static_rnn_unroll_size)) - - # outputs_with_grads argument indexes into final layer - outputs_with_grads = [4 * (num_layers - 1) + i for i in outputs_with_grads] - _, result = cell.apply_over_sequence( - model=model, - inputs=input_blob, - seq_lengths=seq_lengths, - initial_states=initial_states, - outputs_with_grads=outputs_with_grads, - ) - - if return_last_layer_only: - result = result[4 * (num_layers - 1):] - if return_params: - result = list(result) + [{ - 'input': cell.get_input_params(), - 'recurrent': cell.get_recurrent_params(), - }] - return tuple(result) - - -LSTM = functools.partial(_LSTM, LSTMCell) -BasicRNN = functools.partial(_LSTM, BasicRNNCell) -MILSTM = functools.partial(_LSTM, MILSTMCell) -LayerNormLSTM = functools.partial(_LSTM, LayerNormLSTMCell) -LayerNormMILSTM = functools.partial(_LSTM, LayerNormMILSTMCell) - - -class UnrolledCell(RNNCell): - def __init__(self, cell, T): - self.T = T - self.cell = cell - - def apply_over_sequence( - self, - model, - inputs, - seq_lengths, - initial_states, - outputs_with_grads=None, - ): - inputs = self.cell.prepare_input(model, inputs) - - # Now they are blob references - outputs of splitting the input sequence - split_inputs = model.net.Split( - inputs, - [str(inputs) + "_timestep_{}".format(i) - for i in range(self.T)], - axis=0) - if self.T == 1: - split_inputs = [split_inputs] - - states = initial_states - all_states = [] - for t in range(0, self.T): - scope_name = "timestep_{}".format(t) - # Parameters of all timesteps are shared - with ParameterSharing({scope_name: ''}),\ - scope.NameScope(scope_name): - timestep = model.param_init_net.ConstantFill( - [], "timestep", value=t, shape=[1], - dtype=core.DataType.INT32, - device_option=core.DeviceOption(caffe2_pb2.CPU)) - states = self.cell._apply( - model=model, - input_t=split_inputs[t], - seq_lengths=seq_lengths, - states=states, - timestep=timestep, - ) - all_states.append(states) - - all_states = zip(*all_states) - all_states = [ - model.net.Concat( - list(full_output), - [ - str(full_output[0])[len("timestep_0/"):] + "_concat", - str(full_output[0])[len("timestep_0/"):] + "_concat_info" - - ], - axis=0)[0] - for full_output in all_states - ] - # Interleave the state values similar to - # - # x = [1, 3, 5] - # y = [2, 4, 6] - # z = [val for pair in zip(x, y) for val in pair] - # # z is [1, 2, 3, 4, 5, 6] - # - # and returns it as outputs - outputs = tuple( - state for state_pair in zip(all_states, states) for state in state_pair - ) - outputs_without_grad = set(range(len(outputs))) - set( - outputs_with_grads) - for i in outputs_without_grad: - model.net.ZeroGradient(outputs[i], []) - logging.debug("Added 0 gradients for blobs:", - [outputs[i] for i in outputs_without_grad]) - - final_output = self.cell._prepare_output_sequence(model, outputs) - - return final_output, outputs - - -def GetLSTMParamNames(): - weight_params 
= ["input_gate_w", "forget_gate_w", "output_gate_w", "cell_w"] - bias_params = ["input_gate_b", "forget_gate_b", "output_gate_b", "cell_b"] - return {'weights': weight_params, 'biases': bias_params} - - -def InitFromLSTMParams(lstm_pblobs, param_values): - ''' - Set the parameters of LSTM based on predefined values - ''' - weight_params = GetLSTMParamNames()['weights'] - bias_params = GetLSTMParamNames()['biases'] - for input_type in param_values.keys(): - weight_values = [ - param_values[input_type][w].flatten() - for w in weight_params - ] - wmat = np.array([]) - for w in weight_values: - wmat = np.append(wmat, w) - bias_values = [ - param_values[input_type][b].flatten() - for b in bias_params - ] - bm = np.array([]) - for b in bias_values: - bm = np.append(bm, b) - - weights_blob = lstm_pblobs[input_type]['weights'] - bias_blob = lstm_pblobs[input_type]['biases'] - cur_weight = workspace.FetchBlob(weights_blob) - cur_biases = workspace.FetchBlob(bias_blob) - - workspace.FeedBlob( - weights_blob, - wmat.reshape(cur_weight.shape).astype(np.float32)) - workspace.FeedBlob( - bias_blob, - bm.reshape(cur_biases.shape).astype(np.float32)) - - -def cudnn_LSTM(model, input_blob, initial_states, dim_in, dim_out, - scope, recurrent_params=None, input_params=None, - num_layers=1, return_params=False): - ''' - CuDNN version of LSTM for GPUs. - input_blob Blob containing the input. Will need to be available - when param_init_net is run, because the sequence lengths - and batch sizes will be inferred from the size of this - blob. - initial_states tuple of (hidden_init, cell_init) blobs - dim_in input dimensions - dim_out output/hidden dimension - scope namescope to apply - recurrent_params dict of blobs containing values for recurrent - gate weights, biases (if None, use random init values) - See GetLSTMParamNames() for format. - input_params dict of blobs containing values for input - gate weights, biases (if None, use random init values) - See GetLSTMParamNames() for format. - num_layers number of LSTM layers - return_params if True, returns (param_extract_net, param_mapping) - where param_extract_net is a net that when run, will - populate the blobs specified in param_mapping with the - current gate weights and biases (input/recurrent). - Useful for assigning the values back to non-cuDNN - LSTM. 
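[Editor's sketch, not part of the diff] For reference, the packing performed by InitFromLSTMParams above flattens the four gate blocks and concatenates them in GetLSTMParamNames() order before reshaping to the fused weight blob's shape. A numpy-only sketch of that layout, with hypothetical sizes and a fused (4H, D) shape assumed for illustration:

    import numpy as np

    gate_order = ["input_gate_w", "forget_gate_w", "output_gate_w", "cell_w"]
    H, D = 4, 3  # hidden and input sizes, illustrative only
    per_gate = {name: np.random.randn(H, D).astype(np.float32) for name in gate_order}

    # Flatten each gate block, concatenate in order, then reshape into a fused
    # (4H, D) matrix; row blocks then line up with the gate order above.
    flat = np.concatenate([per_gate[name].flatten() for name in gate_order])
    fused = flat.reshape(4 * H, D)
    assert np.array_equal(fused[H:2 * H], per_gate["forget_gate_w"])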
- ''' - with core.NameScope(scope): - weight_params = GetLSTMParamNames()['weights'] - bias_params = GetLSTMParamNames()['biases'] - - input_weight_size = dim_out * dim_in - upper_layer_input_weight_size = dim_out * dim_out - recurrent_weight_size = dim_out * dim_out - input_bias_size = dim_out - recurrent_bias_size = dim_out - - def init(layer, pname, input_type): - input_weight_size_for_layer = input_weight_size if layer == 0 else \ - upper_layer_input_weight_size - if pname in weight_params: - sz = input_weight_size_for_layer if input_type == 'input' \ - else recurrent_weight_size - elif pname in bias_params: - sz = input_bias_size if input_type == 'input' \ - else recurrent_bias_size - else: - assert False, "unknown parameter type {}".format(pname) - return model.param_init_net.UniformFill( - [], - "lstm_init_{}_{}_{}".format(input_type, pname, layer), - shape=[sz]) - - # Multiply by 4 since we have 4 gates per LSTM unit - first_layer_sz = input_weight_size + recurrent_weight_size + \ - input_bias_size + recurrent_bias_size - upper_layer_sz = upper_layer_input_weight_size + \ - recurrent_weight_size + input_bias_size + \ - recurrent_bias_size - total_sz = 4 * (first_layer_sz + (num_layers - 1) * upper_layer_sz) - - weights = model.create_param( - 'lstm_weight', - shape=[total_sz], - initializer=Initializer('UniformFill'), - tags=ParameterTags.WEIGHT, - ) - - lstm_args = { - 'hidden_size': dim_out, - 'rnn_mode': 'lstm', - 'bidirectional': 0, # TODO - 'dropout': 1.0, # TODO - 'input_mode': 'linear', # TODO - 'num_layers': num_layers, - 'engine': 'CUDNN' - } - - param_extract_net = core.Net("lstm_param_extractor") - param_extract_net.AddExternalInputs([input_blob, weights]) - param_extract_mapping = {} - - # Populate the weights-blob from blobs containing parameters for - # the individual components of the LSTM, such as forget/input gate - # weights and bises. Also, create a special param_extract_net that - # can be used to grab those individual params from the black-box - # weights blob. 
These results can then be fed to InitFromLSTMParams() - for input_type in ['input', 'recurrent']: - param_extract_mapping[input_type] = {} - p = recurrent_params if input_type == 'recurrent' else input_params - if p is None: - p = {} - for pname in weight_params + bias_params: - for j in range(0, num_layers): - values = p[pname] if pname in p else init(j, pname, input_type) - model.param_init_net.RecurrentParamSet( - [input_blob, weights, values], - weights, - layer=j, - input_type=input_type, - param_type=pname, - **lstm_args - ) - if pname not in param_extract_mapping[input_type]: - param_extract_mapping[input_type][pname] = {} - b = param_extract_net.RecurrentParamGet( - [input_blob, weights], - ["lstm_{}_{}_{}".format(input_type, pname, j)], - layer=j, - input_type=input_type, - param_type=pname, - **lstm_args - ) - param_extract_mapping[input_type][pname][j] = b - - (hidden_input_blob, cell_input_blob) = initial_states - output, hidden_output, cell_output, rnn_scratch, dropout_states = \ - model.net.Recurrent( - [input_blob, hidden_input_blob, cell_input_blob, weights], - ["lstm_output", "lstm_hidden_output", "lstm_cell_output", - "lstm_rnn_scratch", "lstm_dropout_states"], - seed=random.randint(0, 100000), # TODO: dropout seed - **lstm_args - ) - model.net.AddExternalOutputs( - hidden_output, cell_output, rnn_scratch, dropout_states) - - if return_params: - param_extract = param_extract_net, param_extract_mapping - return output, hidden_output, cell_output, param_extract - else: - return output, hidden_output, cell_output - - -def LSTMWithAttention( - model, - decoder_inputs, - decoder_input_lengths, - initial_decoder_hidden_state, - initial_decoder_cell_state, - initial_attention_weighted_encoder_context, - encoder_output_dim, - encoder_outputs, - encoder_lengths, - decoder_input_dim, - decoder_state_dim, - scope, - attention_type=AttentionType.Regular, - outputs_with_grads=(0, 4), - weighted_encoder_outputs=None, - lstm_memory_optimization=False, - attention_memory_optimization=False, - forget_bias=0.0, - forward_only=False, -): - ''' - Adds an LSTM with an attention mechanism to a model. - - The implementation is based on https://arxiv.org/abs/1409.0473, with - a small difference in the order in which we compute the new attention - context and the new hidden state, similar to - https://arxiv.org/abs/1508.04025. - - The model uses encoder-decoder naming conventions, - where the decoder is the sequence the op is iterating over, - while computing the attention context over the encoder. - - model: ModelHelper object that new operators will be added to - - decoder_inputs: the input sequence, in the format T x N x D, - where T is the sequence size, N the batch size and D the input dimension - - decoder_input_lengths: blob containing sequence lengths, - which will be passed to the LSTMUnit operator - - initial_decoder_hidden_state: initial hidden state of LSTM - - initial_decoder_cell_state: initial cell state of LSTM - - initial_attention_weighted_encoder_context: initial attention context - - encoder_output_dim: dimension of encoder outputs - - encoder_outputs: the sequence on which we compute the attention context - at every iteration - - encoder_lengths: a tensor with the lengths of each encoder sequence in the - batch (may be None, meaning all encoder sequences are of the same length) - - decoder_input_dim: input dimension (last dimension of decoder_inputs) - - decoder_state_dim: size of hidden states of LSTM - - attention_type: One of: AttentionType.Regular, AttentionType.Recurrent. - Determines which type of attention mechanism to use. - - outputs_with_grads: position indices of output blobs which will receive - external error gradient during backpropagation - - weighted_encoder_outputs: encoder outputs to be used to compute attention - weights. In the basic case it's just a linear transformation of - encoder outputs (that is the default when weighted_encoder_outputs is None). - However, it can be something more complicated - like a separate - encoder network (for example, in the case of a convolutional encoder) - - lstm_memory_optimization: recompute LSTM activations on the backward pass, - so we don't need to store their values from the forward pass - - attention_memory_optimization: recompute attention for the backward pass - - forward_only: whether to create only the forward pass - ''' - cell = LSTMWithAttentionCell( - encoder_output_dim=encoder_output_dim, - encoder_outputs=encoder_outputs, - encoder_lengths=encoder_lengths, - decoder_input_dim=decoder_input_dim, - decoder_state_dim=decoder_state_dim, - name=scope, - attention_type=attention_type, - weighted_encoder_outputs=weighted_encoder_outputs, - forget_bias=forget_bias, - lstm_memory_optimization=lstm_memory_optimization, - attention_memory_optimization=attention_memory_optimization, - forward_only=forward_only, - ) - initial_states = [ - initial_decoder_hidden_state, - initial_decoder_cell_state, - initial_attention_weighted_encoder_context, - ] - if attention_type == AttentionType.SoftCoverage: - initial_states.append(cell.build_initial_coverage(model)) - _, result = cell.apply_over_sequence( - model=model, - inputs=decoder_inputs, - seq_lengths=decoder_input_lengths, - initial_states=initial_states, - outputs_with_grads=outputs_with_grads, - ) - return result - - -def _layered_LSTM( - model, input_blob, seq_lengths, initial_states, - dim_in, dim_out, scope, outputs_with_grads=(0,), return_params=False, - memory_optimization=False, forget_bias=0.0, forward_only=False, - drop_states=False, create_lstm=None): - params = locals() # leave this as the first line to grab all params - params.pop('create_lstm') - if not isinstance(dim_out, list): - return create_lstm(**params) - elif len(dim_out) == 1: - params['dim_out'] = dim_out[0] - return create_lstm(**params) - - assert len(dim_out) != 0, "dim_out list can't be empty" - assert return_params is False, "return_params not supported for layering" - for i, output_dim in enumerate(dim_out): - params.update({ - 'dim_out': output_dim - }) - output, last_output, all_states, last_state = create_lstm(**params) - params.update({ - 'input_blob': output, - 'dim_in': output_dim, - 'initial_states': (last_output, last_state), - 'scope': scope + '_layer_{}'.format(i + 1) - }) - return output, last_output, all_states, last_state - - -layered_LSTM = functools.partial(_layered_LSTM, create_lstm=LSTM) diff --git a/caffe2/python/schema.py b/caffe2/python/schema.py deleted file mode 100644 index 924afed41bd5..000000000000 --- a/caffe2/python/schema.py +++ /dev/null @@ -1,1318 +0,0 @@ -## @package schema -# Module caffe2.python.schema -""" -Defines a minimal set of data types that allow representing datasets with -arbitrary nested structure, including objects of variable length, such as -maps and lists. - -This defines a columnar storage format for such datasets on top of caffe2 -tensors. In terms of representational capacity, it can represent most of -the data types supported by the Parquet, ORC and DWRF file formats.
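As a concrete illustration of the columnar layout this module implements, here is a standalone sketch (plain Python, no caffe2 imports, field names made up) of how one nested list-of-structs column decomposes into flat per-leaf columns:

```python
# Two rows of a List(Struct(id, score)) column:
#   row 0 -> [{'id': 1, 'score': 0.5}, {'id': 2, 'score': 0.25}]
#   row 1 -> []
lengths = [2, 0]        # one entry per row: the size of each list
ids = [1, 2]            # flattened 'values:id' column
scores = [0.5, 0.25]    # flattened 'values:score' column

# Reassembling the nested rows from the flat columns:
rows, offset = [], 0
for n in lengths:
    rows.append([{'id': i, 'score': s}
                 for i, s in zip(ids[offset:offset + n],
                                 scores[offset:offset + n])])
    offset += n
assert rows == [[{'id': 1, 'score': 0.5}, {'id': 2, 'score': 0.25}], []]
```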
- -See comments in operator_test/dataset_ops_test.py for an example and -walkthrough on how to use schema to store and iterate through a structured -in-memory dataset. -""" - - - - - -import logging -import numpy as np -from caffe2.python import core -from caffe2.python import workspace -from caffe2.python.core import BlobReference -from collections import OrderedDict, namedtuple -from past.builtins import basestring -from itertools import islice -from io import StringIO -from typing import Sequence - -logger = logging.getLogger(__name__) - -FIELD_SEPARATOR = ':' - - -def _join_field_name(prefix, suffix): - if prefix and suffix: - return '{}{}{}'.format(prefix, FIELD_SEPARATOR, suffix) - elif prefix: - return prefix - elif suffix: - return suffix - else: - return '' - - -def _normalize_field(field_or_type_or_blob, keep_blobs=True): - """Clones/normalizes a field before adding it to a container.""" - if isinstance(field_or_type_or_blob, Field): - return field_or_type_or_blob.clone(keep_blobs=keep_blobs) - elif type(field_or_type_or_blob) in (type, np.dtype): - return Scalar(dtype=field_or_type_or_blob) - else: - return Scalar(blob=field_or_type_or_blob) - - -FeatureSpec = namedtuple( - 'FeatureSpec', - [ - 'feature_type', - 'feature_names', - 'feature_ids', - 'feature_is_request_only', - 'desired_hash_size', - 'feature_to_index', - ] -) - -# pyre-fixme[16]: `FeatureSpec.__new__` has no attribute `__defaults__` -FeatureSpec.__new__.__defaults__ = (None, None, None, None, None, None) - - -class Metadata( - namedtuple( - 'Metadata', ['categorical_limit', 'expected_value', 'feature_specs'] - ) -): - """Represents additional information associated with a scalar in schema. - - `categorical_limit` - for fields of integral type that are guaranteed to be - non-negative it specifies the maximum possible value plus one. It's often - used as a size of an embedding table. - - `expected_value` - anticipated average value of elements in the field. - Usually makes sense for length fields of lists. - - `feature_specs` - information about the features contained in this - field. For example, if a field has more than one feature, it can hold the - list of feature names contained in it.""" - __slots__: Sequence[str] = () - - -# pyre-fixme[16]: `Metadata.__new__` has no attribute `__defaults__` -Metadata.__new__.__defaults__ = (None, None, None) - - -class Field: - """Represents an abstract field type in a dataset. - """ - - __slots__: Sequence[str] = ("_parent", "_field_offsets") - - def __init__(self, children): - """Derived classes must call this after their initialization.""" - self._parent = (None, 0) - offset = 0 - self._field_offsets = [] - for child in children: - self._field_offsets.append(offset) - offset += len(child.field_names()) - self._field_offsets.append(offset) - - def clone_schema(self): - return self.clone(keep_blobs=False) - - def field_names(self): - """Return the children field names for this field.""" - raise NotImplementedError('Field is an abstract class.') - - def field_types(self): - """Return the numpy.dtype for each of the children fields.""" - raise NotImplementedError('Field is an abstract class.') - - def field_metadata(self): - """Return the Metadata for each of the children fields.""" - raise NotImplementedError('Field is an abstract class.') - - def field_blobs(self): - """Return the list of blobs with contents for this Field. - Values can either be all numpy.ndarray or BlobReference. - Throws if any of the fields doesn't have a blob. - """ - raise NotImplementedError('Field is an abstract class.') - - def all_scalars(self): - """Return the list of all Scalar instances in the Field. - The order is the same as for field_names() or field_blobs()""" - raise NotImplementedError('Field is an abstract class.') - - def has_blobs(self): - """Return True if every scalar of this field has blobs.""" - raise NotImplementedError('Field is an abstract class.') - - def clone(self, keep_blobs=True): - """Clone this Field along with its children.""" - raise NotImplementedError('Field is an abstract class.') - - def _set_parent(self, parent, relative_id): - self._parent = (parent, relative_id) - - def slice(self): - """ - Returns a slice representing the range of field ids that belong to - this field. This slice can be used to index a list of fields. - - E.g.: - - >>> s = Struct( - >>> ('a', Scalar()), - >>> ('b', Struct( - >>> ('b1', Scalar()), - >>> ('b2', Scalar()), - >>> )), - >>> ('c', Scalar()), - >>> ) - >>> field_data = ['da', 'db1', 'db2', 'dc'] - >>> field_data[s.b.slice()] - ['db1', 'db2'] - """ - base_id = self._child_base_id() - return slice(base_id, base_id + len(self.field_names())) - - def _child_base_id(self, child_index=None): - """Get the base id of the given child""" - p, i = self._parent - pos = 0 if child_index is None else self._field_offsets[child_index] - if p: - pos += p._child_base_id(i) - return pos - - def __eq__(self, other): - """Equivalence of two schemas""" - return ( - (self.field_names() == other.field_names()) and - (self.field_types() == other.field_types()) and - (self.field_metadata() == other.field_metadata()) - ) - - def _pprint_impl(self, indent, str_buffer): - raise NotImplementedError('Field is an abstract class.') - - def __repr__(self): - str_buffer = StringIO() - self._pprint_impl(0, str_buffer) - contents = str_buffer.getvalue() - str_buffer.close() - return contents
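The offset bookkeeping in Field.__init__ above is what drives slice(): each child owns a contiguous range of leaf-field slots. A standalone sketch of the same arithmetic (hypothetical leaf counts, plain Python):

```python
# Leaf-field counts for three children, e.g. a Scalar, a Struct with
# three leaves, and another Scalar:
children_leaf_counts = [1, 3, 1]

offsets, total = [], 0
for n in children_leaf_counts:
    offsets.append(total)
    total += n
offsets.append(total)   # trailing sentinel, as in Field.__init__

assert offsets == [0, 1, 4, 5]
# Child i then spans slice(offsets[i], offsets[i + 1]) relative to the
# parent's base id, which is what slice() returns.
```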
- """ - raise NotImplementedError('Field is an abstract class.') - - def all_scalars(self): - """Return the list of all Scalar instances in the Field. - The order is the same as for field_names() or field_blobs()""" - raise NotImplementedError('Field is an abstract class.') - - def has_blobs(self): - """Return True if every scalar of this field has blobs.""" - raise NotImplementedError('Field is an abstract class.') - - def clone(self, keep_blobs=True): - """Clone this Field along with its children.""" - raise NotImplementedError('Field is an abstract class.') - - def _set_parent(self, parent, relative_id): - self._parent = (parent, relative_id) - - def slice(self): - """ - Returns a slice representing the range of field ids that belong to - this field. This slice can be used to index a list of fields. - - E.g.: - - >>> s = Struct( - >>> ('a', Scalar()), - >>> ('b', Struct( - >>> ('b1', Scalar()), - >>> ('b2', Scalar()), - >>> )), - >>> ('c', Scalar()), - >>> ) - >>> field_data = ['da', 'db1', 'db2', 'dc'] - >>> field_data[s.b.split()] - ['db1', 'db2'] - """ - base_id = self._child_base_id() - return slice(base_id, base_id + len(self.field_names())) - - def _child_base_id(self, child_index=None): - """Get the base id of the given child""" - p, i = self._parent - pos = 0 if child_index is None else self._field_offsets[child_index] - if p: - pos += p._child_base_id(i) - return pos - - def __eq__(self, other): - """Equivalance of two schemas""" - return ( - (self.field_names() == other.field_names()) and - (self.field_types() == other.field_types()) and - (self.field_metadata() == other.field_metadata()) - ) - - def _pprint_impl(self, indent, str_buffer): - raise NotImplementedError('Field is an abstract class.') - - def __repr__(self): - str_buffer = StringIO() - self._pprint_impl(0, str_buffer) - contents = str_buffer.getvalue() - str_buffer.close() - return contents - - -class List(Field): - """Represents a variable-length list. - - Values of a list can also be complex fields such as Lists and Structs. - In addition to the fields exposed by its `values` field, a List exposes an - additional `lengths` field, which will contain the size of each list under - the parent domain. 
- """ - - __slots__: Sequence[str] = ("lengths", "_items") - - def __init__(self, values, lengths_blob=None): - if isinstance(lengths_blob, Field): - assert isinstance(lengths_blob, Scalar) - self.lengths = _normalize_field(lengths_blob) - else: - self.lengths = Scalar(np.int32, lengths_blob) - self._items = _normalize_field(values) - self.lengths._set_parent(self, 0) - self._items._set_parent(self, 1) - super().__init__([self.lengths, self._items]) - - def field_names(self): - value_fields = self._items.field_names() - return ( - ['lengths'] + [_join_field_name('values', v) for v in value_fields] - ) - - def field_types(self): - return self.lengths.field_types() + self._items.field_types() - - def field_metadata(self): - return self.lengths.field_metadata() + self._items.field_metadata() - - def field_blobs(self): - return self.lengths.field_blobs() + self._items.field_blobs() - - def all_scalars(self): - return self.lengths.all_scalars() + self._items.all_scalars() - - def has_blobs(self): - return self.lengths.has_blobs() and self._items.has_blobs() - - def clone(self, keep_blobs=True): - return type(self)( - _normalize_field(self._items, keep_blobs=keep_blobs), - _normalize_field(self.lengths, keep_blobs=keep_blobs) - ) - - def _pprint_impl(self, indent, str_buffer): - str_buffer.write(' ' * indent + "List(\n") - str_buffer.write(' ' * (indent + 1) + "lengths=\n") - self.lengths._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * (indent + 1) + "_items=\n") - self._items._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * indent + ")\n") - - def __getattr__(self, item): - """If the value of this list is a struct, - allow to introspect directly into its fields.""" - if item.startswith('__'): - raise AttributeError(item) - if isinstance(self._items, Struct): - return getattr(self._items, item) - elif item == 'value' or item == 'items': - return self._items - else: - raise AttributeError('Field not found in list: %s.' % item) - - def __getitem__(self, item): - names = item.split(FIELD_SEPARATOR, 1) - - if len(names) == 1: - if item == 'lengths': - return self.lengths - elif item == 'values': - return self._items - else: - if names[0] == 'values': - return self._items[names[1]] - raise KeyError('Field not found in list: %s.' % item) - - -class ListWithEvicted(List): - """ - This class is similar with List, but containing extra field evicted_values for - LRU Hashing. 
- """ - - __slots__: Sequence[str] = ("_evicted_values",) - - def __init__(self, values, lengths_blob=None, evicted_values=None): - if isinstance(evicted_values, Field): - assert isinstance(evicted_values, Scalar) - self._evicted_values = _normalize_field(evicted_values) - else: - self._evicted_values = Scalar(np.int64, evicted_values) - super().__init__(values, lengths_blob=lengths_blob) - - def field_names(self): - value_fields = self._items.field_names() - return ( - ['lengths'] + [_join_field_name('values', v) for v in value_fields] + ["_evicted_values"] - ) - - def field_types(self): - return self.lengths.field_types() + self._items.field_types() + self._evicted_values.field_types() - - def field_metadata(self): - return self.lengths.field_metadata() + self._items.field_metadata() + self._evicted_values.field_metadata() - - def field_blobs(self): - return self.lengths.field_blobs() + self._items.field_blobs() + self._evicted_values.field_blobs() - - def all_scalars(self): - return self.lengths.all_scalars() + self._items.all_scalars() + self._evicted_values.all_scalars() - - def has_blobs(self): - return self.lengths.has_blobs() and self._items.has_blobs() + self._evicted_values.has_blobs() - - def clone(self, keep_blobs=True): - return type(self)( - _normalize_field(self._items, keep_blobs=keep_blobs), - _normalize_field(self.lengths, keep_blobs=keep_blobs), - _normalize_field(self._evicted_values, keep_blobs=keep_blobs) - ) - - def _pprint_impl(self, indent, str_buffer): - str_buffer.write(' ' * indent + "ListWithEvicted(\n") - str_buffer.write(' ' * (indent + 1) + "lengths=\n") - self.lengths._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * (indent + 1) + "_items=\n") - self._items._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * (indent + 1) + "_evicted_values=\n") - self._evicted_values._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * indent + ")\n") - - - def __getattr__(self, item): - """If the value of this list is a struct, - allow to introspect directly into its fields.""" - if item.startswith('__'): - raise AttributeError(item) - if item == "_evicted_values": - return self._evicted_values - if isinstance(self._items, Struct): - return getattr(self._items, item) - elif item == 'value' or item == 'items': - return self._items - else: - raise AttributeError('Field not found in list: %s.' % item) - - def __getitem__(self, item): - names = item.split(FIELD_SEPARATOR, 1) - - if len(names) == 1: - if item == 'lengths': - return self.lengths - elif item == 'values': - return self._items - elif item == '_evicted_values': - return self._evicted_values - else: - if names[0] == 'values': - return self._items[names[1]] - raise KeyError('Field not found in list: %s.' % item) - - -class Struct(Field): - """Represents a named list of fields sharing the same domain. - """ - - __slots__: Sequence[str] = ("fields", "_frozen") - - def __init__(self, *fields): - """ fields is a list of tuples in format of (name, field). The name is - a string of nested name, e.g., `a`, `a:b`, `a:b:c`. 
For example - - Struct( - ('a', Scalar()), - ('b:c', Scalar()), - ('b:d:e', Scalar()), - ('b', Struct( - ('f', Scalar()), - )), - ) - - is equal to - - Struct( - ('a', Scalar()), - ('b', Struct( - ('c', Scalar()), - ('d', Struct(('e', Scalar()))), - ('f', Scalar()), - )), - ) - """ - for field in fields: - assert len(field) == 2 - assert field[0], 'Field names cannot be empty' - assert field[0] != 'lengths', ( - 'Struct cannot contain a field named `lengths`.' - ) - fields = [(name, _normalize_field(field)) for name, field in fields] - self.fields = OrderedDict() - for name, field in fields: - if FIELD_SEPARATOR in name: - name, field = self._struct_from_nested_name(name, field) - if name not in self.fields: - self.fields[name] = field - continue - if ( - not isinstance(field, Struct) or - not isinstance(self.fields[name], Struct) - ): - raise ValueError('Duplicate field name: %s' % name) - self.fields[name] = self.fields[name] + field - for id, (_, field) in enumerate(self.fields.items()): - field._set_parent(self, id) - super().__init__(self.fields.values()) - self._frozen = True - - def _struct_from_nested_name(self, nested_name, field): - def create_internal(nested_name, field): - names = nested_name.split(FIELD_SEPARATOR, 1) - if len(names) == 1: - added_field = field - else: - added_field = create_internal(names[1], field) - return Struct((names[0], added_field)) - - names = nested_name.split(FIELD_SEPARATOR, 1) - assert len(names) >= 2 - return names[0], create_internal(names[1], field) - - def get_children(self): - return list(self.fields.items()) - - def field_names(self): - names = [] - for name, field in self.fields.items(): - names += [_join_field_name(name, f) for f in field.field_names()] - return names - - def field_types(self): - types = [] - for field in self.fields.values(): - types += field.field_types() - return types - - def field_metadata(self): - metadata = [] - for field in self.fields.values(): - metadata += field.field_metadata() - return metadata - - def field_blobs(self): - blobs = [] - for field in self.fields.values(): - blobs += field.field_blobs() - return blobs - - def all_scalars(self): - scalars = [] - for field in self.fields.values(): - scalars += field.all_scalars() - return scalars - - def has_blobs(self): - return all(field.has_blobs() for field in self.fields.values()) - - def clone(self, keep_blobs=True): - normalized_fields = [ - (k, _normalize_field(v, keep_blobs=keep_blobs)) - for k, v in self.fields.items() - ] - return type(self)(*normalized_fields) - - def _get_field_by_nested_name(self, nested_name): - names = nested_name.split(FIELD_SEPARATOR, 1) - field = self.fields.get(names[0], None) - - if field is None: - return None - - if len(names) == 1: - return field - - try: - return field[names[1]] - except (KeyError, TypeError): - return None - - def _pprint_impl(self, indent, str_buffer): - str_buffer.write(' ' * indent + "Struct( \n") - for name, field in self.fields.items(): - str_buffer.write(' ' * (indent + 1) + "{}=".format(name) + "\n") - field._pprint_impl(indent=indent + 2, str_buffer=str_buffer) - str_buffer.write(' ' * indent + ") \n") - - def __contains__(self, item): - field = self._get_field_by_nested_name(item) - return field is not None - - def __len__(self): - return len(self.fields) - - def __getitem__(self, item): - """ - item can be a tuple or list of ints or strings, or a single - int or string. String item is a nested field name, e.g., "a", "a:b", - "a:b:c". 
Int item is the index of a field at the first level of the - Struct. - """ - if isinstance(item, list) or isinstance(item, tuple): - keys = list(self.fields.keys()) - return Struct( - * [ - ( - keys[k] - if isinstance(k, int) else k, self[k] - ) for k in item - ] - ) - elif isinstance(item, int): - return next(islice(self.fields.values(), item, None)) - else: - field = self._get_field_by_nested_name(item) - if field is None: - raise KeyError('field "%s" not found' % (item)) - return field - - def get(self, item, default_value): - """ - Similar to Python's dictionary get method: returns the field named - `item` if found (i.e. self.item is valid), otherwise returns - default_value. - - It's syntactic sugar for Python's builtin getattr method. - """ - return getattr(self, item, default_value) - - def __getattr__(self, item): - if item.startswith('__'): - raise AttributeError(item) - try: - return super().__getattribute__("fields")[item] - except KeyError as e: - raise AttributeError(item) from e - - def __setattr__(self, key, value): - # Disable setting attributes after initialization to prevent false - # impression of being able to overwrite a field. - # Allowing setting internal states mainly so that _parent can be set - # post initialization. - if getattr(self, '_frozen', None) and not key.startswith('_'): - raise TypeError('Struct.__setattr__() is disabled after __init__()') - super().__setattr__(key, value) - - def __add__(self, other): - """ - Allows merging the fields of two schema.Struct objects using the '+' - operator. If the two Structs have common field names, the merge is - conducted recursively. Here are examples: - - Example 1 - s1 = Struct(('a', Scalar())) - s2 = Struct(('b', Scalar())) - s1 + s2 == Struct( - ('a', Scalar()), - ('b', Scalar()), - ) - - Example 2 - s1 = Struct( - ('a', Scalar()), - ('b', Struct(('c', Scalar()))), - ) - s2 = Struct(('b', Struct(('d', Scalar())))) - s1 + s2 == Struct( - ('a', Scalar()), - ('b', Struct( - ('c', Scalar()), - ('d', Scalar()), - )), - ) - """ - if not isinstance(other, Struct): - return NotImplemented - - children = OrderedDict(self.get_children()) - for name, right_field in other.get_children(): - if name not in children: - children[name] = right_field - continue - left_field = children[name] - if not (isinstance(left_field, Struct) and isinstance(right_field, Struct)): - raise TypeError( - "Type of left_field, " + str(type(left_field)) + - ", and type of right_field, " + - str(type(right_field)) + - ", must both be Struct to allow merging of the field, " + name) - children[name] = left_field + right_field - - return Struct(*(children.items())) - - def __sub__(self, other): - """ - Allows removing the common fields of two schema.Struct objects from - self using the '-' operator. If the two Structs have common field - names, the removal is conducted recursively. If a child struct has no - fields inside, it will be removed from its parent.
Here are examples: - - Example 1 - s1 = Struct( - ('a', Scalar()), - ('b', Scalar()), - ) - s2 = Struct(('a', Scalar())) - s1 - s2 == Struct(('b', Scalar())) - - Example 2 - s1 = Struct( - ('b', Struct( - ('c', Scalar()), - ('d', Scalar()), - )) - ) - s2 = Struct( - ('b', Struct(('c', Scalar()))), - ) - s1 - s2 == Struct( - ('b', Struct( - ('d', Scalar()), - )), - ) - - Example 3 - s1 = Struct( - ('a', Scalar()), - ('b', Struct( - ('d', Scalar()), - )) - ) - s2 = Struct( - ('b', Struct( - ('c', Scalar()), - ('d', Scalar()), - )), - ) - s1 - s2 == Struct( - ('a', Scalar()), - ) - """ - if not isinstance(other, Struct): - return NotImplemented - - children = OrderedDict(self.get_children()) - for name, right_field in other.get_children(): - if name in children: - left_field = children[name] - if type(left_field) == type(right_field): - if isinstance(left_field, Struct): - child = left_field - right_field - if child.get_children(): - children[name] = child - continue - children.pop(name) - else: - raise TypeError( - "Type of left_field, " + str(type(left_field)) + - ", is not the same as that of right_field, " + - str(type(right_field)) + - ", yet they have the same field name, " + name) - return Struct(*(children.items())) - - -class Scalar(Field): - """Represents a typed scalar or tensor of fixed shape. - - A Scalar is a leaf in a schema tree, translating to exactly one tensor in - the dataset's underlying storage. - - Usually, the tensor storing the actual values of this field is a 1D tensor, - representing a series of values in its domain. It is possible however to - have higher rank values stored as a Scalar, as long as all entries have - the same shape. - - E.g.: - - Scalar(np.float64) - - Scalar field of type float64. Caffe2 will expect readers and - datasets to expose it as a 1D tensor of doubles (vector), where - the size of the vector is determined by this field's domain. - - Scalar((np.int32, 5)) - - Tensor field of type int32. Caffe2 will expect readers and - datasets to implement it as a 2D tensor (matrix) of shape (L, 5), - where L is determined by this field's domain. - - Scalar((str, (10, 20))) - - Tensor field of type str. Caffe2 will expect readers and - datasets to implement it as a 3D tensor of shape (L, 10, 20), - where L is determined by this field's domain. - - If the field type is unknown at construction time, call Scalar(); it will - default to np.void as its dtype. - - It is an error to pass a structured dtype to Scalar, since it would contain - more than one field. Instead, use from_dtype, which will construct - a nested `Struct` field reflecting the given dtype's structure. - - A Scalar can also contain a blob, which represents the value of this - Scalar. A blob can be either a numpy.ndarray, in which case it contains the - actual contents of the Scalar, or a BlobReference, which represents a - blob living in a caffe2 Workspace. If a blob of a different type is passed, - a conversion to numpy.ndarray is attempted. - """
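A standalone numpy sketch of the dtype conventions described in the Scalar docstring above (illustrative only):

```python
import numpy as np

# Scalar((np.int32, 5)) corresponds to the subarray dtype below: each
# entry is a length-5 row of int32.
dt = np.dtype((np.int32, (5,)))
assert dt.base == np.int32 and dt.shape == (5,)

# Note that numpy collapses a subarray length of 1 into a plain dtype,
# which is why Scalar.set() below special-cases shape-(1,) declarations.
assert np.dtype((np.int32, 1)) == np.dtype(np.int32)
```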
- """ - - __slots__: Sequence[str] = ("_metadata", "dtype", "_original_dtype", "_blob") - - def __init__(self, dtype=None, blob=None, metadata=None): - self._metadata = None - self.set(dtype, blob, metadata, unsafe=True) - super().__init__([]) - - def field_names(self): - return [''] - - def field_type(self): - return self.dtype - - def field_types(self): - return [self.dtype] - - def field_metadata(self): - return [self._metadata] - - def has_blobs(self): - return self._blob is not None - - def field_blobs(self): - assert self._blob is not None, 'Value is not set for this field.' - return [self._blob] - - def all_scalars(self): - return [self] - - def clone(self, keep_blobs=True): - return Scalar( - dtype=self._original_dtype, - blob=self._blob if keep_blobs else None, - metadata=self._metadata - ) - - def get(self): - """Gets the current blob of this Scalar field.""" - assert self._blob is not None, 'Value is not set for this field.' - return self._blob - - def __call__(self): - """Shortcut for self.get()""" - return self.get() - - @property - def metadata(self): - return self._metadata - - def set_metadata(self, value): - assert isinstance(value, Metadata), \ - 'metadata must be Metadata, got {}'.format(type(value)) - self._metadata = value - self._validate_metadata() - - def _validate_metadata(self): - if self._metadata is None: - return - if (self._metadata.categorical_limit is not None and - self.dtype is not None): - assert np.issubdtype(self.dtype, np.integer), \ - "`categorical_limit` can be specified only in integral " + \ - "fields but got {}".format(self.dtype) - - def set_value(self, blob, throw_on_type_mismatch=False, unsafe=False): - """Sets only the blob field still validating the existing dtype""" - if self.dtype.base != np.void and throw_on_type_mismatch: - assert isinstance(blob, np.ndarray), "Got {!r}".format(blob) - assert blob.dtype.base == self.dtype.base, ( - "Expected {}, got {}".format(self.dtype.base, blob.dtype.base)) - self.set(dtype=self._original_dtype, blob=blob, unsafe=unsafe) - - def set(self, dtype=None, blob=None, metadata=None, unsafe=False): - """Set the type and/or blob of this scalar. See __init__ for details. - - Args: - dtype: can be any numpy type. If not provided and `blob` is - provided, it will be inferred. If no argument is provided, - this Scalar will be of type np.void. - blob: if provided, can be either a BlobReference or a - numpy.ndarray. If a value of different type is passed, - a conversion to numpy.ndarray is attempted. Strings aren't - accepted, since they can be ambiguous. If you want to pass - a string, to either BlobReference(blob) or np.array(blob). - metadata: optional instance of Metadata, if provided overrides - the metadata information of the scalar - """ - if not unsafe: - logger.warning( - "Scalar should be considered immutable. Only call Scalar.set() " - "on newly created Scalar with unsafe=True. This will become an " - "error soon." - ) - if blob is not None and isinstance(blob, basestring): - raise ValueError( - 'Passing str blob to Scalar.set() is ambiguous. ' - 'Do either set(blob=np.array(blob)) or ' - 'set(blob=BlobReference(blob))' - ) - - self._original_dtype = dtype - # Numpy will collapse a shape of 1 into an unindexed data array (shape = ()), - # which betrays the docstring of this class (which expects shape = (1,)). 
- # >>> import numpy as np - # >> np.dtype((np.int32, 1)) - # dtype('int32') - # >>> np.dtype((np.int32, 5)) - # dtype(('<i4', (5,))) - if dtype is not None and isinstance(dtype, tuple) and dtype[1] == 1: - dtype = (dtype[0], (1,)) - if dtype is not None: - if isinstance(dtype, tuple) and dtype[0] == np.void: - raise TypeError( - "Cannot set the Scalar with type {} for blob {}. " - "If this blob is the output of some operation, " - "please verify the input of that operation has " - "proper type.".format(dtype, blob) - ) - dtype = np.dtype(dtype) - # If blob is not None and it is not a BlobReference, we assume that - # it is actually a numpy array. - if blob is not None and not isinstance(blob, BlobReference): - preserve_shape = isinstance(blob, np.ndarray) - if dtype is not None and dtype.base != np.void: - blob = np.array(blob, dtype=dtype.base) - # if the array is empty we may need to reshape a little - if blob.size == 0 and not preserve_shape: - blob = blob.reshape((0, ) + dtype.shape) - else: - assert isinstance(blob, np.ndarray), ( - 'Invalid blob type: %s' % str(type(blob))) - - # reshape scalars into 1D arrays - # TODO: remove after transition to more strict blob type checking - if len(blob.shape) == 0 and not preserve_shape: - blob = blob.reshape((1, )) - - # infer inner shape from the blob given - # TODO: change to require_same_shape when abundant mismatch - # cases are eliminated - if (len(blob.shape) > 1 and dtype is not None and - dtype.base != np.void): - dtype = np.dtype((dtype.base, blob.shape[1:])) - # if we were still unable to infer the dtype - if dtype is None: - dtype = np.dtype(np.void) - assert not dtype.fields, ( - 'Cannot create Scalar with a structured dtype. ' + - 'Use from_dtype instead.' - ) - self.dtype = dtype - self._blob = blob - if metadata is not None: - self.set_metadata(metadata) - self._validate_metadata() - - def set_type(self, dtype): - self._original_dtype = dtype - if dtype is not None: - self.dtype = np.dtype(dtype) - else: - self.dtype = np.dtype(np.void) - self._validate_metadata() - - def _pprint_impl(self, indent, str_buffer): - str_buffer.write(' ' * (indent) + - 'Scalar({!r}, {!r}, {!r})'.format( - self.dtype, self._blob, self._metadata) + "\n") - - def id(self): - """ - Return the zero-indexed position of this scalar field in its schema. - Used in order to index into the field_blob list returned by readers or - accepted by writers. - """ - return self._child_base_id() - - -def Map( - keys, - values, - keys_name='keys', - values_name='values', - lengths_blob=None -): - """A map is a List of Struct containing keys and values fields. - Optionally, you can provide custom names for the key and value fields. - """ - return List( - Struct((keys_name, keys), (values_name, values)), - lengths_blob=lengths_blob - ) - -def MapWithEvicted( - keys, - values, - keys_name='keys', - values_name='values', - lengths_blob=None, - evicted_values=None -): - """A map with an extra field, evicted_values - """ - return ListWithEvicted( - Struct((keys_name, keys), (values_name, values)), - lengths_blob=lengths_blob, - evicted_values=evicted_values - ) - - -def NamedTuple(name_prefix, *fields): - return Struct(* [('%s_%d' % (name_prefix, i), field) - for i, field in enumerate(fields)]) - - -def Tuple(*fields): - """ - Creates a Struct with default, sequential, field names of given types. - """ - return NamedTuple('field', *fields) - - -def RawTuple(num_fields, name_prefix='field'): - """ - Creates a tuple of `num_fields` untyped scalars. - """ - assert isinstance(num_fields, int) - assert num_fields >= 0 - return NamedTuple(name_prefix, *([np.void] * num_fields)) - - -def from_dtype(dtype, _outer_shape=()): - """Constructs a Caffe2 schema from the given numpy dtype. - - Numpy supports scalar, array-like and structured datatypes, as long as - all the shapes are fixed. This function breaks down the given dtype into - a Caffe2 schema containing `Struct` and `Scalar` types. - - Fields containing byte offsets are not currently supported. - """
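One detail worth illustrating before the implementation: iterating a structured numpy dtype's `.fields` mapping yields only the field names, so the loop below unpacks the `(dtype, offset)` pairs via `.items()`. A standalone numpy sketch:

```python
import numpy as np

dt = np.dtype([('label', np.int32), ('score', np.float32)])

# Iterating dt.fields directly yields just the names...
assert list(dt.fields) == ['label', 'score']

# ...while .items() yields (name, (dtype, byte_offset)) pairs:
for name, (fdtype, offset) in dt.fields.items():
    print(name, fdtype, offset)   # e.g. label int32 0, score float32 4
```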
- """ - if not isinstance(dtype, np.dtype): - # wrap into a ndtype - shape = _outer_shape - dtype = np.dtype((dtype, _outer_shape)) - else: - # concatenate shapes if necessary - shape = _outer_shape + dtype.shape - if shape != dtype.shape: - dtype = np.dtype((dtype.base, shape)) - - if not dtype.fields: - return Scalar(dtype) - - struct_fields = [] - for name, (fdtype, offset) in dtype.fields: - assert offset == 0, ('Fields with byte offsets are not supported.') - struct_fields += (name, from_dtype(fdtype, _outer_shape=shape)) - return Struct(*struct_fields) - - -class _SchemaNode: - """This is a private class used to represent a Schema Node""" - - __slots__: Sequence[str] = ("name", "children", "type_str", "field") - - def __init__(self, name, type_str=''): - self.name = name - self.children = [] - self.type_str = type_str - self.field = None - - def add_child(self, name, type_str=''): - for child in self.children: - if child.name == name and child.type_str == type_str: - return child - child = _SchemaNode(name, type_str) - self.children.append(child) - return child - - def get_field(self): - - list_names = ['lengths', 'values'] - map_names = ['lengths', 'keys', 'values'] - - if len(self.children) == 0 or self.field is not None: - if self.field is None: - return Struct() - else: - return self.field - - child_names = [] - for child in self.children: - child_names.append(child.name) - - if (set(child_names) == set(list_names)): - for child in self.children: - if child.name == 'values': - values_field = child.get_field() - else: - lengths_field = child.get_field() - self.field = List( - values_field, - lengths_blob=lengths_field - ) - self.type_str = "List" - return self.field - elif (set(child_names) == set(map_names)): - for child in self.children: - if child.name == 'keys': - key_field = child.get_field() - elif child.name == 'values': - values_field = child.get_field() - else: - lengths_field = child.get_field() - self.field = Map( - key_field, - values_field, - lengths_blob=lengths_field - ) - self.type_str = "Map" - return self.field - - else: - struct_fields = [] - for child in self.children: - struct_fields.append((child.name, child.get_field())) - - self.field = Struct(*struct_fields) - self.type_str = "Struct" - return self.field - - def print_recursively(self): - for child in self.children: - child.print_recursively() - logger.info("Printing node: Name and type") - logger.info(self.name) - logger.info(self.type_str) - - -def from_column_list( - col_names, col_types=None, - col_blobs=None, col_metadata=None -): - """ - Given a list of names, types, and optionally values, construct a Schema. - """ - if col_types is None: - col_types = [None] * len(col_names) - if col_metadata is None: - col_metadata = [None] * len(col_names) - if col_blobs is None: - col_blobs = [None] * len(col_names) - assert len(col_names) == len(col_types), ( - 'col_names and col_types must have the same length.' - ) - assert len(col_names) == len(col_metadata), ( - 'col_names and col_metadata must have the same length.' - ) - assert len(col_names) == len(col_blobs), ( - 'col_names and col_blobs must have the same length.' 
- ) - root = _SchemaNode('root', 'Struct') - for col_name, col_type, col_blob, col_md in zip( - col_names, col_types, col_blobs, col_metadata - ): - columns = col_name.split(FIELD_SEPARATOR) - current = root - for i in range(len(columns)): - name = columns[i] - type_str = '' - field = None - if i == len(columns) - 1: - type_str = col_type - field = Scalar( - dtype=col_type, - blob=col_blob, - metadata=col_md - ) - next = current.add_child(name, type_str) - if field is not None: - next.field = field - current = next - - return root.get_field() - - -def from_blob_list(schema, values, throw_on_type_mismatch=False): - """ - Create a schema that clones the given schema, but containing the given - list of values. - """ - assert isinstance(schema, Field), 'Argument `schema` must be a Field.' - if isinstance(values, BlobReference): - values = [values] - record = schema.clone_schema() - scalars = record.all_scalars() - assert len(scalars) == len(values), ( - 'Values must have %d elements, got %d.' % (len(scalars), len(values)) - ) - for scalar, value in zip(scalars, values): - scalar.set_value(value, throw_on_type_mismatch, unsafe=True) - return record - - -def as_record(value): - if isinstance(value, Field): - return value - elif isinstance(value, list) or isinstance(value, tuple): - is_field_list = all( - isinstance(f, tuple) and len(f) == 2 and isinstance(f[0], basestring) - for f in value - ) - if is_field_list: - return Struct(* [(k, as_record(v)) for k, v in value]) - else: - return Tuple(* [as_record(f) for f in value]) - elif isinstance(value, dict): - return Struct(* [(k, as_record(v)) for k, v in value.items()]) - else: - return _normalize_field(value) - - -def FetchRecord(blob_record, ws=None, throw_on_type_mismatch=False): - """ - Given a record containing BlobReferences, return a new record with the same - schema, containing numpy arrays, fetched from the current active workspace. - """ - - def fetch(v): - if ws is None: - return workspace.FetchBlob(str(v)) - else: - return ws.blobs[str(v)].fetch() - - assert isinstance(blob_record, Field) - field_blobs = blob_record.field_blobs() - assert all(isinstance(v, BlobReference) for v in field_blobs) - field_arrays = [fetch(value) for value in field_blobs] - return from_blob_list(blob_record, field_arrays, throw_on_type_mismatch) - - -def FeedRecord(blob_record, arrays, ws=None): - """ - Given a Record containing blob_references and arrays, which is either - a list of numpy arrays or a Record containing numpy arrays, feeds the - record to the current workspace. - """ - - def feed(b, v): - if ws is None: - workspace.FeedBlob(str(b), v) - else: - ws.create_blob(str(b)) - ws.blobs[str(b)].feed(v) - assert isinstance(blob_record, Field) - field_blobs = blob_record.field_blobs() - assert all(isinstance(v, BlobReference) for v in field_blobs) - if isinstance(arrays, Field): - # TODO: check schema - arrays = arrays.field_blobs() - assert len(arrays) == len(field_blobs), ( - 'Values must contain exactly %d ndarrays.' % len(field_blobs) - ) - for blob, array in zip(field_blobs, arrays): - feed(blob, array)
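Together with NewRecord (defined just below), these helpers were typically used as in the following sketch (hypothetical blob contents; assumes the old caffe2 Python package and a live workspace):

```python
import numpy as np
from caffe2.python import core, schema

net = core.Net('init')
record = schema.NewRecord(net, schema.Struct(
    ('x', schema.Scalar(np.float32)),
))
# Feed numpy contents into the record's blobs, then fetch them back:
schema.FeedRecord(record, [np.array([1.0, 2.0], dtype=np.float32)])
fetched = schema.FetchRecord(record)
assert np.array_equal(fetched.x.get(), np.array([1.0, 2.0], dtype=np.float32))
```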
- """ - if isinstance(schema, Scalar): - result = schema.clone() - result.set_value( - blob=net.NextScopedBlob('unnamed_scalar'), - unsafe=True, - ) - return result - - assert isinstance(schema, Field), 'Record must be a schema.Field instance.' - blob_refs = [ - net.NextScopedBlob(prefix=name) - for name in schema.field_names() - ] - return from_blob_list(schema, blob_refs) - - -def ConstRecord(net, array_record): - """ - Given a record of arrays, returns a record of blobs, - initialized with net.Const. - """ - blob_record = NewRecord(net, array_record) - for blob, array in zip( - blob_record.field_blobs(), array_record.field_blobs() - ): - net.Const(array, blob) - return blob_record - - -def InitEmptyRecord(net, schema_or_record, enforce_types=False): - if not schema_or_record.has_blobs(): - record = NewRecord(net, schema_or_record) - else: - record = schema_or_record - - for blob_type, blob in zip(record.field_types(), record.field_blobs()): - try: - data_type = data_type_for_dtype(blob_type) - shape = [0] + list(blob_type.shape) - net.ConstantFill([], blob, shape=shape, dtype=data_type) - except TypeError: - logger.warning("Blob {} has type error".format(blob)) - # If data_type_for_dtype doesn't know how to resolve given numpy - # type to core.DataType, that function can throw type error (for - # example that would happen for cases of unknown types such as - # np.void). This is not a problem for cases when the record if going - # to be overwritten by some operator later, though it might be an - # issue for type/shape inference. - if enforce_types: - raise - # If we don't enforce types for all items we'll create a blob with - # the default ConstantFill (FLOAT, no shape) - net.ConstantFill([], blob, shape=[0]) - - return record - - -_DATA_TYPE_FOR_DTYPE = [ - (str, core.DataType.STRING), - (np.float16, core.DataType.FLOAT16), - (np.float32, core.DataType.FLOAT), - (np.float64, core.DataType.DOUBLE), - (bool, core.DataType.BOOL), - (np.int8, core.DataType.INT8), - (np.int16, core.DataType.INT16), - (np.int32, core.DataType.INT32), - (np.int64, core.DataType.INT64), - (np.uint8, core.DataType.UINT8), - (np.uint16, core.DataType.UINT16), -] - - -def is_schema_subset(schema, original_schema): - # TODO add more checks - return set(schema.field_names()).issubset( - set(original_schema.field_names())) - -def equal_schemas(schema, - original_schema, - check_field_names=True, - check_field_types=True, - check_field_metas=False): - assert isinstance(schema, Field) - assert isinstance(original_schema, Field) - - if check_field_names and ( - schema.field_names() != original_schema.field_names()): - return False - if check_field_types and ( - schema.field_types() != original_schema.field_types()): - return False - if check_field_metas and ( - schema.field_metadata() != original_schema.field_metadata()): - return False - - return True - - -def schema_check(schema, previous=None): - record = as_record(schema) - if previous is not None: - assert equal_schemas(schema, previous) - return record - - -def data_type_for_dtype(dtype): - for np_type, dt in _DATA_TYPE_FOR_DTYPE: - if dtype.base == np_type: - return dt - raise TypeError('Unknown dtype: ' + str(dtype.base)) - - -def dtype_for_core_type(core_type): - for np_type, dt in _DATA_TYPE_FOR_DTYPE: - if dt == core_type: - return np_type - raise TypeError('Unknown core type: ' + str(core_type)) - - -def attach_metadata_to_scalars(field, metadata): - for f in field.all_scalars(): - f.set_metadata(metadata) diff --git a/caffe2/python/schema_test.py 
b/caffe2/python/schema_test.py deleted file mode 100644 index 2f3eaf38dc13..000000000000 --- a/caffe2/python/schema_test.py +++ /dev/null @@ -1,471 +0,0 @@ - - - - - -from caffe2.python import core, schema -import numpy as np - -import unittest -import pickle -import random - -class TestField(unittest.TestCase): - def testInitShouldSetEmptyParent(self): - f = schema.Field([]) - self.assertTupleEqual(f._parent, (None, 0)) - - def testInitShouldSetFieldOffsets(self): - f = schema.Field([ - schema.Scalar(dtype=np.int32), - schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', schema.List(schema.Scalar(dtype=str))), - ), - schema.Scalar(dtype=np.int32), - schema.Struct( - ('field3', schema.Scalar(dtype=np.int32)), - ('field4', schema.List(schema.Scalar(dtype=str))) - ), - schema.Scalar(dtype=np.int32), - ]) - self.assertListEqual(f._field_offsets, [0, 1, 4, 5, 8, 9]) - - def testInitShouldSetFieldOffsetsIfNoChildren(self): - f = schema.Field([]) - self.assertListEqual(f._field_offsets, [0]) - - -class TestDB(unittest.TestCase): - def testPicklable(self): - s = schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', schema.List(schema.Scalar(dtype=str))) - ) - s2 = pickle.loads(pickle.dumps(s)) - for r in (s, s2): - self.assertTrue(isinstance(r.field1, schema.Scalar)) - self.assertTrue(isinstance(r.field2, schema.List)) - self.assertTrue(getattr(r, 'non_existent', None) is None) - - def testListSubclassClone(self): - class Subclass(schema.List): - pass - - s = Subclass(schema.Scalar()) - clone = s.clone() - self.assertIsInstance(clone, Subclass) - self.assertEqual(s, clone) - self.assertIsNot(clone, s) - - def testListWithEvictedSubclassClone(self): - class Subclass(schema.ListWithEvicted): - pass - - s = Subclass(schema.Scalar()) - clone = s.clone() - self.assertIsInstance(clone, Subclass) - self.assertEqual(s, clone) - self.assertIsNot(clone, s) - - def testStructSubclassClone(self): - class Subclass(schema.Struct): - pass - - s = Subclass( - ('a', schema.Scalar()), - ) - clone = s.clone() - self.assertIsInstance(clone, Subclass) - self.assertEqual(s, clone) - self.assertIsNot(clone, s) - - def testNormalizeField(self): - s = schema.Struct(('field1', np.int32), ('field2', str)) - self.assertEqual( - s, - schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', schema.Scalar(dtype=str)) - ) - ) - - def testTuple(self): - s = schema.Tuple(np.int32, str, np.float32) - s2 = schema.Struct( - ('field_0', schema.Scalar(dtype=np.int32)), - ('field_1', schema.Scalar(dtype=str)), - ('field_2', schema.Scalar(dtype=np.float32)) - ) - self.assertEqual(s, s2) - self.assertEqual(s[0], schema.Scalar(dtype=np.int32)) - self.assertEqual(s[1], schema.Scalar(dtype=str)) - self.assertEqual(s[2], schema.Scalar(dtype=np.float32)) - self.assertEqual( - s[2, 0], - schema.Struct( - ('field_2', schema.Scalar(dtype=np.float32)), - ('field_0', schema.Scalar(dtype=np.int32)), - ) - ) - # test iterator behavior - for i, (v1, v2) in enumerate(zip(s, s2)): - self.assertEqual(v1, v2) - self.assertEqual(s[i], v1) - self.assertEqual(s2[i], v1) - - def testRawTuple(self): - s = schema.RawTuple(2) - self.assertEqual( - s, schema.Struct( - ('field_0', schema.Scalar()), ('field_1', schema.Scalar()) - ) - ) - self.assertEqual(s[0], schema.Scalar()) - self.assertEqual(s[1], schema.Scalar()) - - def testStructIndexing(self): - s = schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', schema.List(schema.Scalar(dtype=str))), - ('field3', schema.Struct()), - ) - 
self.assertEqual(s['field2'], s.field2) - self.assertEqual(s['field2'], schema.List(schema.Scalar(dtype=str))) - self.assertEqual(s['field3'], schema.Struct()) - self.assertEqual( - s['field2', 'field1'], - schema.Struct( - ('field2', schema.List(schema.Scalar(dtype=str))), - ('field1', schema.Scalar(dtype=np.int32)), - ) - ) - - def testListInStructIndexing(self): - a = schema.List(schema.Scalar(dtype=str)) - s = schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', a) - ) - self.assertEqual(s['field2:lengths'], a.lengths) - self.assertEqual(s['field2:values'], a.items) - with self.assertRaises(KeyError): - s['fields2:items:non_existent'] - with self.assertRaises(KeyError): - s['fields2:non_existent'] - - def testListWithEvictedInStructIndexing(self): - a = schema.ListWithEvicted(schema.Scalar(dtype=str)) - s = schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', a) - ) - self.assertEqual(s['field2:lengths'], a.lengths) - self.assertEqual(s['field2:values'], a.items) - self.assertEqual(s['field2:_evicted_values'], a._evicted_values) - with self.assertRaises(KeyError): - s['fields2:items:non_existent'] - with self.assertRaises(KeyError): - s['fields2:non_existent'] - - def testMapInStructIndexing(self): - a = schema.Map( - schema.Scalar(dtype=np.int32), - schema.Scalar(dtype=np.float32), - ) - s = schema.Struct( - ('field1', schema.Scalar(dtype=np.int32)), - ('field2', a) - ) - self.assertEqual(s['field2:values:keys'], a.keys) - self.assertEqual(s['field2:values:values'], a.values) - with self.assertRaises(KeyError): - s['fields2:keys:non_existent'] - - def testPreservesMetadata(self): - s = schema.Struct( - ('a', schema.Scalar(np.float32)), ( - 'b', schema.Scalar( - np.int32, - metadata=schema.Metadata(categorical_limit=5) - ) - ), ( - 'c', schema.List( - schema.Scalar( - np.int32, - metadata=schema.Metadata(categorical_limit=6) - ) - ) - ) - ) - # attach metadata to lengths field - s.c.lengths.set_metadata(schema.Metadata(categorical_limit=7)) - - self.assertEqual(None, s.a.metadata) - self.assertEqual(5, s.b.metadata.categorical_limit) - self.assertEqual(6, s.c.value.metadata.categorical_limit) - self.assertEqual(7, s.c.lengths.metadata.categorical_limit) - sc = s.clone() - self.assertEqual(None, sc.a.metadata) - self.assertEqual(5, sc.b.metadata.categorical_limit) - self.assertEqual(6, sc.c.value.metadata.categorical_limit) - self.assertEqual(7, sc.c.lengths.metadata.categorical_limit) - sv = schema.from_blob_list( - s, [ - np.array([3.4]), np.array([2]), np.array([3]), - np.array([1, 2, 3]) - ] - ) - self.assertEqual(None, sv.a.metadata) - self.assertEqual(5, sv.b.metadata.categorical_limit) - self.assertEqual(6, sv.c.value.metadata.categorical_limit) - self.assertEqual(7, sv.c.lengths.metadata.categorical_limit) - - def testDupField(self): - with self.assertRaises(ValueError): - schema.Struct( - ('a', schema.Scalar()), - ('a', schema.Scalar())) - - def testAssignToField(self): - with self.assertRaises(TypeError): - s = schema.Struct(('a', schema.Scalar())) - s.a = schema.Scalar() - - def testPreservesEmptyFields(self): - s = schema.Struct( - ('a', schema.Scalar(np.float32)), - ('b', schema.Struct()), - ) - sc = s.clone() - self.assertIn("a", sc.fields) - self.assertIn("b", sc.fields) - sv = schema.from_blob_list(s, [np.array([3.4])]) - self.assertIn("a", sv.fields) - self.assertIn("b", sv.fields) - self.assertEqual(0, len(sv.b.fields)) - - def testStructSubstraction(self): - s1 = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.Scalar()), 
- ('c', schema.Scalar()), - ) - s2 = schema.Struct( - ('b', schema.Scalar()) - ) - s = s1 - s2 - self.assertEqual(['a', 'c'], s.field_names()) - - s3 = schema.Struct( - ('a', schema.Scalar()) - ) - s = s1 - s3 - self.assertEqual(['b', 'c'], s.field_names()) - - with self.assertRaises(TypeError): - s1 - schema.Scalar() - - def testStructNestedSubstraction(self): - s1 = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.Struct( - ('c', schema.Scalar()), - ('d', schema.Scalar()), - ('e', schema.Scalar()), - ('f', schema.Scalar()), - )), - ) - s2 = schema.Struct( - ('b', schema.Struct( - ('d', schema.Scalar()), - ('e', schema.Scalar()), - )), - ) - s = s1 - s2 - self.assertEqual(['a', 'b:c', 'b:f'], s.field_names()) - - def testStructAddition(self): - s1 = schema.Struct( - ('a', schema.Scalar()) - ) - s2 = schema.Struct( - ('b', schema.Scalar()) - ) - s = s1 + s2 - self.assertIn("a", s.fields) - self.assertIn("b", s.fields) - with self.assertRaises(TypeError): - s1 + s1 - with self.assertRaises(TypeError): - s1 + schema.Scalar() - - def testStructNestedAddition(self): - s1 = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.Struct( - ('c', schema.Scalar()) - )), - ) - s2 = schema.Struct( - ('b', schema.Struct( - ('d', schema.Scalar()) - )) - ) - s = s1 + s2 - self.assertEqual(['a', 'b:c', 'b:d'], s.field_names()) - - s3 = schema.Struct( - ('b', schema.Scalar()), - ) - with self.assertRaises(TypeError): - s = s1 + s3 - - def testGetFieldByNestedName(self): - st = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.Struct( - ('c', schema.Struct( - ('d', schema.Scalar()), - )), - )), - ) - self.assertRaises(KeyError, st.__getitem__, '') - self.assertRaises(KeyError, st.__getitem__, 'x') - self.assertRaises(KeyError, st.__getitem__, 'x:y') - self.assertRaises(KeyError, st.__getitem__, 'b:c:x') - a = st['a'] - self.assertTrue(isinstance(a, schema.Scalar)) - bc = st['b:c'] - self.assertIn('d', bc.fields) - bcd = st['b:c:d'] - self.assertTrue(isinstance(bcd, schema.Scalar)) - - def testAddFieldByNestedName(self): - f_a = schema.Scalar(blob=core.BlobReference('blob1')) - f_b = schema.Struct( - ('c', schema.Struct( - ('d', schema.Scalar(blob=core.BlobReference('blob2'))), - )), - ) - f_x = schema.Struct( - ('x', schema.Scalar(blob=core.BlobReference('blob3'))), - ) - - with self.assertRaises(TypeError): - st = schema.Struct( - ('a', f_a), - ('b', f_b), - ('b:c:d', f_x), - ) - with self.assertRaises(TypeError): - st = schema.Struct( - ('a', f_a), - ('b', f_b), - ('b:c:d:e', f_x), - ) - - st = schema.Struct( - ('a', f_a), - ('b', f_b), - ('e:f', f_x), - ) - self.assertEqual(['a', 'b:c:d', 'e:f:x'], st.field_names()) - self.assertEqual(['blob1', 'blob2', 'blob3'], st.field_blobs()) - - st = schema.Struct( - ('a', f_a), - ('b:c:e', f_x), - ('b', f_b), - ) - self.assertEqual(['a', 'b:c:e:x', 'b:c:d'], st.field_names()) - self.assertEqual(['blob1', 'blob3', 'blob2'], st.field_blobs()) - - st = schema.Struct( - ('a:a1', f_a), - ('b:b1', f_b), - ('a', f_x), - ) - self.assertEqual(['a:a1', 'a:x', 'b:b1:c:d'], st.field_names()) - self.assertEqual(['blob1', 'blob3', 'blob2'], st.field_blobs()) - - def testContains(self): - st = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.Struct( - ('c', schema.Struct( - ('d', schema.Scalar()), - )), - )), - ) - self.assertTrue('a' in st) - self.assertTrue('b:c' in st) - self.assertTrue('b:c:d' in st) - self.assertFalse('' in st) - self.assertFalse('x' in st) - self.assertFalse('b:c:x' in st) - self.assertFalse('b:c:d:x' in st) - - def 
testFromEmptyColumnList(self): - st = schema.Struct() - columns = st.field_names() - rec = schema.from_column_list(col_names=columns) - self.assertEqual(rec, schema.Struct()) - - def testFromColumnList(self): - st = schema.Struct( - ('a', schema.Scalar()), - ('b', schema.List(schema.Scalar())), - ('c', schema.Map(schema.Scalar(), schema.Scalar())) - ) - columns = st.field_names() - # test that recovery works for arbitrary order - for _ in range(10): - some_blobs = [core.BlobReference('blob:' + x) for x in columns] - rec = schema.from_column_list(columns, col_blobs=some_blobs) - self.assertTrue(rec.has_blobs()) - self.assertEqual(sorted(st.field_names()), sorted(rec.field_names())) - self.assertEqual([str(blob) for blob in rec.field_blobs()], - [str('blob:' + name) for name in rec.field_names()]) - random.shuffle(columns) - - def testStructGet(self): - net = core.Net('test_net') - s1 = schema.NewRecord(net, schema.Scalar(np.float32)) - s2 = schema.NewRecord(net, schema.Scalar(np.float32)) - t = schema.Tuple(s1, s2) - assert t.get('field_0', None) == s1 - assert t.get('field_1', None) == s2 - assert t.get('field_2', None) is None - - def testScalarForVoidType(self): - s0_good = schema.Scalar((None, (2, ))) - with self.assertRaises(TypeError): - s0_bad = schema.Scalar((np.void, (2, ))) - - s1_good = schema.Scalar(np.void) - s2_good = schema.Scalar(None) - assert s1_good == s2_good - - def testScalarShape(self): - s0 = schema.Scalar(np.int32) - self.assertEqual(s0.field_type().shape, ()) - - s1_good = schema.Scalar((np.int32, 5)) - self.assertEqual(s1_good.field_type().shape, (5, )) - - with self.assertRaises(ValueError): - s1_bad = schema.Scalar((np.int32, -1)) - - s1_hard = schema.Scalar((np.int32, 1)) - self.assertEqual(s1_hard.field_type().shape, (1, )) - - s2 = schema.Scalar((np.int32, (2, 3))) - self.assertEqual(s2.field_type().shape, (2, 3)) - - def testDtypeForCoreType(self): - dtype = schema.dtype_for_core_type(core.DataType.FLOAT16) - self.assertEqual(dtype, np.float16) - - with self.assertRaises(TypeError): - schema.dtype_for_core_type(100) diff --git a/caffe2/python/scope.py b/caffe2/python/scope.py deleted file mode 100644 index 11fddc7b0f62..000000000000 --- a/caffe2/python/scope.py +++ /dev/null @@ -1,122 +0,0 @@ -## @package scope -# Module caffe2.python.scope - - - - - -import contextlib -import threading -from past.builtins import basestring - -from caffe2.proto import caffe2_pb2 - - -# The name scope and device scope when creating a new operator. -_NAMESCOPE_SEPARATOR = '/' - -_threadlocal_scope = threading.local() - - -def CurrentNameScope(): - global _threadlocal_scope - if not hasattr(_threadlocal_scope, "namescope"): - _threadlocal_scope.namescope = '' - return _threadlocal_scope.namescope - - -def CurrentDeviceScope(): - global _threadlocal_scope - if not hasattr(_threadlocal_scope, "devicescope"): - _threadlocal_scope.devicescope = None - return _threadlocal_scope.devicescope - - -@contextlib.contextmanager -def NameScope(prefix, reset=False): - global _threadlocal_scope - assert isinstance(prefix, basestring) or prefix is None, \ - "NameScope takes in a string as its argument." - old_scope = CurrentNameScope() - prefix = prefix + _NAMESCOPE_SEPARATOR if prefix else '' - if reset: - _threadlocal_scope.namescope = prefix - else: - _threadlocal_scope.namescope = _threadlocal_scope.namescope + prefix - - try: - yield - finally: - assert _threadlocal_scope.namescope.endswith(prefix), \ - "The namescope variable is changed from outside NameScope() calls." 
- _threadlocal_scope.namescope = old_scope - - -@contextlib.contextmanager -def DeviceScope(scope, node_name=None): - new_scope = caffe2_pb2.DeviceOption() - if scope: - assert isinstance(scope, caffe2_pb2.DeviceOption), \ - "DeviceScope takes in a caffe2_pb2.DeviceOption as its argument." - new_scope.CopyFrom(scope) - else: - assert node_name, "At least one argument should be non-null in DeviceScope" - - # rewrite node_name if it is explicitly given - if node_name: - new_scope.node_name = node_name - global _threadlocal_scope - old_scope = CurrentDeviceScope() - # nested scope should inherit the node_name if it is not explicitly set - if old_scope and old_scope.HasField('node_name') and \ - not new_scope.HasField('node_name'): - new_scope.node_name = old_scope.node_name - - # nested scope should inherit the extra_info and merge it with the new extra_info - if old_scope and hasattr(old_scope, 'extra_info'): - new_scope.extra_info.extend(old_scope.extra_info) - new_scope.extra_info.sort() - - _threadlocal_scope.devicescope = new_scope - try: - yield - finally: - assert _threadlocal_scope.devicescope == new_scope, \ - "The device scope is changed from outside DeviceScope() calls." - _threadlocal_scope.devicescope = old_scope - - -@contextlib.contextmanager -def EmptyNameScope(): - """ - Allow users to 'disable' the name scope behaviour. - - This sets the CurrentNameScope() to None, so that the field is - not set in CreateOperator(...), etc. - """ - old_scope = CurrentNameScope() - try: - _threadlocal_scope.namescope = '' - yield - finally: - _threadlocal_scope.namescope = old_scope - return - - -@contextlib.contextmanager -def EmptyDeviceScope(): - """ - Allow users to 'disable' the device scope behaviour (so it can be - controlled at a NetDef::DeviceOption level, not overridden at - OperatorDef::DeviceOption level). - - This sets the CurrentDeviceScope() to None, so that the field is - not set in CreateOperator(...), etc.
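The thread-local nesting behaviour that NameScope implements above can be sketched in isolation (a simplified standalone mirror, not the original module):

```python
import contextlib
import threading

_tls = threading.local()

def current_name_scope():
    return getattr(_tls, 'namescope', '')

@contextlib.contextmanager
def name_scope(prefix):
    # Append to the current thread's scope; restore the old value on exit.
    old = current_name_scope()
    _tls.namescope = old + prefix + '/'
    try:
        yield
    finally:
        _tls.namescope = old

with name_scope('master'):
    with name_scope('sub'):
        assert current_name_scope() == 'master/sub/'
    assert current_name_scope() == 'master/'
assert current_name_scope() == ''
```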
- """ - old_scope = CurrentDeviceScope() - try: - _threadlocal_scope.devicescope = None - yield - finally: - _threadlocal_scope.devicescope = old_scope - return diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py deleted file mode 100644 index c2498cd800d8..000000000000 --- a/caffe2/python/scope_test.py +++ /dev/null @@ -1,152 +0,0 @@ - - - - - -from caffe2.python import scope, core, workspace - -import unittest -import threading -import time - -SUCCESS_COUNT = 0 - - -def thread_runner(idx, testobj): - global SUCCESS_COUNT - testobj.assertEquals(scope.CurrentNameScope(), "") - testobj.assertEquals(scope.CurrentDeviceScope(), None) - namescope = "namescope_{}".format(idx) - dsc = core.DeviceOption(workspace.GpuDeviceType, idx) - with scope.DeviceScope(dsc): - with scope.NameScope(namescope): - testobj.assertEquals(scope.CurrentNameScope(), namescope + "/") - testobj.assertEquals(scope.CurrentDeviceScope(), dsc) - - time.sleep(0.01 + idx * 0.01) - testobj.assertEquals(scope.CurrentNameScope(), namescope + "/") - testobj.assertEquals(scope.CurrentDeviceScope(), dsc) - - testobj.assertEquals(scope.CurrentNameScope(), "") - testobj.assertEquals(scope.CurrentDeviceScope(), None) - SUCCESS_COUNT += 1 - - -class TestScope(unittest.TestCase): - - def testNamescopeBasic(self): - self.assertEqual(scope.CurrentNameScope(), "") - - with scope.NameScope("test_scope"): - self.assertEqual(scope.CurrentNameScope(), "test_scope/") - - self.assertEqual(scope.CurrentNameScope(), "") - - def testNamescopeAssertion(self): - self.assertEqual(scope.CurrentNameScope(), "") - - try: - with scope.NameScope("test_scope"): - self.assertEqual(scope.CurrentNameScope(), "test_scope/") - raise Exception() - except Exception: - pass - - self.assertEqual(scope.CurrentNameScope(), "") - - def testEmptyNamescopeBasic(self): - self.assertEqual(scope.CurrentNameScope(), "") - - with scope.NameScope("test_scope"): - with scope.EmptyNameScope(): - self.assertEqual(scope.CurrentNameScope(), "") - self.assertEqual(scope.CurrentNameScope(), "test_scope/") - - def testDevicescopeBasic(self): - self.assertEqual(scope.CurrentDeviceScope(), None) - - dsc = core.DeviceOption(workspace.GpuDeviceType, 9) - with scope.DeviceScope(dsc): - self.assertEqual(scope.CurrentDeviceScope(), dsc) - - self.assertEqual(scope.CurrentDeviceScope(), None) - - def testEmptyDevicescopeBasic(self): - self.assertEqual(scope.CurrentDeviceScope(), None) - - dsc = core.DeviceOption(workspace.GpuDeviceType, 9) - with scope.DeviceScope(dsc): - self.assertEqual(scope.CurrentDeviceScope(), dsc) - with scope.EmptyDeviceScope(): - self.assertEqual(scope.CurrentDeviceScope(), None) - self.assertEqual(scope.CurrentDeviceScope(), dsc) - self.assertEqual(scope.CurrentDeviceScope(), None) - - def testDevicescopeAssertion(self): - self.assertEqual(scope.CurrentDeviceScope(), None) - - dsc = core.DeviceOption(workspace.GpuDeviceType, 9) - - try: - with scope.DeviceScope(dsc): - self.assertEqual(scope.CurrentDeviceScope(), dsc) - raise Exception() - except Exception: - pass - - self.assertEqual(scope.CurrentDeviceScope(), None) - - def testTags(self): - self.assertEqual(scope.CurrentDeviceScope(), None) - - extra_info1 = ["key1:value1"] - extra_info2 = ["key2:value2"] - extra_info3 = ["key3:value3"] - - extra_info_1_2 = ["key1:value1", "key2:value2"] - extra_info_1_2_3 = ["key1:value1", "key2:value2", "key3:value3"] - - with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info1)): - self.assertEqual(scope.CurrentDeviceScope().extra_info, 
extra_info1) - - with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info2)): - self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info_1_2) - - with scope.DeviceScope(core.DeviceOption(0, extra_info=extra_info3)): - self.assertEqual( - scope.CurrentDeviceScope().extra_info, extra_info_1_2_3 - ) - - self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info_1_2) - self.assertEqual(scope.CurrentDeviceScope().extra_info, extra_info1) - self.assertEqual(scope.CurrentDeviceScope(), None) - - def testMultiThreaded(self): - """ - Test that name/device scope are properly local to the thread - and don't interfere - """ - global SUCCESS_COUNT - self.assertEqual(scope.CurrentNameScope(), "") - self.assertEqual(scope.CurrentDeviceScope(), None) - - threads = [] - for i in range(4): - threads.append(threading.Thread( - target=thread_runner, - args=(i, self), - )) - for t in threads: - t.start() - - with scope.NameScope("master"): - self.assertEqual(scope.CurrentDeviceScope(), None) - self.assertEqual(scope.CurrentNameScope(), "master/") - for t in threads: - t.join() - - self.assertEqual(scope.CurrentNameScope(), "master/") - self.assertEqual(scope.CurrentDeviceScope(), None) - - # Ensure all threads succeeded - self.assertEqual(SUCCESS_COUNT, 4) diff --git a/caffe2/python/serialized_test/README.md b/caffe2/python/serialized_test/README.md deleted file mode 100644 index aadfc5532929..000000000000 --- a/caffe2/python/serialized_test/README.md +++ /dev/null @@ -1,21 +0,0 @@ -# Serialized operator test framework - -Major functionality lives in `serialized_test_util.py` - -## How to use -1. Extend the test case class from `SerializedTestCase` -2. Change the `@given` decorator to `@serialized_test_util.given`. This runs a seeded hypothesis test instance which will generate outputs if desired in addition to the unseeded hypothesis tests normally run. -3. [Optional] Add (or change a call of `unittest.main()` to) `testWithArgs` in `__main__`. This allows you to generate outputs using `python caffe2/python/operator_test/my_test.py -G`. -4. Run your test `python -m pytest caffe2/python/operator_test/my_test.py -G` to generate serialized outputs. They will live in `caffe2/python/serialized_test/data/operator_test`, one zip file per test function. The zip file contains an `inout.npz` file of the inputs, outputs, and meta data (like device type), a `op.pb` file of the operator, and `grad_#.pb` files of the gradients if there are any. Use `-O` to change the output directory. This also generates a markdown document summarizing the coverage of serialized tests. We can disable generating this coverage document using the `-C` flag. -5. Thereafter, runs of the test without the flag will load serialized outputs and gradient operators for comparison against the seeded run. The comparison is done as long as you have a call to assertReferenceChecks. If for any reason the seeded run's inputs are different (this can happen with different hypothesis versions or different setups), then we'll run the serialized inputs through the serialized operator to get a runtime output for comparison. - -## Coverage report -`SerializedTestCoverage.md` contains some statistics about the coverage of serialized tests. It is regenerated every time someone regenerates a serialized test (i.e. running an operator test with the `-G` option). If you run into merge conflicts for the file, please rebase and regenerate. 
If you'd like to disable generating this file when generating the serialized test, you can run with `-G -C`. The logic for generating this file lives in `coverage.py`. - -##Additional Notes - -If we'd like to extend the test framework beyond that for operator tests, we can create a new subfolder for them inside `caffe2/python/serialized_test/data`. - -Note, we currently don't support using other hypothesis decorators on top of `given_and_seeded`. Hypothesis has some handling to explicitly check that `@given` is on the bottom of the decorator stack. - -If there are multiple calls to assertReferenceChecks in a test function, we'll serialize and write the last one. The actual input checked may then differ if we refactor a test function that calls this multiple times, though the serialized test should still pass since we then use the serialized input to generate a dynamic output. diff --git a/caffe2/python/serialized_test/SerializedTestCoverage.md b/caffe2/python/serialized_test/SerializedTestCoverage.md deleted file mode 100644 index 4e61b65ec44d..000000000000 --- a/caffe2/python/serialized_test/SerializedTestCoverage.md +++ /dev/null @@ -1,878 +0,0 @@ -# Serialized Test Coverage Report -This is an automatically generated file. Please see `caffe2/python/serialized_test/README.md` for details. In the case of merge conflicts, please rebase and regenerate. -## Summary -Serialized tests have covered 220/852 (25.8%) operators - -## Not covered operators -
<details>
-There are 632 not covered operators - -* APMeter -* ATen -* Abs -* AbsGradient -* Accumulate -* AccumulateHistogram -* Accuracy -* Adam -* Add -* AddFakeFp16 -* AddGradient -* Alias -* AliasWithName -* Allgather -* Allreduce -* And -* Append -* Assert -* AtomicAppend -* AtomicFetchAdd -* AtomicFetchAdd64 -* AtomicIter -* AveragePool -* AveragePool1D -* AveragePool1DGradient -* AveragePool2D -* AveragePool2DGradient -* AveragePool3D -* AveragePool3DGradient -* AveragePoolGradient -* AveragePut -* BRGNCHWCToPackedInt8BGRAStylizerDeprocess -* Barrier -* BatchMatMulFP16Acc16Fake -* BatchMatMulFP16Acc32Fake -* BatchMatMulFP16Fake -* BatchPermutation -* BatchPermutationGradient -* BatchToSpace -* BisectPercentile -* BitwiseAnd -* BitwiseOr -* BitwiseXor -* BooleanMaskGradient -* BooleanMaskLengths -* Broadcast -* Bucketize -* ByteWeightDequant -* Cast -* Cbrt -* CbrtGradient -* CheckAtomicBool -* CheckCounterDone -* CheckDatasetConsistency -* Checkpoint -* CloneCommonWorld -* CloseBlobsQueue -* CloseRebatchingQueue -* Col2Im -* CollectRpnProposals -* CollectTensor -* ComputeOffset -* ConcatBatchMatMulBatchGatherOp -* ConcatTensorVector -* ConditionalSetAtomicBool -* Conv1D -* Conv1DGradient -* Conv2D -* Conv2DGradient -* Conv3D -* Conv3DGradient -* ConvRelu -* ConvTranspose -* ConvTransposeGradient -* Copy -* CopyCPUToGPU -* CopyFromCPUInput -* CopyGPUToCPU -* CopyOnDeviceLike -* CopyRowsToTensor -* CopyRowsToTensorGradient -* Cos -* CosGradient -* CosineSimilarity -* CosineSimilarityGradient -* CountDown -* CountUp -* Crash -* CreateAtomicBool -* CreateBlobsQueue -* CreateBlobsQueueDB -* CreateCommonWorld -* CreateCounter -* CreateDB -* CreateMap -* CreateMutex -* CreateRebatchingQueue -* CreateScope -* CreateTensorVector -* CreateTextFileReader -* CreateTreeCursor -* CrossEntropy -* CrossEntropyGradient -* Cube -* CubeGradient -* DBExists -* DataCouple -* DeformConv -* DeformConvGradient -* DenseVectorToIdList -* DepthConcat -* DepthSplit -* DequeueBlobs -* DequeueRebatchingQueue -* DestroyCommonWorld -* DistributeFpnProposals -* Div -* DivFakeFp16 -* DivGradient -* Do -* DotProductWithPadding -* DotProductWithPaddingGradient -* EQ -* EnforceFinite -* EnqueueBlobs -* EnqueueRebatchingQueue -* EnsureCPUOutput -* EnsureClipped -* EnsureDense -* Equalizer -* Exp -* ExpandDims -* FCFp16X -* FCGradient_Decomp -* FCGradient_Prune -* FCTransposed -* FCTransposedGradient -* FC_Decomp -* FC_Prune -* FC_Sparse -* FP16MomentumSGDUpdate -* FP32MomentumSGDUpdate -* FP32ToFP16Fake -* FP32ToFP16FakeNoSubnormal -* Fail -* FbFCPacked -* FbGemmPack -* FbGemmPackTranspose -* FeedBlob -* FilterExampleIds -* FilterSparseLabels -* Flatten -* FlattenToVec -* Float16ConstantFill -* Float16UniformFill -* FloatToFused2BitFakeRowwiseQuantized -* FloatToFused2BitRowwiseQuantized -* FloatToFused4BitFakeRowwiseQuantized -* FloatToFused4BitRowwiseQuantized -* FloatToFused8BitRowwiseQuantized -* FloatToFused8BitRowwiseQuantizedHalfScaleBias -* FloatToFusedRandRowwiseQuantized -* FloatToHalf -* FloatToRowwiseQuantized8Bits -* Fp16FC -* Fp16FCAcc16 -* Fp16FCAcc16NNPI -* Fp16FCAcc32 -* Fp16FCAcc32NNPI -* Free -* Ftrl -* FunHash -* FunHashGradient -* Fused2BitRowwiseQuantizedToFloat -* Fused2BitRowwiseQuantizedToHalf -* Fused4BitRowwiseQuantizedToFloat -* Fused4BitRowwiseQuantizedToHalf -* Fused8BitRowwiseQuantizedHalfScaleBiasToFloat -* Fused8BitRowwiseQuantizedHalfScaleBiasToHalfFloat -* Fused8BitRowwiseQuantizedToFloat -* Fused8BitRowwiseQuantizedToHalfFloat -* FusedRandRowwiseQuantizedToFloat -* GE -* GFtrl -* GT -* 
GatherByKey -* GatherFused8BitRowwise -* GaussianFill -* Gelu -* GeluGradient -* GenerateProposals -* GenerateProposalsCPP -* GetAllBlobNames -* GetCursorOffset -* GetGPUMemoryUsage -* GivenTensorBoolFill -* GivenTensorByteStringToUInt8Fill -* GivenTensorDoubleFill -* GivenTensorFill -* GivenTensorInt16Fill -* GivenTensorInt64Fill -* GivenTensorIntFill -* GivenTensorStringFill -* HSoftmax -* HSoftmaxGradient -* HSoftmaxSearch -* HalfFloatToFused8BitRowwiseQuantized -* HalfFloatToFused8BitRowwiseQuantizedHalfScaleBias -* HalfToFloat -* HalfToFused2BitFakeRowwiseQuantized -* HalfToFused2BitRowwiseQuantized -* HalfToFused4BitFakeRowwiseQuantized -* HalfToFused4BitRowwiseQuantized -* HardSigmoid -* HardSigmoidGradient -* HasElements -* HasScope -* HeatmapMaxKeypoint -* Histogram -* HuffmanTreeHierarchy -* If -* Im2Col -* ImageInput -* IncrementPut -* IndexFreeze -* IndexGet -* IndexLoad -* IndexSize -* IndexStore -* InferenceLSTM -* Int8Add -* Int8AddRelu -* Int8AveragePool -* Int8AveragePoolRelu -* Int8ChannelShuffle -* Int8Concat -* Int8Conv -* Int8ConvRelu -* Int8ConvTranspose -* Int8Dequantize -* Int8DequantizeNNPI -* Int8FC -* Int8FCFakeAcc32NNPI -* Int8Flatten -* Int8GivenIntTensorFill -* Int8GivenTensorFill -* Int8LeakyRelu -* Int8MaxPool -* Int8MaxPoolRelu -* Int8Quantize -* Int8QuantizeNNPI -* Int8Relu -* Int8Reshape -* Int8ResizeNearest -* Int8RoIAlign -* Int8Sigmoid -* Int8Slice -* Int8Softmax -* Int8Sum -* Int8SumRelu -* Int8Transpose -* IntIndexCreate -* IsEmpty -* IsNaN -* Iter -* KeySplit -* KeyValueToMap -* L1Distance -* L1DistanceGradient -* LC1D -* LC1DGradient -* LC2D -* LC2DGradient -* LC3D -* LC3DGradient -* LE -* LRN -* LRNGradient -* LSTMUnit -* LSTMUnitGradient -* LT -* LabelCrossEntropy -* LabelCrossEntropyGradient -* LambdaRankNdcg -* LambdaRankNdcgGradient -* Lars -* LastNWindowCollector -* LayerNorm -* LayerNormFakeFP16 -* LengthsIndicesInGradientMeanGradient -* LengthsIndicesInGradientSumGradient -* LengthsMax -* LengthsMaxWithMainInputAndForwardOutputGradient -* LengthsMean -* LengthsMeanGradient -* LengthsPartition -* LengthsSumGradient -* LengthsToSegmentIds -* LengthsToShape -* LengthsToWeights -* LengthsWeightedSum -* LengthsWeightedSumGradient -* LengthsWeightedSumWithMainInputGradient -* Load -* LogFatal -* Logit -* LogitGradient -* LongIndexCreate -* LpNorm -* LpNormGradient -* LpPool -* LpPoolGradient -* MSRAFill -* MakeTwoClass -* MakeTwoClassGradient -* MapToKeyValue -* MaxPool -* MaxPool1D -* MaxPool1DGradient -* MaxPool2D -* MaxPool2DGradient -* MaxPool3D -* MaxPool3DGradient -* MaxPoolGradient -* MaxPoolWithIndex -* MaxPoolWithIndexGradient -* MergeDim -* MergeExampleIds -* MergeMultiListFeatureTensors -* MergeMultiListFeatureTensorsGradient -* MergeMultiMapFeatureTensors -* MergeMultiMapFeatureTensorsGradient -* MergeMultiScalarFeatureTensors -* MergeMultiScalarFeatureTensorsGradient -* MergeSingleListFeatureTensors -* MergeSingleListFeatureTensorsGradient -* MergeSingleMapFeatureTensors -* MergeSingleMapFeatureTensorsGradient -* MergeSingleScalarFeatureTensors -* MergeSingleScalarFeatureTensorsGradient -* Mod -* MomentumSGDUpdate -* MulFakeFp16 -* MulGradient -* MultiClassAccuracy -* NCHW2NHWC -* NE -* NGramFromCategorical -* NHWC2NCHW -* Normalize -* NormalizeGradient -* NormalizeL1 -* NormalizePlanarYUV -* Not -* Onnxifi -* Or -* PRelu -* PReluGradient -* PackRecords -* PackedInt8BGRANHWCToNCHWCStylizerPreprocess -* PadEmptySamples -* Partition -* Percentile -* Perplexity -* PrependDim -* Print -* Python -* PythonDLPack -* PythonDLPackGradient -* 
PythonGradient -* QuantDecode -* QuantDecodeGradient -* QuantDecompZstd -* Quantile -* RMACRegions -* Range -* RangeFill -* ReadNextBatch -* ReadRandomBatch -* ReceiveTensor -* Reciprocal -* ReciprocalGradient -* RecurrentNetworkBlobFetcher -* Reduce -* ReduceBackSum -* ReduceBackSumGradient -* ReduceFrontWeightedSum -* ReduceFrontWeightedSumGradient -* ReduceL1 -* ReduceL1Gradient -* ReduceScatter -* ReduceSum -* ReduceSumGradient -* ReduceTailSum -* ReluFakeFp16 -* RemovePadding -* ReplaceNaN -* ReservoirSampling -* ResetCounter -* ResetCursor -* Reshape -* ResizeLike -* ResizeNearest -* ResizeNearest3D -* ResizeNearest3DGradient -* ResizeNearestGradient -* RetrieveCount -* RmsProp -* RoIAlign -* RoIAlignGradient -* RoIAlignRotated -* RoIAlignRotatedGradient -* RoIPool -* RoIPoolGradient -* RowMul -* RowWiseCounter -* RowWiseSparseAdagradFusedWithSparseLengthsSumGradient -* RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradient -* RowWiseSparseAdagradFusedWithSparseLengthsWeightedSumGradientApprox -* RowWiseSparseAdam -* Rowwise8BitQuantizedToFloat -* Rsqrt -* RsqrtGradient -* SafeDequeueBlobs -* SafeEnqueueBlobs -* Save -* Scale -* ScaleBlobs -* Scatter -* ScriptModule -* ScriptModuleLoad -* SegmentIdsToLengths -* SegmentIdsToRanges -* SendTensor -* Shape -* Sigmoid -* SigmoidCrossEntropyWithLogits -* SigmoidCrossEntropyWithLogitsGradient -* SigmoidFakeFp16 -* SigmoidFakeFp16NNPI -* SigmoidGradient -* Sin -* SinGradient -* Snapshot -* Softplus -* SoftplusGradient -* Softsign -* SoftsignGradient -* SortAndShuffle -* SortedSegmentMean -* SortedSegmentMeanGradient -* SortedSegmentRangeLogMeanExp -* SortedSegmentRangeLogMeanExpGradient -* SortedSegmentRangeLogSumExp -* SortedSegmentRangeLogSumExpGradient -* SortedSegmentRangeMax -* SortedSegmentRangeMaxGradient -* SortedSegmentRangeMean -* SortedSegmentRangeMeanGradient -* SortedSegmentRangeSum -* SortedSegmentRangeSumGradient -* SortedSegmentSum -* SortedSegmentSumGradient -* SortedSegmentWeightedSum -* SortedSegmentWeightedSumGradient -* SpaceToBatch -* SparseAdagradFusedWithSparseLengthsSumGradient -* SparseAdagradFusedWithSparseLengthsWeightedSumGradient -* SparseAdagradFusedWithSparseLengthsWeightedSumGradientApprox -* SparseAdam -* SparseDropoutWithReplacement -* SparseFtrl -* SparseFunHash -* SparseFunHashGradient -* SparseLabelSplit -* SparseLabelSplitGradient -* SparseLabelToBool -* SparseLabelToDense -* SparseLengthsIndicesInGradientMeanGradient -* SparseLengthsIndicesInGradientSumGradient -* SparseLengthsIndicesInGradientWeightedSumGradient -* SparseLengthsIndicesInGradientWeightedSumWithMainInputGradient -* SparseLengthsMean -* SparseLengthsMean2BitRowwiseSparse -* SparseLengthsMean4BitRowwiseSparse -* SparseLengthsMean8BitRowwiseSparse -* SparseLengthsMean8BitsRowwise -* SparseLengthsMeanFakeFP16 -* SparseLengthsMeanFakeFP16AccFP16 -* SparseLengthsMeanFakeFP16EmbeddingOnly -* SparseLengthsMeanFused2BitRowwise -* SparseLengthsMeanFused4BitRowwise -* SparseLengthsMeanFused8BitRowwise -* SparseLengthsMeanFused8BitRowwiseFakeFP16 -* SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16 -* SparseLengthsMeanGradient -* SparseLengthsPositionalWeightedSum -* SparseLengthsSum -* SparseLengthsSum2BitRowwiseSparse -* SparseLengthsSum4BitRowwiseSparse -* SparseLengthsSum8BitRowwiseSparse -* SparseLengthsSum8BitsRowwise -* SparseLengthsSumFakeFP16 -* SparseLengthsSumFakeFP16AccFP16 -* SparseLengthsSumFakeFP16EmbeddingOnly -* SparseLengthsSumFused2BitRowwise -* SparseLengthsSumFused4BitRowwise -* 
SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly -* SparseLengthsSumFused4BitRowwiseFakeFP16NNPI -* SparseLengthsSumFused8BitRowwise -* SparseLengthsSumFused8BitRowwiseFakeFP16 -* SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16 -* SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16 -* SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly -* SparseLengthsSumFused8BitRowwiseFakeFP16NNPI -* SparseLengthsSumFused8BitRowwiseFakeFP32NNPI -* SparseLengthsSumGradient -* SparseLengthsSumSparseLookup -* SparseLengthsWeightedMean8BitsRowwise -* SparseLengthsWeightedSum -* SparseLengthsWeightedSum2BitRowwiseSparse -* SparseLengthsWeightedSum4BitRowwiseSparse -* SparseLengthsWeightedSum8BitRowwiseSparse -* SparseLengthsWeightedSum8BitsRowwise -* SparseLengthsWeightedSumFakeFP16 -* SparseLengthsWeightedSumFakeFP16AccFP16 -* SparseLengthsWeightedSumFakeFP16EmbeddingOnly -* SparseLengthsWeightedSumFused2BitRowwise -* SparseLengthsWeightedSumFused4BitRowwise -* SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly -* SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI -* SparseLengthsWeightedSumFused8BitRowwise -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP16 -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16 -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16 -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI -* SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI -* SparseLengthsWeightedSumGradient -* SparseLengthsWeightedSumWithMainInputGradient -* SparseLpRegularizer -* SparseMatrixReshape -* SparseNormalize -* SparseSortedSegmentMean -* SparseSortedSegmentMeanGradient -* SparseSortedSegmentSum -* SparseSortedSegmentSumGradient -* SparseSortedSegmentWeightedSum -* SparseSortedSegmentWeightedSumGradient -* SparseStorm -* SparseToDense -* SparseToDenseMask -* SparseToDenseMaskGradient -* SparseUnsortedSegmentMean -* SparseUnsortedSegmentMeanGradient -* SparseUnsortedSegmentSum -* SparseUnsortedSegmentSumGradient -* SparseUnsortedSegmentWeightedSum -* SparseUnsortedSegmentWeightedSumGradient -* SpatialBNFakeFp16NNPI -* SpatialBNFakeLoweredFp16NNPI -* SpatialBNRelu -* Sqr -* SqrFakeFp16 -* Sqrt -* SquaredL2Distance -* SquaredL2DistanceGradient -* Squeeze -* StatRegistryCreate -* StatRegistryExport -* StatRegistryUpdate -* StdDevPut -* StopGradient -* Storm -* StringIndexCreate -* StringJoin -* StringPrefix -* StringSuffix -* StumpFunc -* StumpFuncIndex -* SubFakeFp16 -* SubGradient -* SumFakeFp16 -* SumInt -* SumRelu -* Summarize -* SwapBestPath -* Swish -* SwishGradient -* TT -* TTContraction -* TTContractionGradient -* TTLinearGradient -* TTPad -* TTPadGradient -* TanhFakeFp16 -* TanhFakeFp16NNPI -* TensorProtosDBInput -* TensorVectorSize -* TextFileReaderRead -* ThrowChildThreadException -* ThrowException -* TimerBegin -* TimerEnd -* TimerGet -* TimerGetAndEnd -* TrimDataset -* UnPackRecords -* UniformFill -* UniformIntFill -* UniqueUniformFill -* UnsortedSegmentMean -* UnsortedSegmentMeanGradient -* UnsortedSegmentSum -* UnsortedSegmentSumGradient -* UnsortedSegmentWeightedSum -* UnsortedSegmentWeightedSumGradient -* VariableLengthSequencePadding -* VideoInput -* ViterbiPath -* WallClockTime -* WeightScale -* WeightedMultiSampling -* WeightedSample -* WeightedSampleDequeueBlobs -* WeightedSigmoidCrossEntropyWithLogits -* WeightedSigmoidCrossEntropyWithLogitsGradient -* While -* XavierFill -* Xor -* YellowFin -* ZeroGradient -
</details> - -## Covered operators -<details>
-There are 220 covered operators - -* Acos -* AcosGradient -* Adadelta -* Adagrad -* AddPadding -* AffineChannel -* AffineChannelGradient -* ArgMax -* ArgMin -* Asin -* AsinGradient -* Atan -* AtanGradient -* AveragedLoss -* AveragedLossGradient -* BBoxTransform -* BatchBoxCox -* BatchBucketOneHot -* BatchBucketize -* BatchDenseToSparse -* BatchGather -* BatchGatherGradient -* BatchMatMul -* BatchMoments -* BatchMomentsGradient -* BatchOneHot -* BatchSparseToDense -* BernoulliJSD -* BernoulliJSDGradient -* BooleanMask -* BooleanUnmask -* BoxWithNMSLimit -* CTCBeamSearchDecoder -* CTCGreedyDecoder -* Ceil -* ChannelBackpropStats -* ChannelShuffle -* ChannelShuffleGradient -* ChannelStats -* Clip -* ClipGradient -* ClipTensorByScaling -* CollectAndDistributeFpnRpnProposals -* ColwiseMax -* ColwiseMaxGradient -* Concat -* Conditional -* ConstantFill -* Conv -* ConvGradient -* Cosh -* CoshGradient -* CosineEmbeddingCriterion -* CosineEmbeddingCriterionGradient -* DiagonalFill -* DotProduct -* DotProductGradient -* Dropout -* DropoutGrad -* ElementwiseLinear -* ElementwiseLinearGradient -* Elu -* EluGradient -* Erf -* ErfGradient -* Expand -* ExpandGradient -* FC -* FCGradient -* Find -* FindDuplicateElements -* FlexibleTopK -* FlexibleTopKGradient -* Floor -* GRUUnit -* GRUUnitGradient -* Gather -* GatherPadding -* GatherRanges -* GatherRangesToDense -* Glu -* GroupNorm -* GroupNormGradient -* IndexHash -* InstanceNorm -* InstanceNormGradient -* IntegralImage -* IntegralImageGradient -* IsMemberOf -* LC -* LCGradient -* LayerNormGradient -* LeakyRelu -* LeakyReluGradient -* LearningRate -* LearningRateAdaption -* LengthsGather -* LengthsPad -* LengthsRangeFill -* LengthsSplit -* LengthsSum -* LengthsTile -* LengthsToRanges -* LengthsTopK -* LengthsTopKGradient -* Log -* MarginRankingCriterion -* MarginRankingCriterionGradient -* MatMul -* Max -* MaxGradient -* Mean -* MeanGradient -* MergeIdLists -* Min -* MinGradient -* Moments -* MomentsGradient -* MomentumSGD -* Mul -* NanCheck -* NegateGradient -* Negative -* NumpyTile -* ONNXWhile -* OneHot -* PackRNNSequence -* PackSegments -* PadImage -* PadImageGradient -* PairWiseLoss -* PairWiseLossGradient -* PiecewiseLinearTransform -* Pow -* RecurrentNetwork -* RecurrentNetworkGradient -* ReduceBackMax -* ReduceBackMaxGradient -* ReduceBackMean -* ReduceBackMeanGradient -* ReduceFrontMax -* ReduceFrontMaxGradient -* ReduceFrontMean -* ReduceFrontMeanGradient -* ReduceFrontSum -* ReduceFrontSumGradient -* ReduceL2 -* ReduceL2Gradient -* ReduceMax -* ReduceMaxGradient -* ReduceMean -* ReduceMeanGradient -* ReduceMin -* ReduceMinGradient -* Relu -* ReluGradient -* ReluN -* ReluNGradient -* RemoveDataBlocks -* ReversePackedSegs -* RowWiseSparseAdagrad -* RowwiseMax -* RowwiseMaxGradient -* ScatterAssign -* ScatterWeightedSum -* SegmentOneHot -* Selu -* SeluGradient -* SequenceMask -* Sign -* Sinh -* SinhGradient -* SinusoidPositionEncoding -* Size -* Slice -* SliceGradient -* Softmax -* SoftmaxGradient -* SoftmaxWithLoss -* SoftmaxWithLossGradient -* SparseAdadelta -* SparseAdagrad -* SparseMomentumSGDUpdate -* SparseWngrad -* SpatialBN -* SpatialBNGradient -* SpatialSoftmaxWithLoss -* SpatialSoftmaxWithLossGradient -* Split -* SplitByLengths -* SquareRootDivide -* StringEndsWith -* StringStartsWith -* Sub -* Sum -* SumElements -* SumElementsGradient -* SumElementsInt -* SumReduceLike -* SumSqrElements -* Tan -* TanGradient -* Tanh -* TanhGradient -* ThresholdedRelu -* ThresholdedReluGradient -* Tile -* TileGradient -* TopK -* TopKGradient -* 
Transpose -* Unique -* UnpackRNNSequence -* UnpackSegments -* UpsampleBilinear -* UpsampleBilinearGradient -* WeightedSum -* WeightedSumGradient -* Where -* Wngrad -
</details> - -## Excluded from coverage statistics -### Schemaless operators -<details>
-There are 3 schemaless operators - -* C10LayerNorm_DontUseThisOpYet -* LengthsSumFakeFp16 -* SparseLengthsMax -</details>
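Tying the removed pieces above together: a minimal sketch, in Python, of the workflow the README describes. It assumes `hypothesis.strategies` is importable and that `SerializedTestCase` exposes `assertReferenceChecks` the way the operator tests it served did; the test body itself is illustrative, not taken from the tree.

```python
# Minimal sketch of README steps 1-3, assuming SerializedTestCase inherits
# assertReferenceChecks from the hypothesis test base class.
import hypothesis.strategies as st
import numpy as np

from caffe2.proto import caffe2_pb2
from caffe2.python import core
import caffe2.python.serialized_test.serialized_test_util as serialized_test_util


class TestSum(serialized_test_util.SerializedTestCase):
    # Step 2: use serialized_test_util.given instead of hypothesis.given so a
    # seeded instance runs in addition to the usual unseeded hypothesis runs.
    @serialized_test_util.given(n=st.integers(min_value=1, max_value=8))
    def test_sum(self, n):
        X = np.random.rand(n, 4).astype(np.float32)
        op = core.CreateOperator("Sum", ["X"], ["Y"])
        # Step 5: on plain runs this compares against the output serialized by
        # an earlier `-G` run; the reference function defines ground truth.
        self.assertReferenceChecks(
            device_option=caffe2_pb2.DeviceOption(),  # default CPU; an assumption
            op=op,
            inputs=[X],
            reference=lambda X: [X],  # Sum of a single input is the input itself
        )


if __name__ == "__main__":
    # Step 3: lets `python my_test.py -G` regenerate the serialized outputs.
    serialized_test_util.testWithArgs()
```

The per-test archives such a run produces (and that `coverage.py` below walks) follow the layout the README gives: an `inout.npz`, an `op.pb`, and optional `grad_#.pb` files at the zip root. A hedged sketch of inspecting one directly, where `allow_pickle` is an assumption about how the arrays were saved:

```python
import io
import zipfile

import numpy as np
from caffe2.proto import caffe2_pb2

# One of the archives removed below; all share the same layout.
path = "caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip"

with zipfile.ZipFile(path) as z:
    # Inputs, outputs, and metadata written by the seeded `-G` generation run.
    inout = np.load(io.BytesIO(z.read("inout.npz")), allow_pickle=True)
    # The serialized operator, as coverage.py parses it to count covered ops.
    op = caffe2_pb2.OperatorDef()
    op.ParseFromString(z.read("op.pb"))

print(op.type, sorted(inout.files))  # operator type plus whatever arrays were saved
```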
diff --git a/caffe2/python/serialized_test/__init__.py b/caffe2/python/serialized_test/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/serialized_test/coverage.py b/caffe2/python/serialized_test/coverage.py deleted file mode 100644 index 2014847242c4..000000000000 --- a/caffe2/python/serialized_test/coverage.py +++ /dev/null @@ -1,116 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import os -import tempfile -from zipfile import ZipFile - -''' -Generates a document in markdown format summrizing the coverage of serialized -testing. The document lives in -`caffe2/python/serialized_test/SerializedTestCoverage.md` -''' - -OpSchema = workspace.C.OpSchema - - -def gen_serialized_test_coverage(source_dir, output_dir): - (covered, not_covered, schemaless) = gen_coverage_sets(source_dir) - num_covered = len(covered) - num_not_covered = len(not_covered) - num_schemaless = len(schemaless) - total_ops = num_covered + num_not_covered - - with open(os.path.join(output_dir, 'SerializedTestCoverage.md'), 'w+') as f: - f.write('# Serialized Test Coverage Report\n') - f.write("This is an automatically generated file. Please see " - "`caffe2/python/serialized_test/README.md` for details. " - "In the case of merge conflicts, please rebase and regenerate.\n") - f.write('## Summary\n') - f.write( - 'Serialized tests have covered {}/{} ({}%) operators\n\n'.format( - num_covered, total_ops, - (int)(num_covered / total_ops * 1000) / 10)) - - f.write('## Not covered operators\n') - f.write('
<details>\n') - f.write( - 'There are {} not covered operators\n\n'.format( - num_not_covered)) - for n in sorted(not_covered): - f.write('* ' + n + '\n') - f.write('</details>
\n\n') - - f.write('## Covered operators\n') - f.write('
<details>\n') - f.write( - 'There are {} covered operators\n\n'.format( - num_covered)) - for n in sorted(covered): - f.write('* ' + n + '\n') - f.write('</details>
\n\n') - - f.write('## Excluded from coverage statistics\n') - f.write('### Schemaless operators\n') - f.write('
<details>\n') - f.write( - 'There are {} schemaless operators\n\n'.format( - num_schemaless)) - for n in sorted(schemaless): - f.write('* ' + n + '\n') - f.write('</details>
\n\n') - - -def gen_coverage_sets(source_dir): - covered_ops = gen_covered_ops(source_dir) - - not_covered_ops = set() - schemaless_ops = [] - for op_name in core._GetRegisteredOperators(): - s = OpSchema.get(op_name) - - if s is not None and s.private: - continue - if s: - if op_name not in covered_ops: - not_covered_ops.add(op_name) - else: - if op_name.find("_ENGINE_") == -1: - schemaless_ops.append(op_name) - return (covered_ops, not_covered_ops, schemaless_ops) - - -def gen_covered_ops(source_dir): - def parse_proto(x): - proto = caffe2_pb2.OperatorDef() - proto.ParseFromString(x) - return proto - - covered = set() - for f in os.listdir(source_dir): - zipfile = os.path.join(source_dir, f) - if not os.path.isfile(zipfile): - continue - temp_dir = tempfile.mkdtemp() - with ZipFile(zipfile) as z: - z.extractall(temp_dir) - op_path = os.path.join(temp_dir, 'op.pb') - with open(op_path, 'rb') as f: - loaded_op = f.read() - op_proto = parse_proto(loaded_op) - covered.add(op_proto.type) - - index = 0 - grad_path = os.path.join(temp_dir, 'grad_{}.pb'.format(index)) - while os.path.isfile(grad_path): - with open(grad_path, 'rb') as f: - loaded_grad = f.read() - grad_proto = parse_proto(loaded_grad) - covered.add(grad_proto.type) - index += 1 - grad_path = os.path.join(temp_dir, 'grad_{}.pb'.format(index)) - return covered diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip deleted file mode 100644 index 415a47d71c31..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_elu.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip deleted file mode 100644 index e4584245ab11..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_leaky_relu.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip deleted file mode 100644 index 0dc8e48877d4..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip b/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip deleted file mode 100644 index 07e439a921cf..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/activation_ops_test.test_relu_n.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip deleted file mode 100644 index 2bdb95bdaf79..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_adadelta.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip b/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip deleted file mode 100644 index adac8479290e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/adadelta_test.test_sparse_adadelta_empty.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip deleted file mode 100644 index 9326bfd4df1d..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_adagrad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip deleted file mode 100644 index 27c7db4e9a17..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_row_wise_sparse_adagrad_empty.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip deleted file mode 100644 index 8ddd81fbf44a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/adagrad_test.test_sparse_adagrad_empty.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip b/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip deleted file mode 100644 index cb08f9f86e6d..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/affine_channel_op_test.test_affine_channel_2d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip deleted file mode 100644 index 88ec3cc8dbba..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmax.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip b/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip deleted file mode 100644 index a0e1408b5a73..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/arg_ops_test.test_argmin.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip b/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip deleted file mode 100644 index 5a115b00e3b1..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/batch_box_cox_test.test_batch_box_cox.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip b/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip deleted file mode 100644 index 73717a440d95..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/batch_bucketize_op_test.test_batch_bucketize_example.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip b/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip deleted file mode 100644 index bb95ce7149a6..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/batch_moments_op_test.test_batch_moments_2d.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip b/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip deleted file mode 100644 index eaddc47759a9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/batch_sparse_to_dense_op_test.test_batch_sparse_to_dense.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip b/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip deleted file mode 100644 index f51ee2ee182b..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/bbox_transform_test.test_bbox_transform.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip deleted file mode 100644 index 668efa6e1643..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_boolean_mask.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip b/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip deleted file mode 100644 index 126920673705..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/boolean_mask_test.test_sequence_mask_with_lengths.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip b/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip deleted file mode 100644 index 166b4b1d8022..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/boolean_unmask_test.test.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip b/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip deleted file mode 100644 index ccdd2257ffc7..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/box_with_nms_limit_op_test.test_simple.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip b/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip deleted file mode 100644 index 928a74f90cec..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/ceil_op_test.test_ceil.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip deleted file mode 100644 index 2c3e35be43a1..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/channel_backprop_stats_op_test.testChannelBackpropStats.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip b/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip deleted file mode 100644 index 1c9f5abc9bd0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/channel_shuffle_test.test_channel_shuffle.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip deleted file mode 100644 index f0e22405a92f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.testChannelStats.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_2d.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_2d.zip deleted file mode 100644 index 9e1936974f10..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_2d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_3d.zip b/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_3d.zip deleted file mode 100644 index a8a237af7474..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/channel_stats_op_test.test_channel_stats_3d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip b/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip deleted file mode 100644 index d19477ff24f9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/clip_op_test.test_clip.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip b/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip deleted file mode 100644 index 016c14957582..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/clip_tensor_op_test.test_clip_tensor_by_scaling.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip b/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip deleted file mode 100644 index a1768f9b2d70..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/collect_and_distribute_fpn_rpn_proposals_op_test.test_collect_and_dist.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip deleted file mode 100644 index 709d1674d305..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_concat.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip deleted file mode 100644 index 9939456c7dbc..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip b/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip deleted file mode 100644 index 6a22bf9d5b9c..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/concat_split_op_test.test_split_by_lengths.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip b/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip deleted file mode 100644 index 2b3813c79aee..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/conditional_test.test_conditional.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip b/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip deleted file mode 100644 index 37b399e1584e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/conv_test.test_1x1_conv.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip b/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip deleted file mode 100644 index 11701717ee81..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/cosine_embedding_criterion_op_test.test_cosine_embedding_criterion.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip deleted file mode 100644 index 405ba823bcc3..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/ctc_beam_search_decoder_op_test.test_ctc_beam_search_decoder.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip b/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip deleted file mode 100644 index c6a3150b72d0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/ctc_greedy_decoder_op_test.test_ctc_greedy_decoder.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip b/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip deleted file mode 100644 index 867437401d2f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/distance_op_test.test_dot_product.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip b/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip deleted file mode 100644 index ef9530a9a11c..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/dropout_op_test.test_dropout_is_test.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip deleted file mode 100644 index a8e45abf3c8e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/elementwise_linear_op_test.test.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip deleted file mode 100644 index 02cb6d516ae6..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_is_member_of.zip and /dev/null 
differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip deleted file mode 100644 index 61e709080e5c..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/elementwise_logical_ops_test.test_where.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip b/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip deleted file mode 100644 index 407edcbba9a9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/elementwise_op_broadcast_test.test_broadcast_powt.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/erf_op_test.test_erf.zip b/caffe2/python/serialized_test/data/operator_test/erf_op_test.test_erf.zip deleted file mode 100644 index 3e50fe68deda..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/erf_op_test.test_erf.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip b/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip deleted file mode 100644 index 21b93d67f95a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/expand_op_test.test_expand_rand_shape.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip b/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip deleted file mode 100644 index 334cc694c49e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/fc_operator_test.test_fc.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip deleted file mode 100644 index 08d071b420c2..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_diagonal_fill_op_float.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip b/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip deleted file mode 100644 index 338cb152f91a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/filler_ops_test.test_lengths_range_fill.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip b/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip deleted file mode 100644 index 23cc6cdedec7..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/find_op_test.test_find.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip b/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip deleted file mode 100644 index 234c13761bd8..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/flexible_top_k_test.test_flexible_top_k.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip b/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip 
deleted file mode 100644 index 91b7a6ea038f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/floor_op_test.test_floor.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip deleted file mode 100644 index 2886b5689db0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_batch_gather_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip b/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip deleted file mode 100644 index 53885e5840d9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/gather_ops_test.test_gather_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip deleted file mode 100644 index 8396b7975691..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip b/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip deleted file mode 100644 index 012725912304..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/gather_ranges_op_test.test_gather_ranges_split.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip b/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip deleted file mode 100644 index 9fd13e82fb75..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/glu_op_test.test_glu_old.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip b/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip deleted file mode 100644 index 72fe453f766f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/group_norm_op_test.test_group_norm_2d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip b/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip deleted file mode 100644 index ce0af76aa787..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/gru_test.test_gru_unit_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip deleted file mode 100644 index 1e1eceec001f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_cosh.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip deleted file mode 100644 index e2295f316c3a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_sinh.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip b/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip deleted file mode 100644 index 87eab5db10f0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/hyperbolic_ops_test.test_tanh.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip b/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip deleted file mode 100644 index 00a30f3611f9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/index_hash_ops_test.test_index_hash_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip b/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip deleted file mode 100644 index fa9214050033..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/instance_norm_test.test_instance_norm_reference_check.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip deleted file mode 100644 index a878aedf18c9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_gradient_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip b/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip deleted file mode 100644 index 1425d94c588e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/integral_image_ops_test.test_integral_image_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip b/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip deleted file mode 100644 index e723b255e952..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/jsd_ops_test.test_bernoulli_jsd.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip b/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip deleted file mode 100644 index 0bdb30fab892..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/layer_norm_op_test.test_layer_norm_grad_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip deleted file mode 100644 index b38de79996df..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/learning_rate_adaption_op_test.test_learning_rate_adaption_op_normalization.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip b/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip deleted file mode 100644 index 0d37ccb28e3e..000000000000 
Binary files a/caffe2/python/serialized_test/data/operator_test/learning_rate_op_test.test_alter_learning_rate_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip b/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip deleted file mode 100644 index cdc2079dc277..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/length_split_op_test.test_length_split_edge.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip b/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip deleted file mode 100644 index 86d5742dd78e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/lengths_pad_op_test.test_lengths_pad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip b/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip deleted file mode 100644 index ddaa761fab5a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/lengths_tile_op_test.test_lengths_tile.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip b/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip deleted file mode 100644 index 2ef77b58d9b0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/lengths_top_k_ops_test.test_lengths_top_k_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip b/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip deleted file mode 100644 index 2de39276663a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/locally_connected_op_test.test_lc_2d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip b/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip deleted file mode 100644 index 0d9b65472803..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/loss_ops_test.test_averaged_loss.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip b/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip deleted file mode 100644 index 4c2a40e90619..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/margin_ranking_criterion_op_test.test_margin_ranking_criterion.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip b/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip deleted file mode 100644 index 6255a751d925..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/math_ops_test.test_sign.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip deleted file mode 100644 index 305f6fa75e5f..000000000000 Binary files 
a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_batch_matmul.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip deleted file mode 100644 index 3a72d02000f2..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_matmul.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip b/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip deleted file mode 100644 index bffbc66c8ea8..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/matmul_op_test.test_numpy_batch_matmul.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip b/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip deleted file mode 100644 index 2cf316c359a4..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/mean_op_test.test_mean.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip b/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip deleted file mode 100644 index 652b0dc09091..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/merge_id_lists_op_test.test_merge_id_lists_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip b/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip deleted file mode 100644 index 3ff14dcf91ec..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/moments_op_test.test_moments.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip deleted file mode 100644 index 437698665e19..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_momentum_sgd.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip b/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip deleted file mode 100644 index 3998133efd35..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/momentum_sgd_test.test_sparse_momentum_sgd.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip b/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip deleted file mode 100644 index 855579a23a35..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/negate_gradient_op_test.test_forward.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip b/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip deleted file mode 100644 index 8c09282a249a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/numpy_tile_op_test.test_numpy_tile.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip deleted file mode 100644 index 76d2671c4fbc..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_bucketized_one_hot.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip deleted file mode 100644 index d0db2a7d3384..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_batch_one_hot.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip deleted file mode 100644 index 20e95bdf8633..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_one_hot.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip b/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip deleted file mode 100644 index f6917e3c4ffa..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/one_hot_ops_test.test_segment_one_hot.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip b/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip deleted file mode 100644 index 5b29d0534549..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/onnx_while_test.test_onnx_while_fibb.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip b/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip deleted file mode 100644 index 09b2dd7e7fb9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/pack_ops_test.test_pack_with_max_length_ops.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip deleted file mode 100644 index 8438af3a9f78..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_pack_rnn_seqence.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip b/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip deleted file mode 100644 index 953e275453a5..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/pack_rnn_sequence_op_test.test_unpack_rnn_seqence.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip b/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip deleted file mode 100644 index 945f89561e9d..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/pad_test.test_crop.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip b/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip deleted file mode 100644 index 2339652431c7..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/piecewise_linear_transform_test.test_multi_predictions_params_from_arg.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip b/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip deleted file mode 100644 index 0b377f6c3176..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/rank_loss_operator_test.test_pair_wise_loss_batch.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip b/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip deleted file mode 100644 index 330663b52eda..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/recurrent_network_test.test_mul.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip deleted file mode 100644 index 3f709bb1d2db..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_max.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip deleted file mode 100644 index a291738573f4..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_back_mean.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip deleted file mode 100644 index 9f8aed3c93f2..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_max.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip deleted file mode 100644 index 85487c40f483..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_mean.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip deleted file mode 100644 index ed898d62efeb..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_front_sum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip deleted file mode 100644 index 3a3ca69a1761..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_l2.zip and /dev/null differ diff --git 
a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip deleted file mode 100644 index 934b3a548e75..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_max.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip deleted file mode 100644 index 805775c93328..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_mean.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip b/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip deleted file mode 100644 index 45e37d688b21..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduce_ops_test.test_reduce_min.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip deleted file mode 100644 index 3eb10178c78b..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_columnwise_max.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip deleted file mode 100644 index 3948ea005d79..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_int_sum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip deleted file mode 100644 index a1d513d641e8..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sqrsum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip deleted file mode 100644 index bda6b05b3850..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_elementwise_sum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip b/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip deleted file mode 100644 index bd36e749b0da..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/reduction_ops_test.test_rowwise_max.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip b/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip deleted file mode 100644 index 967d48edc540..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/selu_op_test.test_selu_1.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip 
b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip deleted file mode 100644 index 4f95c338e952..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_add_padding.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip deleted file mode 100644 index 6a3a2dec9d77..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_find_duplicate_elements.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip deleted file mode 100644 index 58c0f55039bd..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_gather_padding.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip deleted file mode 100644 index 877e7757de68..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_remove_data_blocks.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip b/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip deleted file mode 100644 index ca0d7eafa6b3..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sequence_ops_test.test_reverse_packed_segs.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip b/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip deleted file mode 100644 index 0850604cea11..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sinusoid_position_encoding_op_test.test_sinusoid_embedding.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip deleted file mode 100644 index 571fdb1ae9fb..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip deleted file mode 100644 index 440915c51a78..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_grad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip deleted file mode 100644 index 98f8809e4a2b..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_softmax_with_loss.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip 
b/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip deleted file mode 100644 index 69303ad5b1e8..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/softmax_ops_test.test_spatial_softmax_with_loss.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip deleted file mode 100644 index aff23e7502d0..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterAssign.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip b/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip deleted file mode 100644 index 6ad5bfd310e6..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/sparse_ops_test.testScatterWeightedSum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip b/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip deleted file mode 100644 index 1bcde7c346f9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/spatial_bn_op_test.test_spatialbn_test_mode_3d.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip b/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip deleted file mode 100644 index a9c2954a7a0b..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/square_root_divide_op_test.test_square_root_divide.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip deleted file mode 100644 index 6af51439bed6..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_ends_with.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip b/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip deleted file mode 100644 index cd0682f99b30..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/string_ops_test.test_string_starts_with.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip b/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip deleted file mode 100644 index 0f80f7df8897..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/thresholded_relu_op_test.test_thresholded_relu_1.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip b/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip deleted file mode 100644 index 17b064f64066..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/tile_op_test.test_tile.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip 
b/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip deleted file mode 100644 index 592bc05b1ec9..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/top_k_test.test_top_k.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip b/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip deleted file mode 100644 index 19ef2d4a3c47..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/transpose_op_test.test_transpose.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip deleted file mode 100644 index 0e93add75df7..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_acos.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip deleted file mode 100644 index 0df01759115a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_asin.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip deleted file mode 100644 index 02dd9a82bea2..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_atan.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip b/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip deleted file mode 100644 index c0de1850d99a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/trigonometric_op_test.test_tan.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip b/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip deleted file mode 100644 index ba2a4cb04578..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/unique_ops_test.test_unique_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip deleted file mode 100644 index 66b8ce7f7a68..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip b/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip deleted file mode 100644 index 2ac20c92650a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/upsample_op_test.test_upsample_grad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip deleted file mode 100644 index c5cff9946365..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max.zip and /dev/null differ diff 
--git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip deleted file mode 100644 index 37b634118275..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_max_grad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip deleted file mode 100644 index cd11c2117fe4..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip deleted file mode 100644 index f8f8f647cc3d..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_elementwise_min_grad.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip deleted file mode 100644 index 81606ea7ee6e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_gather.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip deleted file mode 100644 index df301b65b443..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_lengths_to_ranges.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip deleted file mode 100644 index 104da2e4df45..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_nan_check.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip deleted file mode 100644 index 21d46fd22d36..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_size_op.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip deleted file mode 100644 index 3e5e53c9b68e..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_slice.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_sum.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_sum.zip deleted file mode 100644 index c3dc2ca3201a..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_sum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip b/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip deleted 
file mode 100644 index adcb3f57f19f..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/utility_ops_test.test_transpose.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip b/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip deleted file mode 100644 index 7bdda94c3fdb..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/weighted_sum_test.test_weighted_sum.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip deleted file mode 100644 index 54edacc12634..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_sparse_wngrad_empty.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip b/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip deleted file mode 100644 index 92225758a1fa..000000000000 Binary files a/caffe2/python/serialized_test/data/operator_test/wngrad_test.test_wngrad_dense_base.zip and /dev/null differ diff --git a/caffe2/python/serialized_test/serialized_test_util.py b/caffe2/python/serialized_test/serialized_test_util.py deleted file mode 100644 index e2a691e0e352..000000000000 --- a/caffe2/python/serialized_test/serialized_test_util.py +++ /dev/null @@ -1,298 +0,0 @@ - - - -import inspect -import os -import shutil -import sys -import tempfile -import threading -from contextlib import contextmanager -from zipfile import ZipFile - -import argparse -import hypothesis as hy -import numpy as np - -import caffe2.python.hypothesis_test_util as hu -from caffe2.proto import caffe2_pb2 -from caffe2.python import gradient_checker -from caffe2.python.serialized_test import coverage - -operator_test_type = 'operator_test' -TOP_DIR = os.path.dirname(os.path.realpath(__file__)) -DATA_SUFFIX = 'data' -DATA_DIR = os.path.join(TOP_DIR, DATA_SUFFIX) -_output_context = threading.local() - - -def given(*given_args, **given_kwargs): - def wrapper(f): - hyp_func = hy.seed(0)(hy.settings(max_examples=1)(hy.given(*given_args, **given_kwargs)(f))) - fixed_seed_func = hy.seed(0)(hy.settings(max_examples=1)(hy.given( - *given_args, **given_kwargs)(f))) - - def func(self, *args, **kwargs): - self.should_serialize = True - fixed_seed_func(self, *args, **kwargs) - self.should_serialize = False - hyp_func(self, *args, **kwargs) - return func - return wrapper - - -def _getGradientOrNone(op_proto): - try: - grad_ops, _ = gradient_checker.getGradientForOp(op_proto) - return grad_ops - except Exception: - return [] - - -# necessary to support converting jagged lists into numpy arrays -def _transformList(l): - ret = np.empty(len(l), dtype=object) - for (i, arr) in enumerate(l): - ret[i] = arr - return ret - - -def _prepare_dir(path): - if os.path.exists(path): - shutil.rmtree(path) - os.makedirs(path) - - -class SerializedTestCase(hu.HypothesisTestCase): - - should_serialize = False - - def get_output_dir(self): - output_dir_arg = getattr(_output_context, 'output_dir', DATA_DIR) - output_dir = os.path.join( - output_dir_arg, operator_test_type) - - if os.path.exists(output_dir): - return output_dir - - # fall back to pwd - cwd = os.getcwd() - serialized_util_module_components = __name__.split('.') - serialized_util_module_components.pop() - 
serialized_dir = '/'.join(serialized_util_module_components) - output_dir_fallback = os.path.join(cwd, serialized_dir, DATA_SUFFIX) - output_dir = os.path.join( - output_dir_fallback, - operator_test_type) - - return output_dir - - def get_output_filename(self): - class_path = inspect.getfile(self.__class__) - file_name_components = os.path.basename(class_path).split('.') - test_file = file_name_components[0] - - function_name_components = self.id().split('.') - test_function = function_name_components[-1] - - return test_file + '.' + test_function - - def serialize_test(self, inputs, outputs, grad_ops, op, device_option): - output_dir = self.get_output_dir() - test_name = self.get_output_filename() - full_dir = os.path.join(output_dir, test_name) - _prepare_dir(full_dir) - - inputs = _transformList(inputs) - outputs = _transformList(outputs) - device_type = int(device_option.device_type) - - op_path = os.path.join(full_dir, 'op.pb') - grad_paths = [] - inout_path = os.path.join(full_dir, 'inout') - - with open(op_path, 'wb') as f: - f.write(op.SerializeToString()) - for (i, grad) in enumerate(grad_ops): - grad_path = os.path.join(full_dir, 'grad_{}.pb'.format(i)) - grad_paths.append(grad_path) - with open(grad_path, 'wb') as f: - f.write(grad.SerializeToString()) - - np.savez_compressed( - inout_path, - inputs=inputs, - outputs=outputs, - device_type=device_type) - - with ZipFile(os.path.join(output_dir, test_name + '.zip'), 'w') as z: - z.write(op_path, 'op.pb') - z.write(inout_path + '.npz', 'inout.npz') - for path in grad_paths: - z.write(path, os.path.basename(path)) - - shutil.rmtree(full_dir) - - def compare_test(self, inputs, outputs, grad_ops, atol=1e-7, rtol=1e-7): - - def parse_proto(x): - proto = caffe2_pb2.OperatorDef() - proto.ParseFromString(x) - return proto - - source_dir = self.get_output_dir() - test_name = self.get_output_filename() - temp_dir = tempfile.mkdtemp() - with ZipFile(os.path.join(source_dir, test_name + '.zip')) as z: - z.extractall(temp_dir) - - op_path = os.path.join(temp_dir, 'op.pb') - inout_path = os.path.join(temp_dir, 'inout.npz') - - # load serialized input and output - loaded = np.load(inout_path, encoding='bytes', allow_pickle=True) - loaded_inputs = loaded['inputs'].tolist() - inputs_equal = True - for (x, y) in zip(inputs, loaded_inputs): - if not np.array_equal(x, y): - inputs_equal = False - loaded_outputs = loaded['outputs'].tolist() - - # if inputs are not the same, run serialized input through serialized op - if not inputs_equal: - # load operator - with open(op_path, 'rb') as f: - loaded_op = f.read() - - op_proto = parse_proto(loaded_op) - device_type = loaded['device_type'] - device_option = caffe2_pb2.DeviceOption( - device_type=int(device_type)) - - outputs = hu.runOpOnInput(device_option, op_proto, loaded_inputs) - grad_ops = _getGradientOrNone(op_proto) - - # assert outputs are equal - for (x, y) in zip(outputs, loaded_outputs): - np.testing.assert_allclose(x, y, atol=atol, rtol=rtol) - - # assert gradient op is equal - for i in range(len(grad_ops)): - grad_path = os.path.join(temp_dir, 'grad_{}.pb'.format(i)) - with open(grad_path, 'rb') as f: - loaded_grad = f.read() - grad_proto = parse_proto(loaded_grad) - self._assertSameOps(grad_proto, grad_ops[i]) - - shutil.rmtree(temp_dir) - - def _assertSameOps(self, op1, op2): - op1_ = caffe2_pb2.OperatorDef() - op1_.CopyFrom(op1) - op1_.arg.sort(key=lambda arg: arg.name) - - op2_ = caffe2_pb2.OperatorDef() - op2_.CopyFrom(op2) - op2_.arg.sort(key=lambda arg: arg.name) - - 
self.assertEqual(op1_, op2_) - - def assertSerializedOperatorChecks( - self, - inputs, - outputs, - gradient_operator, - op, - device_option, - atol=1e-7, - rtol=1e-7, - ): - if self.should_serialize: - if getattr(_output_context, 'should_generate_output', False): - self.serialize_test( - inputs, outputs, gradient_operator, op, device_option) - if not getattr(_output_context, 'disable_gen_coverage', False): - coverage.gen_serialized_test_coverage( - self.get_output_dir(), TOP_DIR) - else: - self.compare_test( - inputs, outputs, gradient_operator, atol, rtol) - - def assertReferenceChecks( - self, - device_option, - op, - inputs, - reference, - input_device_options=None, - threshold=1e-4, - output_to_grad=None, - grad_reference=None, - atol=None, - outputs_to_check=None, - ensure_outputs_are_inferred=False, - ): - outs = super().assertReferenceChecks( - device_option, - op, - inputs, - reference, - input_device_options, - threshold, - output_to_grad, - grad_reference, - atol, - outputs_to_check, - ensure_outputs_are_inferred, - ) - if not getattr(_output_context, 'disable_serialized_check', False): - grad_ops = _getGradientOrNone(op) - rtol = threshold - if atol is None: - atol = threshold - self.assertSerializedOperatorChecks( - inputs, - outs, - grad_ops, - op, - device_option, - atol, - rtol, - ) - - @contextmanager - def set_disable_serialized_check(self, val: bool): - orig = getattr(_output_context, 'disable_serialized_check', False) - try: - # pyre-fixme[16]: `local` has no attribute `disable_serialized_check`. - _output_context.disable_serialized_check = val - yield - finally: - _output_context.disable_serialized_check = orig - - -def testWithArgs(): - parser = argparse.ArgumentParser() - parser.add_argument( - '-G', '--generate-serialized', action='store_true', dest='generate', - help='generate output files (default=false, compares to current files)') - parser.add_argument( - '-O', '--output', default=DATA_DIR, - help='output directory (default: %(default)s)') - parser.add_argument( - '-D', '--disable-serialized_check', action='store_true', dest='disable', - help='disable checking serialized tests') - parser.add_argument( - '-C', '--disable-gen-coverage', action='store_true', - dest='disable_coverage', - help='disable generating coverage markdown file') - parser.add_argument('unittest_args', nargs='*') - args = parser.parse_args() - sys.argv[1:] = args.unittest_args - _output_context.__setattr__('should_generate_output', args.generate) - _output_context.__setattr__('output_dir', args.output) - _output_context.__setattr__('disable_serialized_check', args.disable) - _output_context.__setattr__('disable_gen_coverage', args.disable_coverage) - - import unittest - unittest.main() diff --git a/caffe2/python/session.py b/caffe2/python/session.py deleted file mode 100644 index edc32ccf808f..000000000000 --- a/caffe2/python/session.py +++ /dev/null @@ -1,213 +0,0 @@ -## @package session -# Module caffe2.python.session - - - - - - -from caffe2.python import core, workspace -from caffe2.python.task import Cluster, Task, TaskGroup, WorkspaceType - - -class CompiledRunnable: - """ Wrapper for compiled runnable returned from session.compile() """ - def __init__(self, obj, session_class): - self.obj = obj - self.session_class = session_class - - -class Session: - """ - Allows to run Nets, ExecutionSteps, Plans, Tasks and TaskGroups. - A session can potentially run in multiple nodes concurrently. 
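Aside on caffe2/python/serialized_test/serialized_test_util.py, deleted above. Its serialize_test/compare_test pair round-trips every case through a small archive: a zip holding the serialized OperatorDef as op.pb, any gradient protos as grad_<i>.pb, and the inputs, outputs, and device_type bundled into one compressed inout.npz. Below is a minimal sketch of that on-disk format using only the standard library and NumPy; write_case and read_case are illustrative names, not caffe2 API.

import io
import zipfile
import numpy as np

def _to_object_array(seq):
    # mirrors _transformList above: object dtype tolerates jagged inputs
    out = np.empty(len(seq), dtype=object)
    for i, a in enumerate(seq):
        out[i] = a
    return out

def write_case(path, op_bytes, grad_blobs, inputs, outputs, device_type):
    buf = io.BytesIO()
    np.savez_compressed(buf,
                        inputs=_to_object_array(inputs),
                        outputs=_to_object_array(outputs),
                        device_type=device_type)
    with zipfile.ZipFile(path, 'w') as z:
        z.writestr('op.pb', op_bytes)                 # serialized OperatorDef
        for i, g in enumerate(grad_blobs):
            z.writestr('grad_{}.pb'.format(i), g)     # one proto per gradient op
        z.writestr('inout.npz', buf.getvalue())       # inputs/outputs/device

def read_case(path):
    with zipfile.ZipFile(path) as z:
        loaded = np.load(io.BytesIO(z.read('inout.npz')), allow_pickle=True)
        return z.read('op.pb'), loaded['inputs'], loaded['outputs']

Note that compare_test above only re-runs the stored op when the freshly generated inputs differ from the stored ones, which is why the archive keeps inputs alongside outputs.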
- - - Example: - from core import Net - from caffe2.python.task import Task, TaskGroup, WorkspaceType - - net = Net('test1') - net.Add([net.Const(1), net.Const(2)]) - - net2 = net.Clone() - step = core.execution_step('step1', [net2]) - - with TaskGroup(WorkspaceType.GLOBAL) as init_tg: - with Node('node1'): - n1setup = net.Net('n1setup') - n1msg = n1setup.Const('Hello from node 1.') - Task(step=n1setup) - - with TaskGroup() as private_tg: - with Node('node1'): - n1 = net.Net('n1') - n1.Print(n1msg, 0) - Task(step=n1) - with Node('node2'): - n2 = net.Net('n2') - n2.Print(n2.Const('Hello from node 2.'), 0) - Task(step=n2) - - session = LocalSession() - session.run(net) - session.run(step) - session.run(init_tg) - session.run(private_tg) - - - Global Workspace: - At the beginning of the session, a global workspace is created and kept - alive for the duration of the session. - - - Private Workspace: - Tasks can be run either directly on the global workspace, or they can - instantiate a private child workspace that is released after each run. - - Blob visibility: - Tasks running in different nodes in parallel will always run under - different workspaces, so it must be assumed that they won't be able to - access each other's blobs. Tasks running on the same node will follow - Workspace hierarchy rules: tasks running on separate private workspaces - will only be able to share blobs defined on a common parent Workspace. - """ - - _compiled_cache = {} - - def __init__(self): - self._open = True - - def is_open(self): - return self._open - - @classmethod - def compile(cls, runnable, workspace_type=None, setup_net_list=None): - if isinstance(runnable, CompiledRunnable): - assert cls == runnable.session_class, ( - 'Runnable was compiled for different session type. ' + - 'Need: %s, got: %s' % ( - cls.__name__, runnable.session_class.__name__)) - return runnable - - if runnable in cls._compiled_cache: - return cls._compiled_cache[runnable] - - if isinstance(runnable, TaskGroup): - if workspace_type: - if runnable.workspace_type(): - assert runnable.workspace_type() == workspace_type, \ - "Require {} but already have {}".format( - workspace_type, runnable.workspace_type()) - else: - runnable._workspace_type = workspace_type - tg = runnable - else: - if workspace_type is None: - workspace_type = WorkspaceType.GLOBAL - tg = TaskGroup(workspace_type=workspace_type) - if isinstance(runnable, Task): - tg.add(runnable) - elif isinstance(runnable, core.ExecutionStep): - tg.add(Task(step=runnable)) - elif isinstance(runnable, core.Plan): - # ExecutionSteps in Plan() object is supposed to run sequentially, while - # tasks in TaskGroup run in parallel. So if we have multiple - # ExecutionSteps in Plan() object, we choose to have a root - # ExecutionStep to wrap all ExecutionSteps. - assert len(runnable.Steps()) > 0 - if len(runnable.Steps()) == 1: - tg.add(Task(step=runnable.Steps()[0])) - else: - # Task takes a list of ExecutionSteps and automatically wrap into - # a root ExecutionStep - tg.add(Task(step=runnable.Steps())) - else: - step = core.execution_step('runnable', runnable) - tg.add(Task(step=step)) - compiled = CompiledRunnable( - cls._compile_task_group(tg, setup_net_list), session_class=cls) - cls._compiled_cache[runnable] = compiled - return compiled - - def run(self, runnable, workspace_type=None, setup_net_list=None): - """Run the given runnable. - - Args: - runnable: Object recognized by the Session. Currently, we support - TaskGroup, Task, Plan, ExecutionStep, and Net. 
- workspace_type: A string defined in the WorkspaceType object. - setup_net_list: A list of Net objects or a list of NetDef protos. - So far this is only used by the DistributedSession, in which we - need to pass a list of special nets to setup the master. - """ - assert self.is_open(), 'Session is closed.' - assert runnable is not None, 'Got a none runnable.' - self._run_compiled(self.compile(runnable, workspace_type, - setup_net_list).obj) - - def close(self): - if self.is_open(): - self._do_close() - self._open = False - - def fetch_output(self, output): - raise NotImplementedError() - - def _run_compiled(self, task_group): - raise NotImplementedError() - - @classmethod - def _compile_task_group(cls, task_group, setup_net_list=None): - return task_group - - def _do_close(self): - pass - - def __enter__(self): - assert self._open, 'Session already closed.' - return self - - def __exit__(self, ex_type, value, traceback): - if ex_type is None: - self.close() - - -class LocalSession(Session): - """ - Session that runs in a single node. - Tasks are all remapped to run in parallel in the 'local' node. - - Currently, LocalSession runs all parallel tasks in the same workspace, - but this behavior may change in the future. Only tasks pointing to the - same logical node are guaranteed to always run in the same workspace. - """ - def __init__(self, ws=None): - Session.__init__(self) - self._ws = ws or workspace.C.Workspace.current - - @classmethod - def _compile_task_group(cls, task_group, setup_net_list=None): - with Cluster(): - task = task_group.to_task() - plan = core.Plan('task_group_plan') - plan.AddStep(task.get_step()) - return (plan, task.output_list(), task.workspace_type()) - - def _run_compiled(self, compiled): - plan, output_list, workspace_type = compiled - - # make sure the output blobs belong to the parent workspace - outputs = [] - for name in output_list.names(): - self._ws.create_blob(str(name)) - outputs.append(core.BlobReference(str(name))) - output_list.set_values(outputs, _fetch_func=self._fetch_output) - task_ws = ( - workspace.C.Workspace(self._ws) - if workspace_type == WorkspaceType.PRIVATE else self._ws) - with workspace.WorkspaceGuard(task_ws): - task_ws.run(plan) - - def _fetch_output(self, output): - return self._ws.blobs[str(output)].fetch() diff --git a/caffe2/python/session_test.py b/caffe2/python/session_test.py deleted file mode 100644 index fa505c296820..000000000000 --- a/caffe2/python/session_test.py +++ /dev/null @@ -1,63 +0,0 @@ - - - - - -from caffe2.python.schema import ( - Struct, FetchRecord, NewRecord, FeedRecord, InitEmptyRecord) -from caffe2.python import core, workspace -from caffe2.python.session import LocalSession -from caffe2.python.dataset import Dataset -from caffe2.python.pipeline import pipe -from caffe2.python.task import TaskGroup -from caffe2.python.test_util import TestCase -import numpy as np - - -class TestLocalSession(TestCase): - def test_local_session(self): - init_net = core.Net('init') - src_values = Struct( - ('uid', np.array([1, 2, 6])), - ('value', np.array([1.4, 1.6, 1.7]))) - expected_dst = Struct( - ('uid', np.array([2, 4, 12])), - ('value', np.array([0.0, 0.0, 0.0]))) - - with core.NameScope('init'): - src_blobs = NewRecord(init_net, src_values) - dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema()) - - def proc1(rec): - net = core.Net('proc1') - with core.NameScope('proc1'): - out = NewRecord(net, rec) - net.Add([rec.uid(), rec.uid()], [out.uid()]) - out.value.set(blob=rec.value(), unsafe=True) - return [net], 
out - - def proc2(rec): - net = core.Net('proc2') - with core.NameScope('proc2'): - out = NewRecord(net, rec) - out.uid.set(blob=rec.uid(), unsafe=True) - net.Sub([rec.value(), rec.value()], [out.value()]) - return [net], out - - src_ds = Dataset(src_blobs) - dst_ds = Dataset(dst_blobs) - - with TaskGroup() as tg: - out1 = pipe(src_ds.reader(), processor=proc1) - out2 = pipe(out1, processor=proc2) - pipe(out2, dst_ds.writer()) - - ws = workspace.C.Workspace() - FeedRecord(src_blobs, src_values, ws) - session = LocalSession(ws) - session.run(init_net) - session.run(tg) - output = FetchRecord(dst_blobs, ws=ws) - - for a, b in zip(output.field_blobs(), expected_dst.field_blobs()): - np.testing.assert_array_equal(a, b) diff --git a/caffe2/python/sparse_to_dense_mask_test.py b/caffe2/python/sparse_to_dense_mask_test.py deleted file mode 100644 index 78202cc47ba4..000000000000 --- a/caffe2/python/sparse_to_dense_mask_test.py +++ /dev/null @@ -1,157 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - -import numpy as np - - -class TestSparseToDenseMask(TestCase): - - def test_sparse_to_dense_mask_float(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output'], - mask=[999999999, 2, 6]) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float64)) - workspace.FeedBlob('default', np.array(-1, dtype=np.float64)) - workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32)) - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - expected = np.array([[-1, 1, 3], [6, 7, -1]], dtype=np.float64) - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_mask_invalid_inputs(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output'], - mask=[999999999, 2], - max_skipped_indices=3) - workspace.FeedBlob( - 'indices', - np.array([2000000000000, 999999999, 2, 3, 4, 5], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array([1, 2, 3, 4, 5, 6], dtype=np.float64)) - workspace.FeedBlob('default', np.array(-1, dtype=np.float64)) - workspace.FeedBlob('lengths', np.array([6], dtype=np.int32)) - try: - workspace.RunOperatorOnce(op) - except RuntimeError: - self.fail("Exception raised with only one negative index") - - # 3 invalid inputs should throw. 
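Aside on the SparseToDenseMask expectations in this file: a pure-NumPy reference for the layout they assert. This is a sketch, not the operator's implementation; it omits max_skipped_indices handling and the optional presence mask. Here mask maps sparse ids to output columns, lengths splits the (index, value) pairs into rows, ids outside mask are dropped, and a later duplicate overwrites an earlier one.

import numpy as np

def sparse_to_dense_mask_ref(indices, values, default, lengths, mask):
    col = {m: j for j, m in enumerate(mask)}          # sparse id -> column
    values = np.asarray(values)
    out = np.full((len(lengths), len(mask)), default, dtype=values.dtype)
    start = 0
    for row, n in enumerate(lengths):
        for i, v in zip(indices[start:start + n], values[start:start + n]):
            if i in col:                # ids not in mask are skipped
                out[row, col[i]] = v    # last duplicate wins
        start += n
    return out

# reproduces the expectation in test_sparse_to_dense_mask_float above
ref = sparse_to_dense_mask_ref(
    [2, 4, 6, 1, 2, 999999999, 2],
    np.array([1, 2, 3, 4, 5, 6, 7], dtype=np.float64),
    -1, [3, 4], [999999999, 2, 6])
assert np.array_equal(ref, [[-1, 1, 3], [6, 7, -1]])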
- workspace.FeedBlob( - 'indices', - np.array([-1, 1, 2, 3, 4, 5], dtype=np.int32)) - with self.assertRaises(RuntimeError): - workspace.RunOperatorMultiple(op, 3) - - def test_sparse_to_dense_mask_subtensor(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output'], - mask=[999999999, 2, 888, 6]) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 6, 999999999, 2], dtype=np.int64)) - workspace.FeedBlob( - 'values', - np.array([[[1, -1]], [[2, -2]], [[3, -3]], [[4, -4]], [[5, -5]]], - dtype=np.float64)) - workspace.FeedBlob('default', np.array([[-1, 0]], dtype=np.float64)) - workspace.FeedBlob('lengths', np.array([2, 3], dtype=np.int32)) - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - expected = np.array([ - [[[-1, 0]], [[1, -1]], [[-1, 0]], [[-1, 0]]], - [[[4, -4]], [[5, -5]], [[-1, 0]], [[3, -3]]]], dtype=np.float64) - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_mask_string(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output'], - mask=[999999999, 2, 6]) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 6, 1, 2, 999999999, 2], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array(['1', '2', '3', '4', '5', '6', '7'], dtype='S')) - workspace.FeedBlob('default', np.array('-1', dtype='S')) - workspace.FeedBlob('lengths', np.array([3, 4], dtype=np.int32)) - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - expected =\ - np.array([['-1', '1', '3'], ['6', '7', '-1']], dtype='S') - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_mask_empty_lengths(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default'], - ['output'], - mask=[1, 2, 6]) - workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32)) - workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float64)) - workspace.FeedBlob('default', np.array(-1, dtype=np.float64)) - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - expected = np.array([-1, 1, 3], dtype=np.float64) - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_mask_no_lengths(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default'], - ['output'], - mask=[1, 2, 6]) - workspace.FeedBlob('indices', np.array([2, 4, 6], dtype=np.int32)) - workspace.FeedBlob('values', np.array([1, 2, 3], dtype=np.float64)) - workspace.FeedBlob('default', np.array(-1, dtype=np.float64)) - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - expected = np.array([-1, 1, 3], dtype=np.float64) - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_mask_presence_mask(self): - op = core.CreateOperator( - 'SparseToDenseMask', - ['indices', 'values', 'default', 'lengths'], - ['output', 'presence_mask'], - mask=[11, 12], - return_presence_mask=True) - workspace.FeedBlob('indices', np.array([11, 12, 13], dtype=np.int32)) - workspace.FeedBlob('values', np.array([11, 12, 13], dtype=np.float64)) - workspace.FeedBlob('default', np.array(-1, dtype=np.float64)) - workspace.FeedBlob('lengths', np.array([1, 2], dtype=np.int32)) - - workspace.RunOperatorOnce(op) - - output = workspace.FetchBlob('output') - 
presence_mask = workspace.FetchBlob('presence_mask') - expected_output = np.array([[11, -1], [-1, 12]], dtype=np.float64) - expected_presence_mask = np.array( - [[True, False], [False, True]], - dtype=bool) - self.assertEqual(output.shape, expected_output.shape) - np.testing.assert_array_equal(output, expected_output) - self.assertEqual(presence_mask.shape, expected_presence_mask.shape) - np.testing.assert_array_equal(presence_mask, expected_presence_mask) diff --git a/caffe2/python/sparse_to_dense_test.py b/caffe2/python/sparse_to_dense_test.py deleted file mode 100644 index dc43d2c03394..000000000000 --- a/caffe2/python/sparse_to_dense_test.py +++ /dev/null @@ -1,110 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase - -import numpy as np - - -class TestSparseToDense(TestCase): - def test_sparse_to_dense(self): - op = core.CreateOperator( - 'SparseToDense', - ['indices', 'values'], - ['output']) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 999, 2], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array([1, 2, 6, 7], dtype=np.int32)) - - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - print(output) - - expected = np.zeros(1000, dtype=np.int32) - expected[2] = 1 + 7 - expected[4] = 2 - expected[999] = 6 - - self.assertEqual(output.shape, expected.shape) - np.testing.assert_array_equal(output, expected) - - def test_sparse_to_dense_shape_inference(self): - indices = np.array([2, 4, 999, 2], dtype=np.int32) - values = np.array([[1, 2], [2, 4], [6, 7], [7, 8]], dtype=np.int32) - data_to_infer_dim = np.array(np.zeros(1500, ), dtype=np.int32) - op = core.CreateOperator( - 'SparseToDense', - ['indices', 'values', 'data_to_infer_dim'], - ['output']) - workspace.FeedBlob('indices', indices) - workspace.FeedBlob('values', values) - workspace.FeedBlob('data_to_infer_dim', data_to_infer_dim) - - net = core.Net("sparse_to_dense") - net.Proto().op.extend([op]) - shapes, types = workspace.InferShapesAndTypes( - [net], - blob_dimensions={ - "indices": indices.shape, - "values": values.shape, - "data_to_infer_dim": data_to_infer_dim.shape, - }, - blob_types={ - "indices": core.DataType.INT32, - "values": core.DataType.INT32, - "data_to_infer_dim": core.DataType.INT32, - }, - ) - assert ( - "output" in shapes and "output" in types - ), "Failed to infer the shape or type of output" - self.assertEqual(shapes["output"], [1500, 2]) - self.assertEqual(types["output"], core.DataType.INT32) - - - def test_sparse_to_dense_invalid_inputs(self): - op = core.CreateOperator( - 'SparseToDense', - ['indices', 'values'], - ['output']) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 999, 2], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array([1, 2, 6], dtype=np.int32)) - - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - def test_sparse_to_dense_with_data_to_infer_dim(self): - op = core.CreateOperator( - 'SparseToDense', - ['indices', 'values', 'data_to_infer_dim'], - ['output']) - workspace.FeedBlob( - 'indices', - np.array([2, 4, 999, 2], dtype=np.int32)) - workspace.FeedBlob( - 'values', - np.array([1, 2, 6, 7], dtype=np.int32)) - workspace.FeedBlob( - 'data_to_infer_dim', - np.array(np.zeros(1500, ), dtype=np.int32)) - - workspace.RunOperatorOnce(op) - output = workspace.FetchBlob('output') - print(output) - - expected = np.zeros(1500, dtype=np.int32) - expected[2] = 1 + 7 - expected[4] = 2 - expected[999] = 6 - - self.assertEqual(output.shape, expected.shape) - 
np.testing.assert_array_equal(output, expected) diff --git a/caffe2/python/task.py b/caffe2/python/task.py deleted file mode 100644 index 8a332de0767a..000000000000 --- a/caffe2/python/task.py +++ /dev/null @@ -1,692 +0,0 @@ -## @package task -# Module caffe2.python.task - -from caffe2.python import core, context -from caffe2.python.schema import Field, from_blob_list -from collections import defaultdict -from copy import copy - - -def _merge_node_kwargs(a, b): - # TODO(azzolini): consistency checks - if a is None: - return b - if b is None: - return a - c = copy(a) - c.update(b) - return c - - -class Cluster(context.DefaultManaged): - """ - Context that keeps track of all the node names used. - Users shouldn't have to use them directly, since a Cluster is automatically - generated at the first usage of 'Node'. - """ - - def __init__(self): - # list instead of set to keep order - self._nodes = [] - self._node_kwargs = {} - - def add_node(self, node): - if str(node) not in self._nodes: - self._nodes.append(str(node)) - self._node_kwargs[str(node)] = _merge_node_kwargs( - node.kwargs(), - self._node_kwargs.get(str(node))) - - def nodes(self): - """ - Returns the list of unique node names used within this context. - """ - return self._nodes - - def node_kwargs(self): - return self._node_kwargs - - def __repr__(self): - return "Cluster(nodes={}, node_kwargs={})".format( - self.nodes(), self.node_kwargs()) - - -class Node(context.DefaultManaged): - """ - A Node context is used to indicate that all Tasks instantiated within will - run on the given node name. (Only the name of the node actually counts.) - Example: - - with TaskGroup() as tg: - with Node('node1'): - s1 = execution_step(...) - Task(step=s1) - with Node('node2'): - s2 = execution_step(...) - with Node('node1'): - s3 = execution_step(...) - - In this example, all three execution steps will run in parallel. - Moreover, s1 and s3 will run on the same node, and can see each - others blobs. - - Additionally, a Node can be passed implementation-specific kwargs, - in order to specify properties of the node. - """ - - def __init__(self, node='local', **kwargs): - self._name = str(node) - self._kwargs = kwargs - Cluster.current().add_node(self) - - def __str__(self): - return self._name - - def __repr__(self): - return "Node(name={}, kwargs={})".format(self._name, self._kwargs) - - def kwargs(self): - return self._kwargs - - -class WorkspaceType: - """ - Determines whether tasks of a TaskGroup will run directly at the global - workspace, which is kept alive across runs, or whether a new child - workspace will be created for the run and destroyed afterwards. - """ - PRIVATE = 'private' - GLOBAL = 'global' - - -def get_setup_nets(key, steps_or_nets, target): - init_net = core.Net(key + '/init') - exit_net = core.Net(key + '/exit') - init_nets = [] - exit_nets = [] - objs = [] - for step_or_net in steps_or_nets: - if hasattr(step_or_net, 'get_all_attributes'): - objs += step_or_net.get_all_attributes(key) - elif hasattr(step_or_net, 'get_attributes'): - objs += step_or_net.get_attributes(key) - for obj in objs: - # these are needed in order to allow nesting of TaskGroup, which - # is a feature not yet implemented. 
- if hasattr(obj, '_setup_used') and obj._setup_used: - continue - if hasattr(obj, '_setup_target') and obj._setup_target != target: - continue - if hasattr(obj, 'setup'): - nets = obj.setup(init_net) - if isinstance(nets, (list, tuple)): - init_nets += nets - elif isinstance(nets, (core.Net, core.ExecutionStep)): - init_nets.append(nets) - elif nets is not None: - raise TypeError('Unsupported type for setup: %s' % type(nets)) - obj._setup_used = True - if hasattr(obj, 'exit'): - nets = obj.exit(exit_net) - if isinstance(nets, (list, tuple)): - exit_nets += nets - elif isinstance(nets, (core.Net, core.ExecutionStep)): - exit_nets.append(nets) - elif nets is not None: - raise TypeError('Unsupported type for setup: %s' % type(nets)) - obj._setup_used = True - - if len(init_net.Proto().op) > 0: - init_nets.insert(0, init_net) - if len(exit_net.Proto().op) > 0: - exit_nets.insert(0, exit_net) - return init_nets, exit_nets - - -def add_setup_steps(step, init_nets, exit_nets, name): - if not init_nets and not exit_nets: - return step - steps = [] - if init_nets: - steps.append(core.execution_step('%s:init' % name, init_nets)) - steps.append(step) - if len(exit_nets) > 0: - steps.append(core.execution_step('%s:exit' % name, exit_nets)) - return core.execution_step(name, steps) - - -class TaskGroup(context.Managed): - """ - Context that gathers tasks which will run concurrently, potentially on - multiple nodes. All tasks in the same node will share the same workspace - and thus can share blobs, while tasks running in different nodes won't - be able to directly share data. - - All tasks of the task group will start concurrently, and the task group - will finish execution when the last task of the group finishes. - - Example: - # suppose that s1 ... s5 are execution steps or nets. - with TaskGroup() as tg: - # these tasks go to default node 'local' - Task(step=s1) - Task(step=s2) - - with Node('n2'): - Task(step=s3) - with Node('n1'): - Task(step=s4) - with Node('n2'): - Task(step=s5) - - # this will run all steps in parallel. 
- # s1 and s2 will run at default node 'local' - # s3 and s5 will run at node 'n2' - # s4 will run at node 'n1' - session.run(tg) - """ - LOCAL_SETUP = 'local_setup' - - def __init__(self, workspace_type=None): - self._plan_cache = None - self._tasks = [] - self._already_used = False - self._prev_active = None - self._tasks_to_add = [] - self._report_nets = {} - self._report_steps = [] - self._workspace_type = workspace_type - self._tasks_by_node = None - self._remote_nets = [] - - def add_remote_net(self, net): - self._remote_nets.append(net) - - def remote_nets(self): - return self._remote_nets - - def add(self, task): - assert not self._already_used, ( - 'Cannot add Task to an already used TaskGroup.') - assert ( - self._workspace_type is None or - task._workspace_type is None or - self._workspace_type == task._workspace_type) - if task._workspace_type is None: - task._workspace_type = ( - self._workspace_type or WorkspaceType.PRIVATE) - if self._workspace_type is None: - self._workspace_type = task._workspace_type - task._notify_used() - self._tasks.append(task) - - def tasks(self): - for task in self._tasks_to_add: - self.add(task) - self._tasks_to_add = [] - self._already_used = True - return self._tasks - - def num_registered_tasks(self): - return len(self._tasks_to_add) + len(self._tasks) - - def used_nodes(self): - # use list to keep order - used = [] - for task in self._tasks + self._tasks_to_add: - if task.node not in used: - used.append(task.node) - return used - - def report_step(self, step=None, node=None, interval_ms=1000): - """ - Add a "report step" to this TaskGroup. This step will run repeatedly - every `interval_ms` milliseconds for the duration of the TaskGroup - execution on each of the nodes. It is guaranteed that this step - will be run at least once after every Task in the node has finished. - """ - step = core.to_execution_step(step) - step.RunEveryMillis(interval_ms) - self._report_steps.append((str(node or Node.current(node)), step)) - - def report_net(self, net=None, node=None, report_interval=5): - """ - DEPRECATED. Use report_step instead. - """ - node = str(node or Node.current(node)) - assert net is None or node not in self._report_nets - if node not in self._report_nets: - self._report_nets[node] = ( - net if net else core.Net('%s/reporter' % node), - report_interval) - return self._report_nets[node][0] - - def tasks_by_node(self, node_remap=None): - # tasks_by_node can't be called twice because the setup won't - # work properly a second time. - node_map = {} - for task in self.tasks(): - node_map[task.node] =\ - node_remap(task.node) if node_remap else task.node - if self._tasks_by_node is not None: - tasks_by_node, prev_node_map = self._tasks_by_node - assert prev_node_map == node_map, ( - 'Cannot call tasks_by_node multiple times.') - return tasks_by_node - - # now we have report_steps. 
report_net is deprecated - for node, (net, interval) in self._report_nets.items(): - self.report_step(net, node=node, interval_ms=interval * 1000) - self._report_nets = {} - - tasks_by_node = defaultdict(list) - for task in self.tasks(): - mapped_node = node_map[task.node] - tasks_by_node[mapped_node].append(task) - - report_steps_by_node = defaultdict(list) - for original_node, step in self._report_steps: - report_steps_by_node[node_map[original_node]].append(step) - - grouped_by_node = TaskGroup() - for node, tasks in tasks_by_node.items(): - report_steps = report_steps_by_node[node] - node_inits, node_exits = get_setup_nets( - TaskGroup.LOCAL_SETUP, - [t.get_step() for t in tasks] + report_steps, - self) - # shortcut for single task with no queue - steps = report_steps - outputs = [] - grouped_workspace_type = WorkspaceType.PRIVATE - for task in tasks: - step = task.get_step() - step.SetCreateWorkspace( - task.workspace_type() == WorkspaceType.PRIVATE) - if step is not None: - steps.append(step) - outputs += task.outputs() - # If any of the tasks in the node uses the global workspace, - # then set the grouped task to use the global workspace as well - if task.workspace_type() == WorkspaceType.GLOBAL: - grouped_workspace_type = WorkspaceType.GLOBAL - if len(steps) == 0: - steps.append(core.execution_step('empty', [])) - if len(steps) == 1: - step = steps[0] - else: - step = core.execution_step( - '%s:body' % node, steps, concurrent_substeps=True) - if len(node_inits) > 0 or len(node_exits) > 0: - steps = [] - if len(node_inits) > 0: - steps.append( - core.execution_step('%s:init' % node, node_inits)) - steps.append(step) - if len(node_exits) > 0: - steps.append( - core.execution_step('%s:exit' % node, node_exits)) - step = core.execution_step(node, steps) - Task( - node=node, step=step, outputs=outputs, - name='grouped_by_node', - group=grouped_by_node, workspace_type=grouped_workspace_type) - self._tasks_by_node = (grouped_by_node, node_map) - return grouped_by_node - - def to_task(self, node=None): - node = str(Node.current(node)) - tasks = self.tasks_by_node(lambda x: node).tasks() - if len(tasks) == 0: - return Task() - return tasks[0] - - def workspace_type(self): - return self._workspace_type - - def __repr__(self): - return "TaskGroup(tasks={}, workspace_type={}, remote_nets={})".format( - self._tasks + self._tasks_to_add, - self.workspace_type(), - self.remote_nets()) - - -class TaskOutput: - """ - Represents the output of a task. An output can be a blob, - a list of blob, or a record. - """ - - def __init__(self, names): - self._schema = None - self._is_scalar = False - if isinstance(names, Field): - self._schema = names - names = self._schema.field_blobs() - self._is_scalar = type(names) not in (tuple, list) - if self._is_scalar: - names = [names] - self.names = names - self._values = None - - def set(self, values, _fetch_func=None): - assert len(values) == len(self.names) - self._values = values - self._fetch_func = _fetch_func - - def get(self): - assert self._values is not None, 'Output value not set yet.' 
- if self._is_scalar: - return self._values[0] - elif self._schema: - return from_blob_list(self._schema, self._values) - else: - return self._values - - def fetch(self): - assert self._fetch_func is not None, ( - 'Cannot fetch value for this output.') - fetched_vals = [self._fetch_func(v) for v in self._values] - if self._is_scalar: - return fetched_vals[0] - elif self._schema: - return from_blob_list(self._schema, fetched_vals) - else: - return fetched_vals - - def __repr__(self): - return "TaskOutput(names={}, values={})".format(self.names, self._values) - - -def final_output(blob_or_record): - """ - Adds an output to the current Task, or if no task is active, - creates a dummy task that returns the given blob or record - to the client. This will return the value of the blob or record when - the last task of the TaskGroup for a given node finishes. - """ - cur_task = Task.current(required=False) or Task() - return cur_task.add_output(blob_or_record) - - -class TaskOutputList: - """ Keeps a list of outputs for a task """ - def __init__(self, outputs=None): - self.outputs = outputs or [] - - def names(self): - """ - Retrieve the output names. - TODO(azzolini): make this schema-based. - """ - names = [] - for o in self.outputs: - names += o.names - return names - - def set_values(self, values, _fetch_func=None): - offset = 0 - for o in self.outputs: - num = len(o.names) - o.set(values[offset:offset + num], _fetch_func) - offset += num - assert offset == len(values), 'Wrong number of output values.' - - def __repr__(self): - return "TaskOutputList(outputs={})".format(self.outputs) - - -class Task(context.Managed): - """ - A Task is composed of an execution step and zero or more outputs. - Tasks are executed in the context of a TaskGroup, which, in turn, can - be run by a Session. - - Task outputs are fetched by the session at the end of the run. - - The recommended way of creating a task is by using `net_builder.ops`. - Example: - - from net_builder import ops - with Node('trainer'), Task(name='my_task', num_instances=2): - with ops.task_init(): - globl = ops.Const(0) - with ops.task_instance_init(): - local = ops.Const(0) - with ops.loop(100): - ops.Copy(globl, local) - with ops.task_instance_exit(): - ops.Add([globl, local], [globl]) - with ops.task_exit(): - ops.Mul([globl, globl], [globl]) - - The task above will create 2 instances that will run in parallel. - Each instance will copy `local` to `globl` 100 times, then Add `local` - to `globl` once. The `Mul` will only execute once, after all the instances - of the task have finished. - """ - - # TASK_SETUP runs once per task, before/after all - # concurrent task instances start/finish. - TASK_SETUP = 'task_setup' - # Setup will run once for each instance of the task. - TASK_INSTANCE_SETUP = 'task_instance_setup' - REPORT_STEP = 'report_step' - _global_names_used = set() - - @staticmethod - def _get_next_name(node, group, name): - basename = str(node) + '/' + str(name) - names_used = ( - Task._global_names_used - if group is None else - set(t.name for t in group._tasks_to_add)) - cur_name = basename - i = 0 - while cur_name in names_used: - i += 1 - cur_name = '%s:%d' % (basename, i) - return cur_name - - def __init__( - self, step=None, outputs=None, - workspace_type=None, group=None, node=None, name=None, - num_instances=None): - """ - Instantiate a Task and add it to the current TaskGroup and Node. - - Args: - step: If provided, this task will run this ExecutionStep.
- outputs: If provided, the task will return the provided outputs - to the client at completion time. - node: If provided, force task execution on the given node. - name: Name of the Task. - num_instances: If provided, this task will be cloned num_instances - times at runtime, and all instances will run - concurrently. - """ - if not name and isinstance(step, core.ExecutionStep): - name = step.Proto().name - if not name: - name = 'task' - # register this node name with active context - self.node = str(Node.current(None if node is None else Node(node))) - self.group = TaskGroup.current(group, required=False) - - self.name = Task._get_next_name(self.node, self.group, name) - - # may need to be temporarily removed later if Task used as a context - if self.group is not None: - self.group._tasks_to_add.append(self) - - self._already_used = False - self._step = None - self._step_with_setup = None - self._outputs = [] - if step is not None: - self.set_step(step) - if outputs is not None: - self.add_outputs(outputs) - - self._pipeline = None - self._is_pipeline_context = False - self._workspace_type = workspace_type - self._report_net = None - self._num_instances = num_instances - - def __enter__(self): - super().__enter__() - - # temporarily remove from _tasks_to_add to ensure correct order - if self.group is not None: - self.group._tasks_to_add.remove(self) - self._assert_not_used() - assert self._step is None, 'This Task already has an execution step.' - from caffe2.python import net_builder - self._net_builder = net_builder.NetBuilder(_fullname=self.name) - self._net_builder.__enter__() - return self - - def __exit__(self, type, value, traceback): - super().__exit__(type, value, traceback) - - self._net_builder.__exit__(type, value, traceback) - if type is None: - self.set_step(self._net_builder) - if self.group is not None: - self.group._tasks_to_add.append(self) - self._net_builder = None - - def workspace_type(self): - return self._workspace_type - - def _assert_not_used(self): - assert not self._already_used, ( - 'Cannot modify task since it is already been used.') - - def add_output(self, output): - self._assert_not_used() - output = ( - output if isinstance(output, TaskOutput) else TaskOutput(output)) - self._outputs.append(output) - return output - - def add_outputs(self, outputs): - self._assert_not_used() - if type(outputs) not in (list, tuple): - return self.add_output(outputs) - else: - return [self.add_output(output) for output in outputs] - - def set_step(self, step): - self._assert_not_used() - self._step = core.to_execution_step(step) - - def get_step(self): - if self._step_with_setup is not None: - return self._step_with_setup - - if self._step is None: - self._step_with_setup = core.execution_step(self.name, []) - return self._step_with_setup - - report_steps = [ - s - for s in self._step.get_all_attributes(Task.REPORT_STEP) - if not hasattr(s, '_report_step_used') - ] - for step in report_steps: - step._report_step_used = True - if not step.Proto().run_every_ms: - step.RunEveryMillis(1000) - task_init_nets, task_exit_nets = get_setup_nets( - Task.TASK_SETUP, [self._step] + report_steps, self) - instance_init_nets, instance_exit_nets = get_setup_nets( - Task.TASK_INSTANCE_SETUP, [self._step] + report_steps, self) - if len(self._outputs) == 0: - output_net = core.Net('%s:output' % self.name) - self.add_output(output_net.ConstantFill( - [], 1, dtype=core.DataType.INT32, value=0)) - task_exit_nets.append(output_net) - - # Add instance-level report steps - body = self._step if not 
report_steps else core.execution_step( - '%s:body' % self.name, report_steps + [self._step]) - # Enclose with instance-level (thread-local) setup nets - step_with_instance_setup = add_setup_steps( - body, instance_init_nets, instance_exit_nets, - self.name + ':instance') - # Set up runtime concurrent instances - if self._num_instances and self._num_instances > 1: - step_with_instance_setup.SetCreateWorkspace(True) - step_with_instance_setup = core.execution_step( - '%s:parallel', - [step_with_instance_setup], - num_concurrent_instances=self._num_instances) - # Enclose with task-level setup nets - self._step_with_setup = add_setup_steps( - step_with_instance_setup, task_init_nets, task_exit_nets, self.name) - - return self._step_with_setup - - def output_list(self): - return TaskOutputList(self._outputs) - - def outputs(self): - return self._outputs - - def _notify_used(self): - self.get_step() - self._already_used = True - - def __repr__(self): - return "Task(name={}, node={}, outputs={})".format( - self.name, self.node, self.outputs()) - - -class SetupNets: - """ - Allow to register a list of nets to be run at initialization - and finalization of Tasks or TaskGroups. - For example, let's say you have the following: - - init_net = core.Net('init') - my_val = init_net.ConstantFill([], 'my_val', value=0) - - net = core.Net('counter') - net.Add([my_val, net.Const(1),], [my_val]) - - with TaskGroup() as task_group: - with Node('trainer'): - my_task = Task(step=[net]) - - In order to have `init_net` run once before `net` runs for the - first time, you can do one of the following: - - net.add_attribute(Task.TASK_SETUP, SetupNets([init_net])) - - or - - net.add_attribute(TaskGroup.LOCAL_SETUP, SetupNets([init_net])) - - - With Task.TASK_SETUP, init_net will run once at my_task startup. - - With TaskGroup.LOCAL_SETUP, init_net will run once on node 'trainer', - before any task of the task group is run on that node. - - The same SetupNets object can be added to multiple nets. It will only - run once per Task/TaskGroup run. 
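The SetupNets docstring spells out the init/exit pattern; gathered into a single hedged sketch (again assuming a Session runs the group later):

from caffe2.python import core
from caffe2.python.task import Node, SetupNets, Task, TaskGroup

init_net = core.Net('init')
my_val = init_net.ConstantFill([], 'my_val', value=0.0)

net = core.Net('counter')
net.Add([my_val, net.Const(1.0)], [my_val])

# Run init_net once at task startup, before `net` executes.
net.add_attribute(Task.TASK_SETUP, SetupNets([init_net]))

with TaskGroup() as tg:
    with Node('trainer'):
        Task(step=net)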
- """ - - def __init__(self, init_nets=None, exit_nets=None): - self.init_nets = init_nets - self.exit_nets = exit_nets - - def setup(self, init_net): - return self.init_nets - - def exit(self, exit_net): - return self.exit_nets - - def __repr__(self): - return "SetupNets(init_nets={}, exit_nets={})".format( - self.init_nets, self.exit_nets) diff --git a/caffe2/python/task_test.py b/caffe2/python/task_test.py deleted file mode 100644 index 31adb41a0ac9..000000000000 --- a/caffe2/python/task_test.py +++ /dev/null @@ -1,24 +0,0 @@ -import unittest -from caffe2.python import task - - -class TestTask(unittest.TestCase): - def testRepr(self): - cases = [ - (task.Cluster(), "Cluster(nodes=[], node_kwargs={})"), - (task.Node(), "Node(name=local, kwargs={})"), - ( - task.TaskGroup(), - "TaskGroup(tasks=[], workspace_type=None, remote_nets=[])", - ), - (task.TaskOutput([]), "TaskOutput(names=[], values=None)"), - (task.Task(), "Task(name=local/task, node=local, outputs=[])"), - (task.SetupNets(), "SetupNets(init_nets=None, exit_nets=None)"), - ] - for obj, want in cases: - self.assertEqual(obj.__repr__(), want) - - def testEffectlessRepr(self): - task_group = task.TaskGroup() - _repr = task_group.__repr__() - self.assertFalse(task_group._already_used) diff --git a/caffe2/python/test/__init__.py b/caffe2/python/test/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/test/blob_deallocation_test.py b/caffe2/python/test/blob_deallocation_test.py deleted file mode 100644 index 37886618ef45..000000000000 --- a/caffe2/python/test/blob_deallocation_test.py +++ /dev/null @@ -1,28 +0,0 @@ - - - - -from caffe2.python import core, workspace -import unittest - -core.GlobalInit(['python']) - - -class BlobDeallocationTest(unittest.TestCase): - def test(self): - net = core.Net('net') - - x = net.GivenTensorStringFill([], ['x'], shape=[3], values=['a', 'b', 'c']) - y = net.GivenTensorStringFill([], ['y'], shape=[3], values=['d', 'e', 'f']) - net.Concat([x, y], ['concated', '_'], axis=0) - - workspace.ResetWorkspace() - workspace.RunNetOnce(net) - - workspace.ResetWorkspace() - workspace.RunNetOnce(net) - self.assertTrue(True) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/test/do_op_test.py b/caffe2/python/test/do_op_test.py deleted file mode 100644 index fcc6918d5350..000000000000 --- a/caffe2/python/test/do_op_test.py +++ /dev/null @@ -1,77 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test_util import TestCase -import numpy as np -import unittest - - -class DoOpTest(TestCase): - def test_operator(self): - def make_net(): - subnet = core.Net('subnet') - subnet.Add(["X", "Y"], "Z") - - net = core.Net("net") - net.CreateScope([], "W") - - net.Do( - ["outer_X", "outer_Y", "W"], - ["outer_Z", "W"], - net=subnet.Proto(), - inner_blobs=["X", "Y", "Z"], - outer_blobs_idx=[0, 1, 2], - ) - - return net - - net = make_net() - - workspace.ResetWorkspace() - workspace.FeedBlob("outer_X", np.asarray([1, 2])) - workspace.FeedBlob("outer_Y", np.asarray([3, 4])) - - workspace.RunNetOnce(net) - outer_Z_val = workspace.FetchBlob("outer_Z") - self.assertTrue(np.all(outer_Z_val == np.asarray([4, 6]))) - - def test_reuse_workspace(self): - def make_net(): - param_init_subnet = core.Net('param_init_subnet') - param_init_subnet.ConstantFill([], "X", shape=[1], value=1) - param_init_subnet.ConstantFill([], "Y", shape=[1], value=2) - - subnet = core.Net("subnet") - subnet.Add(["X", "Y"], "Z") - - net = core.Net("net") - 
net.CreateScope([], "W") - net.Do( - "W", "W", - net=param_init_subnet.Proto(), - inner_blobs=[], - outer_blobs_idx=[], - ) - - net.Do( - "W", ["outer_Z", "W"], - net=subnet.Proto(), - inner_blobs=["Z"], - outer_blobs_idx=[0], - reuse_workspace=True, - ) - - return net - - net = make_net() - - workspace.ResetWorkspace() - workspace.RunNetOnce(net) - outer_Z_val = workspace.FetchBlob("outer_Z") - self.assertTrue(np.all(outer_Z_val == np.asarray([3]))) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/test/executor_test.py b/caffe2/python/test/executor_test.py deleted file mode 100644 index b4db64005f62..000000000000 --- a/caffe2/python/test/executor_test.py +++ /dev/null @@ -1,103 +0,0 @@ - - - - -from caffe2.python import core, workspace -from caffe2.python.test.executor_test_util import ( - build_conv_model, - build_resnet50_dataparallel_model, - run_resnet50_epoch, - ExecutorTestBase, - executor_test_settings, - executor_test_model_names) - -from caffe2.python.test_util import TestCase - -from hypothesis import given -import hypothesis.strategies as st - -import unittest - - -EXECUTORS = ["parallel", "async_scheduling"] -ITERATIONS = 1 - - -class ExecutorCPUConvNetTest(ExecutorTestBase): - @given(executor=st.sampled_from(EXECUTORS), - model_name=st.sampled_from(executor_test_model_names()), - batch_size=st.sampled_from([1]), - num_workers=st.sampled_from([8])) - @executor_test_settings - def test_executor(self, executor, model_name, batch_size, num_workers): - model = build_conv_model(model_name, batch_size) - model.Proto().num_workers = num_workers - - def run_model(): - iterations = ITERATIONS - if model_name == "MLP": - iterations = 1 # avoid numeric instability with MLP gradients - workspace.RunNet(model.net, iterations) - - self.compare_executors( - model, - ref_executor="simple", - test_executor=executor, - model_run_func=run_model, - ) - - -@unittest.skipIf(not workspace.has_gpu_support, "no gpu") -class ExecutorGPUResNetTest(ExecutorTestBase): - @given(executor=st.sampled_from(EXECUTORS), - num_workers=st.sampled_from([8])) - @executor_test_settings - def test_executor(self, executor, num_workers): - model = build_resnet50_dataparallel_model( - num_gpus=workspace.NumGpuDevices(), batch_size=8, epoch_size=8) - model.Proto().num_workers = num_workers - - def run_model(): - run_resnet50_epoch(model, batch_size=8, epoch_size=8) - - self.compare_executors( - model, - ref_executor="simple", - test_executor=executor, - model_run_func=run_model, - ) - - -class ExecutorFailingOpTest(TestCase): - def test_failing_op(self): - def create_failing_net(throw_exception): - net = core.Net("failing_net") - if throw_exception: - net.ThrowException([], []) - else: - net.Fail([], []) - net.Proto().type = "async_scheduling" - return net - - workspace.ResetWorkspace() - net = create_failing_net(throw_exception=True) - workspace.CreateNet(net) - with self.assertRaises(RuntimeError): - workspace.RunNet(net) - - with self.assertRaises(RuntimeError): - workspace.RunNet(net, allow_fail=True) - - workspace.ResetWorkspace() - net = create_failing_net(throw_exception=False) - workspace.CreateNet(net) - - with self.assertRaises(RuntimeError): - workspace.RunNet(net) - - res = workspace.RunNet(net, allow_fail=True) - self.assertFalse(res) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/test/executor_test_util.py b/caffe2/python/test/executor_test_util.py deleted file mode 100644 index abf63626a7fa..000000000000 --- a/caffe2/python/test/executor_test_util.py 
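The executor tests above select an executor per net by rewriting the proto's type field and then re-creating the net. A minimal sketch of that switch, using a trivial single-op net:

from caffe2.python import core, workspace

net = core.Net('exec_demo')
net.ConstantFill([], ['x'], shape=[2, 2], value=1.0)

# 'simple' is the reference executor; the deleted tests compare it against
# 'parallel' and 'async_scheduling'.
net.Proto().type = 'async_scheduling'
net.Proto().num_workers = 8

workspace.ResetWorkspace()
workspace.CreateNet(net)
workspace.RunNet(net)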
+++ /dev/null @@ -1,267 +0,0 @@ - - - - - -from caffe2.python import ( - brew, cnn, core, workspace, data_parallel_model, - timeout_guard, model_helper, optimizer) -from caffe2.python.test_util import TestCase -import caffe2.python.models.resnet as resnet -from caffe2.python.modeling.initializers import Initializer -from caffe2.python import convnet_benchmarks as cb -from caffe2.python import hypothesis_test_util as hu - -import time -import numpy as np - - -CI_MAX_EXAMPLES = 2 -CI_TIMEOUT = 600 - - -def executor_test_settings(func): - if hu.is_sandcastle() or hu.is_travis(): - return hu.settings( - max_examples=CI_MAX_EXAMPLES, - deadline=CI_TIMEOUT * 1000 # deadline is in ms - )(func) - else: - return func - - -def gen_test_resnet50(_order, _cudnn_ws): - model = cnn.CNNModelHelper( - order="NCHW", - name="resnet_50_test", - cudnn_exhaustive_search=True, - ) - data = model.net.AddExternalInput("data") - label = model.net.AddExternalInput("label") - (_softmax, loss) = resnet.create_resnet50( - model, - data, - num_input_channels=3, - num_labels=1000, - label=label, - is_test=False, - ) - return model, 227 - - -def conv_model_generators(): - return { - 'AlexNet': cb.AlexNet, - 'OverFeat': cb.OverFeat, - 'VGGA': cb.VGGA, - 'Inception': cb.Inception, - 'MLP': cb.MLP, - 'Resnet50': gen_test_resnet50, - } - - -def executor_test_model_names(): - if hu.is_sandcastle() or hu.is_travis(): - return ["MLP"] - else: - return sorted(conv_model_generators().keys()) - - -def build_conv_model(model_name, batch_size): - model_gen_map = conv_model_generators() - assert model_name in model_gen_map, "Model " + model_name + " not found" - model, input_size = model_gen_map[model_name]("NCHW", None) - - input_shape = [batch_size, 3, input_size, input_size] - if model_name == "MLP": - input_shape = [batch_size, input_size] - - model.param_init_net.GaussianFill( - [], - "data", - shape=input_shape, - mean=0.0, - std=1.0 - ) - model.param_init_net.UniformIntFill( - [], - "label", - shape=[batch_size, ], - min=0, - max=999 - ) - - model.AddGradientOperators(["loss"]) - - ITER = brew.iter(model, "iter") - LR = model.net.LearningRate( - ITER, "LR", base_lr=-1e-8, policy="step", stepsize=10000, gamma=0.999) - ONE = model.param_init_net.ConstantFill([], "ONE", shape=[1], value=1.0) - for param in model.params: - param_grad = model.param_to_grad[param] - model.net.WeightedSum([param, ONE, param_grad, LR], param) - - return model - - -def build_resnet50_dataparallel_model( - num_gpus, - batch_size, - epoch_size, - cudnn_workspace_limit_mb=64, - num_channels=3, - num_labels=1000, - weight_decay=1e-4, - base_learning_rate=0.1, - image_size=227, - use_cpu=False): - - batch_per_device = batch_size // num_gpus - - train_arg_scope = { - 'order': 'NCHW', - 'use_cudnn': True, - 'cudnn_exhaustive_search': False, - 'ws_nbytes_limit': (cudnn_workspace_limit_mb * 1024 * 1024), - 'deterministic': True, - } - train_model = model_helper.ModelHelper( - name="test_resnet50", arg_scope=train_arg_scope - ) - - def create_resnet50_model_ops(model, loss_scale): - with brew.arg_scope([brew.conv, brew.fc], - WeightInitializer=Initializer, - BiasInitializer=Initializer, - enable_tensor_core=0): - pred = resnet.create_resnet50( - model, - "data", - num_input_channels=num_channels, - num_labels=num_labels, - no_bias=True, - no_loss=True, - ) - - softmax, loss = model.SoftmaxWithLoss([pred, 'label'], - ['softmax', 'loss']) - loss = model.Scale(loss, scale=loss_scale) - brew.accuracy(model, [softmax, "label"], "accuracy") - return [loss] - - def 
add_optimizer(model): - stepsz = int(30 * epoch_size / batch_size) - optimizer.add_weight_decay(model, weight_decay) - opt = optimizer.build_multi_precision_sgd( - model, - base_learning_rate, - momentum=0.9, - nesterov=1, - policy="step", - stepsize=stepsz, - gamma=0.1 - ) - return opt - - def add_image_input(model): - model.param_init_net.GaussianFill( - [], - ["data"], - shape=[batch_per_device, 3, image_size, image_size], - dtype='float', - ) - model.param_init_net.ConstantFill( - [], - ["label"], - shape=[batch_per_device], - value=1, - dtype=core.DataType.INT32, - ) - - def add_post_sync_ops(model): - for param_info in model.GetOptimizationParamInfo(model.GetParams()): - if param_info.blob_copy is not None: - model.param_init_net.HalfToFloat( - param_info.blob, - param_info.blob_copy[core.DataType.FLOAT]) - - # Create parallelized model - data_parallel_model.Parallelize( - train_model, - input_builder_fun=add_image_input, - forward_pass_builder_fun=create_resnet50_model_ops, - optimizer_builder_fun=add_optimizer, - post_sync_builder_fun=add_post_sync_ops, - devices=list(range(num_gpus)), - rendezvous=None, - optimize_gradient_memory=True, - cpu_device=use_cpu, - shared_model=use_cpu, - ) - - return train_model - - -def run_resnet50_epoch(train_model, batch_size, epoch_size, skip_first_n_iter=0): - epoch_iters = int(epoch_size / batch_size) - prefix = "{}_{}".format( - train_model._device_prefix, - train_model._devices[0]) - train_time = 0.0 - train_examples = 0 - for i in range(epoch_iters): - timeout = 600.0 if i == 0 else 60.0 - with timeout_guard.CompleteInTimeOrDie(timeout): - t1 = time.time() - workspace.RunNet(train_model.net.Proto().name) - t2 = time.time() - dt = t2 - t1 - if i >= skip_first_n_iter: - train_time += dt - train_examples += batch_size - - fmt = "Finished iteration {}/{} ({:.2f} images/sec)" - print(fmt.format(i + 1, epoch_iters, batch_size / dt)) - - accuracy = workspace.FetchBlob(prefix + '/accuracy') - loss = workspace.FetchBlob(prefix + '/loss') - - assert loss < 40, "Exploded gradients" - - return ( - train_examples, - train_time, - accuracy, loss) - - -class ExecutorTestBase(TestCase): - def compare_executors(self, model, ref_executor, test_executor, model_run_func): - model.Proto().type = ref_executor - model.param_init_net.set_rand_seed(seed=0xCAFFE2) - model.net.set_rand_seed(seed=0xCAFFE2) - - workspace.ResetWorkspace() - workspace.RunNetOnce(model.param_init_net) - - workspace.CreateNet(model.net) - model_run_func() - ref_ws = {str(k): workspace.FetchBlob(k) for k in workspace.Blobs()} - ref_ws = {k: v for k, v in ref_ws.items() if type(v) is np.ndarray} - - workspace.ResetWorkspace() - workspace.RunNetOnce(model.param_init_net) - - model.Proto().type = test_executor - workspace.CreateNet(model.net, overwrite=True) - model_run_func() - test_ws = {str(k): workspace.FetchBlob(k) for k in workspace.Blobs()} - test_ws = {k: v for k, v in test_ws.items() if type(v) is np.ndarray} - - for blob_name, ref_val in ref_ws.items(): - self.assertTrue( - blob_name in test_ws, - "Blob {} not found in {} run".format(blob_name, test_executor)) - val = test_ws[blob_name] - np.testing.assert_array_equal( - val, ref_val, - "Blob {} differs in {} run".format(blob_name, test_executor)) diff --git a/caffe2/python/test/fakefp16_transform_test.py b/caffe2/python/test/fakefp16_transform_test.py deleted file mode 100644 index f98342eba54a..000000000000 --- a/caffe2/python/test/fakefp16_transform_test.py +++ /dev/null @@ -1,24 +0,0 @@ - - - - -import unittest -from 
caffe2.python.fakefp16_transform_lib import fakeFp16FuseOps -from caffe2.python import core - -class Transformer(unittest.TestCase): - def test_fuse(self): - net_swish = core.Net("test_swish") - net_swish_init = core.Net("test_swish_init") - - deq = core.CreateOperator("Int8DequantizeNNPI", ["Xq"], ["X"]) - swish = core.CreateOperator("SwishFakeFp16NNPI", ["X"], ["Y"]) - quant = core.CreateOperator("Int8QuantizeNNPI", ["Y"], ["Y_q"]) - net_swish.Proto().op.extend( - [ - deq, swish, quant - ] - ) - print(net_swish.Proto()) - out_net = fakeFp16FuseOps(net_swish.Proto()) - assert(len(out_net.op) == 1) diff --git a/caffe2/python/test/gpu_context_test.py b/caffe2/python/test/gpu_context_test.py deleted file mode 100644 index 9ee8a308cc2e..000000000000 --- a/caffe2/python/test/gpu_context_test.py +++ /dev/null @@ -1,31 +0,0 @@ - - - - - -import unittest - -import torch -from caffe2.python import core, workspace - -# This is a standalone test that doesn't use test_util as we're testing -# initialization and thus we should be the ones calling GlobalInit -@unittest.skipIf(not workspace.has_cuda_support, - "THC pool testing is obscure and doesn't work on HIP yet") -class TestGPUInit(unittest.TestCase): - def testTHCAllocator(self): - cuda_or_hip = 'hip' if workspace.has_hip_support else 'cuda' - flag = '--caffe2_{}_memory_pool=thc'.format(cuda_or_hip) - core.GlobalInit(['caffe2', flag]) - # just run one operator - # it's important to not call anything here from Torch API - # even torch.cuda.memory_allocated would initialize CUDA context - workspace.RunOperatorOnce(core.CreateOperator( - 'ConstantFill', [], ["x"], shape=[5, 5], value=1.0, - device_option=core.DeviceOption(workspace.GpuDeviceType) - )) - # make sure we actually used THC allocator - self.assertGreater(torch.cuda.memory_allocated(), 0) - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/test/inference_lstm_op_test.py b/caffe2/python/test/inference_lstm_op_test.py deleted file mode 100644 index 768827bd8876..000000000000 --- a/caffe2/python/test/inference_lstm_op_test.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 - -import hypothesis.strategies as st -import numpy as np -import torch -from caffe2.python import core -from caffe2.python.test_util import TestCase -from hypothesis import given, settings -from torch import nn - - -class TestC2LSTM(TestCase): - @given( - bsz=st.integers(1, 5), - seq_lens=st.integers(1, 6), - emb_lens=st.integers(5, 10), - hidden_size=st.integers(3, 7), - num_layers=st.integers(1, 4), - has_biases=st.booleans(), - is_bidirectional=st.booleans(), - batch_first=st.booleans(), - ) - @settings(deadline=10000) - def test_c2_lstm( - self, - bsz, - seq_lens, - emb_lens, - hidden_size, - num_layers, - has_biases, - is_bidirectional, - batch_first, - ): - net = core.Net("test_net") - num_directions = 2 if is_bidirectional else 1 - py_lstm = nn.LSTM( - emb_lens, - hidden_size, - batch_first=batch_first, - bidirectional=is_bidirectional, - bias=has_biases, - num_layers=num_layers, - ) - - hx = np.zeros((num_layers * num_directions, bsz, hidden_size), dtype=np.float32) - - if batch_first: - inputs = np.random.randn(bsz, seq_lens, emb_lens).astype(np.float32) - else: - inputs = np.random.randn(seq_lens, bsz, emb_lens).astype(np.float32) - - py_results = py_lstm(torch.from_numpy(inputs)) - lstm_in = [ - torch.from_numpy(inputs), - torch.from_numpy(hx), - torch.from_numpy(hx), - ] + [param.detach() for param in py_lstm._flat_weights] - - c2_results = torch.ops._caffe2.InferenceLSTM(
lstm_in, num_layers, has_biases, batch_first, is_bidirectional - ) - - np.testing.assert_array_almost_equal( - py_results[0].detach().numpy(), c2_results[0].detach().numpy() - ) - np.testing.assert_array_almost_equal( - py_results[1][0].detach().numpy(), c2_results[1].detach().numpy() - ) - np.testing.assert_array_almost_equal( - py_results[1][1].detach().numpy(), c2_results[2].detach().numpy() - ) diff --git a/caffe2/python/test/net_name_test.py b/caffe2/python/test/net_name_test.py deleted file mode 100644 index 6d44e07014d4..000000000000 --- a/caffe2/python/test/net_name_test.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -from unittest.mock import patch - -from caffe2.python.test_util import TestCase - - -class NetNameTest(TestCase): - @patch("caffe2.python.core.Net.current_prefix", return_value="prefix") - def test_net_name(self, _current_prefix): - from caffe2.python.core import Net - - self.assertEqual(Net._get_next_net_name("test"), "prefix/test") - self.assertEqual(Net._get_next_net_name("test"), "prefix/test_1") - self.assertEqual(Net._get_next_net_name("test_1_2"), "prefix/test_1_2") - self.assertEqual(Net._get_next_net_name("test_1"), "prefix/test_1_1") - self.assertEqual(Net._get_next_net_name("test_1"), "prefix/test_1_3") diff --git a/caffe2/python/test/python_protobuf_test.py b/caffe2/python/test/python_protobuf_test.py deleted file mode 100644 index a407f33fe253..000000000000 --- a/caffe2/python/test/python_protobuf_test.py +++ /dev/null @@ -1,21 +0,0 @@ - - - - -# make sure we use cpp implementation of protobuf -import os -os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp" -# then import protobuf -from caffe2.proto import caffe2_pb2, metanet_pb2 - -import unittest - - -class TestCrossProtoCalls(unittest.TestCase): - def testSimple(self): - net = caffe2_pb2.NetDef() - meta = metanet_pb2.MetaNetDef() - # if metanet_pb2 wasn't initialized properly the following fails with a - # cryptic message: "Parameter to MergeFrom() must be instance of same - # class: expected caffe2.NetDef got caffe2.NetDef." 
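The protobuf test above guards an import-order subtlety: the C++ implementation switch only takes effect if the environment variable is set before any protobuf-generated module is imported. A minimal sketch of the required ordering:

import os

# Must be set before the first import of a generated module.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "cpp"

from caffe2.proto import caffe2_pb2

net = caffe2_pb2.NetDef()
net.name = 'example'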
- meta.nets.add(key="foo", value=net) diff --git a/caffe2/python/test_util.py b/caffe2/python/test_util.py deleted file mode 100644 index 6b0573ccd81f..000000000000 --- a/caffe2/python/test_util.py +++ /dev/null @@ -1,90 +0,0 @@ -## @package test_util -# Module caffe2.python.test_util - - - - -import numpy as np -from caffe2.python import core, workspace - -import os -import pathlib -import shutil -import tempfile -import unittest -from typing import Any, Callable, Tuple, Type -from types import TracebackType - - -def rand_array(*dims): - # np.random.rand() returns float instead of 0-dim array, that's why need to - # do some tricks - return np.array(np.random.rand(*dims) - 0.5).astype(np.float32) - - -def randBlob(name, type, *dims, **kwargs): - offset = kwargs['offset'] if 'offset' in kwargs else 0.0 - workspace.FeedBlob(name, np.random.rand(*dims).astype(type) + offset) - - -def randBlobFloat32(name, *dims, **kwargs): - randBlob(name, np.float32, *dims, **kwargs) - - -def randBlobsFloat32(names, *dims, **kwargs): - for name in names: - randBlobFloat32(name, *dims, **kwargs) - - -def numOps(net): - return len(net.Proto().op) - - -def str_compare(a, b, encoding="utf8"): - if isinstance(a, bytes): - a = a.decode(encoding) - if isinstance(b, bytes): - b = b.decode(encoding) - return a == b - - -def get_default_test_flags(): - return [ - 'caffe2', - '--caffe2_log_level=0', - '--caffe2_cpu_allocator_do_zero_fill=0', - '--caffe2_cpu_allocator_do_junk_fill=1', - ] - - -class TestCase(unittest.TestCase): - @classmethod - def setUpClass(cls): - workspace.GlobalInit(get_default_test_flags()) - # clear the default engines settings to separate out its - # affect from the ops tests - core.SetEnginePref({}, {}) - - def setUp(self): - self.ws = workspace.C.Workspace() - workspace.ResetWorkspace() - - def tearDown(self): - workspace.ResetWorkspace() - - def make_tempdir(self) -> pathlib.Path: - tmp_folder = pathlib.Path(tempfile.mkdtemp(prefix="caffe2_test.")) - self.addCleanup(self._remove_tempdir, tmp_folder) - return tmp_folder - - def _remove_tempdir(self, path: pathlib.Path) -> None: - def _onerror( - fn: Callable[..., Any], - path: str, - exc_info: Tuple[Type[BaseException], BaseException, TracebackType], - ) -> None: - # Ignore FileNotFoundError, but re-raise anything else - if not isinstance(exc_info[1], FileNotFoundError): - raise exc_info[1].with_traceback(exc_info[2]) - - shutil.rmtree(str(path), onerror=_onerror) diff --git a/caffe2/python/text_file_reader.py b/caffe2/python/text_file_reader.py deleted file mode 100644 index 48f69f90c7b4..000000000000 --- a/caffe2/python/text_file_reader.py +++ /dev/null @@ -1,58 +0,0 @@ -## @package text_file_reader -# Module caffe2.python.text_file_reader - - - - -from caffe2.python import core -from caffe2.python.dataio import Reader -from caffe2.python.schema import Scalar, Struct, data_type_for_dtype - - -class TextFileReader(Reader): - """ - Wrapper around operators for reading from text files. - """ - def __init__(self, init_net, filename, schema, num_passes=1, batch_size=1): - """ - Create op for building a TextFileReader instance in the workspace. - - Args: - init_net : Net that will be run only once at startup. - filename : Path to file to read from. - schema : schema.Struct representing the schema of the data. - Currently, only support Struct of strings and float32. - num_passes : Number of passes over the data. - batch_size : Number of rows to read at a time. 
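Given the constructor arguments listed above, a hedged usage sketch of TextFileReader; the path and schema here are hypothetical, and init_net must be run once before any reads:

import numpy as np
from caffe2.python import core
from caffe2.python.schema import Scalar, Struct
from caffe2.python.text_file_reader import TextFileReader

init_net = core.Net('init')
reader = TextFileReader(
    init_net,
    filename='/tmp/data.tsv',                      # hypothetical path
    schema=Struct(('label', Scalar(np.float32))),  # strings/float32 only
    batch_size=32,
)

read_net = core.Net('read')
should_stop, record = reader.read(read_net)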
- """ - assert isinstance(schema, Struct), 'Schema must be a schema.Struct' - for name, child in schema.get_children(): - assert isinstance(child, Scalar), ( - 'Only scalar fields are supported in TextFileReader.') - field_types = [ - data_type_for_dtype(dtype) for dtype in schema.field_types()] - Reader.__init__(self, schema) - self._reader = init_net.CreateTextFileReader( - [], - filename=filename, - num_passes=num_passes, - field_types=field_types) - self._batch_size = batch_size - - def read(self, net): - """ - Create op for reading a batch of rows. - """ - blobs = net.TextFileReaderRead( - [self._reader], - len(self.schema().field_names()), - batch_size=self._batch_size) - if type(blobs) is core.BlobReference: - blobs = [blobs] - - is_empty = net.IsEmpty( - [blobs[0]], - core.ScopedBlobReference(net.NextName('should_stop')) - ) - - return (is_empty, blobs) diff --git a/caffe2/python/timeout_guard.py b/caffe2/python/timeout_guard.py deleted file mode 100644 index 5def5fea8c9a..000000000000 --- a/caffe2/python/timeout_guard.py +++ /dev/null @@ -1,113 +0,0 @@ -## @package timeout_guard -# Module caffe2.python.timeout_guard - - - - - -import contextlib -import threading -import os -import time -import signal -import logging - - -''' -Sometimes CUDA devices can get stuck, 'deadlock'. In this case it is often -better just the kill the process automatically. Use this guard to set a -maximum timespan for a python call, such as RunNet(). If it does not complete -in time, process is killed. - -Example usage: - with timeout_guard.CompleteInTimeOrDie(10.0): - core.RunNet(...) -''' - - -class WatcherThread(threading.Thread): - - def __init__(self, timeout_secs): - threading.Thread.__init__(self) - self.timeout_secs = timeout_secs - self.completed = False - self.condition = threading.Condition() - self.daemon = True - self.caller_thread = threading.current_thread() - - def run(self): - started = time.time() - self.condition.acquire() - while time.time() - started < self.timeout_secs and not self.completed: - self.condition.wait(self.timeout_secs - (time.time() - started)) - self.condition.release() - if not self.completed: - log = logging.getLogger("timeout_guard") - log.error("Call did not finish in time. Timeout:{}s PID: {}".format( - self.timeout_secs, - os.getpid(), - )) - - # First try dying cleanly, but in 10 secs, exit properly - def forcequit(): - time.sleep(10.0) - log.info("Prepared output, dumping threads. ") - print("Caller thread was: {}".format(self.caller_thread)) - print("-----After force------") - log.info("-----After force------") - import sys - import traceback - code = [] - for threadId, stack in sys._current_frames().items(): - if threadId == self.caller_thread.ident: - code.append("\n# ThreadID: %s" % threadId) - for filename, lineno, name, line in traceback.extract_stack(stack): - code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) - if line: - code.append(" %s" % (line.strip())) - - # Log also with logger, as it is comment practice to suppress print(). 
- print("\n".join(code)) - log.info("\n".join(code)) - log.error("Process did not terminate cleanly in 10 s, forcing") - os.abort() - - forcet = threading.Thread(target=forcequit, args=()) - forcet.daemon = True - forcet.start() - print("Caller thread was: {}".format(self.caller_thread)) - print("-----Before forcing------") - import sys - import traceback - code = [] - for threadId, stack in sys._current_frames().items(): - code.append("\n# ThreadID: %s" % threadId) - for filename, lineno, name, line in traceback.extract_stack(stack): - code.append('File: "%s", line %d, in %s' % (filename, lineno, name)) - if line: - code.append(" %s" % (line.strip())) - - # Log also with logger, as it is comment practice to suppress print(). - print("\n".join(code)) - log.info("\n".join(code)) - os.kill(os.getpid(), signal.SIGINT) - - -@contextlib.contextmanager -def CompleteInTimeOrDie(timeout_secs): - watcher = WatcherThread(timeout_secs) - watcher.start() - yield - watcher.completed = True - watcher.condition.acquire() - watcher.condition.notify() - watcher.condition.release() - - -def EuthanizeIfNecessary(timeout_secs=120): - ''' - Call this if you have problem with process getting stuck at shutdown. - It will kill the process if it does not terminate in timeout_secs. - ''' - watcher = WatcherThread(timeout_secs) - watcher.start() diff --git a/caffe2/python/toy_regression_test.py b/caffe2/python/toy_regression_test.py deleted file mode 100644 index b612b899ab71..000000000000 --- a/caffe2/python/toy_regression_test.py +++ /dev/null @@ -1,64 +0,0 @@ -import numpy as np -import unittest - -from caffe2.python import core, workspace, test_util - - -class TestToyRegression(test_util.TestCase): - def testToyRegression(self): - """Tests a toy regression end to end. - - The test code carries a simple toy regression in the form - y = 2.0 x1 + 1.5 x2 + 0.5 - by randomly generating gaussian inputs and calculating the ground - truth outputs in the net as well. It uses a standard SGD to then - train the parameters. - """ - workspace.ResetWorkspace() - init_net = core.Net("init") - W = init_net.UniformFill([], "W", shape=[1, 2], min=-1., max=1.) - B = init_net.ConstantFill([], "B", shape=[1], value=0.0) - W_gt = init_net.GivenTensorFill( - [], "W_gt", shape=[1, 2], values=[2.0, 1.5]) - B_gt = init_net.GivenTensorFill([], "B_gt", shape=[1], values=[0.5]) - LR = init_net.ConstantFill([], "LR", shape=[1], value=-0.1) - ONE = init_net.ConstantFill([], "ONE", shape=[1], value=1.) - ITER = init_net.ConstantFill([], "ITER", shape=[1], value=0, - dtype=core.DataType.INT64) - - train_net = core.Net("train") - X = train_net.GaussianFill([], "X", shape=[64, 2], mean=0.0, std=1.0) - Y_gt = X.FC([W_gt, B_gt], "Y_gt") - Y_pred = X.FC([W, B], "Y_pred") - dist = train_net.SquaredL2Distance([Y_gt, Y_pred], "dist") - loss = dist.AveragedLoss([], ["loss"]) - # Get gradients for all the computations above. Note that in fact we - # don't need to get the gradient the Y_gt computation, but we'll just - # leave it there. In many cases, I am expecting one to load X and Y - # from the disk, so there is really no operator that will calculate the - # Y_gt input. 
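The timeout_guard module deleted just above is used as a context manager around a potentially hanging call. A minimal sketch, with the 10-second timeout taken from the module's own docstring:

from caffe2.python import core, workspace, timeout_guard

net = core.Net('maybe_stuck')
net.ConstantFill([], ['x'], shape=[1], value=1.0)
workspace.CreateNet(net)

# If RunNet does not return within 10s (e.g. a wedged CUDA device), the
# watcher thread dumps all stacks and kills the process.
with timeout_guard.CompleteInTimeOrDie(10.0):
    workspace.RunNet(net)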
- input_to_grad = train_net.AddGradientOperators([loss], skip=2) - # updates - train_net.Iter(ITER, ITER) - train_net.LearningRate(ITER, "LR", base_lr=-0.1, - policy="step", stepsize=20, gamma=0.9) - train_net.WeightedSum([W, ONE, input_to_grad[str(W)], LR], W) - train_net.WeightedSum([B, ONE, input_to_grad[str(B)], LR], B) - for blob in [loss, W, B]: - train_net.Print(blob, []) - - # the CPU part. - plan = core.Plan("toy_regression") - plan.AddStep(core.ExecutionStep("init", init_net)) - plan.AddStep(core.ExecutionStep("train", train_net, 200)) - - workspace.RunPlan(plan) - W_result = workspace.FetchBlob("W") - B_result = workspace.FetchBlob("B") - np.testing.assert_array_almost_equal(W_result, [[2.0, 1.5]], decimal=2) - np.testing.assert_array_almost_equal(B_result, [0.5], decimal=2) - workspace.ResetWorkspace() - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/transformations.py b/caffe2/python/transformations.py deleted file mode 100644 index 78d3bc8b85ff..000000000000 --- a/caffe2/python/transformations.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - - -import caffe2.python._import_c_extension as C - - -class Transformer: - def __init__(self): - pass - - @classmethod - def runTransform(cls, transform_name, net): - pb = net.Proto().SerializeToString() - if C.transform_exists(transform_name): - output = C.run_transform(transform_name, pb) - elif C.workspace_transform_exists(transform_name): - output = C.run_workspace_transform(transform_name, pb) - else: - raise AttributeError('Transformation {} not found.'.format(transform_name)) - net.Proto().ParseFromString(output) - - def __getattr__(self, transform_name): - return lambda net : self.runTransform(transform_name, net) - - -def fuseNNPACKConvRelu(net): - net.Proto().ParseFromString( - C.transform_fuseNNPACKConvRelu(net.Proto().SerializeToString()) - ) - - -def optimizeForMKLDNN(net, training_mode = False): - net.Proto().ParseFromString( - C.transform_optimizeForMKLDNN(net.Proto().SerializeToString(), training_mode) - ) - - -def fuseConvBN(net): - net.Proto().ParseFromString( - C.transform_fuseConvBN(net.Proto().SerializeToString()) - ) diff --git a/caffe2/python/transformations_test.py b/caffe2/python/transformations_test.py deleted file mode 100644 index dbc906f7d405..000000000000 --- a/caffe2/python/transformations_test.py +++ /dev/null @@ -1,336 +0,0 @@ -# Copyright (c) 2016-present, Facebook, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
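transformations.py exposes every registered C++ graph transform as a method on Transformer via __getattr__. A minimal sketch mirroring the NNPACK fusion exercised by the tests that follow:

from caffe2.python import core
from caffe2.python.transformations import Transformer

transformer = Transformer()

net = core.Net('net')
net.Conv(['X', 'w', 'b'], ['Y'], stride=1, pad=0, kernel=3, order='NCHW')
net.Relu(['Y'], ['Y2'])

# Dispatched by name to C.run_transform / C.run_workspace_transform;
# an unregistered name raises AttributeError.
transformer.AddNNPACK(net)
transformer.FuseNNPACKConvRelu(net)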
-# See the License for the specific language governing permissions and -# limitations under the License. -############################################################################## - - - - - - -from hypothesis import given -import hypothesis.strategies as st -import numpy as np -import unittest - -from caffe2.python.transformations import Transformer -from caffe2.python import core, workspace -from caffe2.python import test_util as tu - -transformer = Transformer() - - -class TestTransformations(tu.TestCase): - def _base_test_net(self): - net = core.Net("net") - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - return net - - def _add_nnpack(self, net): - transformer.AddNNPACK(net) - assert tu.str_compare(net.Proto().op[0].engine, "NNPACK") - - def _fuse_nnpack_convrelu(self, net, expected_result_num_ops, - expected_activation_arg=True): - self._add_nnpack(net) - transformer.FuseNNPACKConvRelu(net) - self.assertEqual(tu.numOps(net), expected_result_num_ops) - has_activation_arg = False - for arg in net.Proto().op[0].arg: - if tu.str_compare(arg.name, "activation"): - assert tu.str_compare(arg.s, "Relu") - has_activation_arg = True - if expected_activation_arg: - assert has_activation_arg - else: - assert not has_activation_arg - - def test_transformer_AddNNPACK(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y2"]) - self._add_nnpack(net) - - def test_transformer_FuseNNPACKConvRelu(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y2"]) - self._fuse_nnpack_convrelu(net, 1) - - def test_noFuseNNPACKConvRelu(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y2"]) - net.Relu(["Y"], ["Y3"]) - self._fuse_nnpack_convrelu(net, 3, expected_activation_arg=False) - - def test_transformer_FuseNNPACKConvReluNoInplace(self): - net = self._base_test_net() - net.Relu(["Y"], ["X"]) - self._fuse_nnpack_convrelu(net, 1) - assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] - - def test_transformer_FuseNNPACKConvReluInplaceRelu(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y"]) - self._fuse_nnpack_convrelu(net, 1) - assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] - - def test_transformer_FuseNNPACKConvReluPingPongNaming(self): - net = self._base_test_net() - net.Relu(["Y"], ["X"]) - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - self._fuse_nnpack_convrelu(net, 2) - assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] - assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] - - def test_transformer_FuseNNPACKConvReluFollowedByMultipleInputOp(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y2"]) - net.Conv(["Y2", "w", "b"], ["Y"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y"], ["Y2"]) - self._fuse_nnpack_convrelu(net, 2) - assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] - assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] - - def test_transformer_FuseNNPACKConvReluInplaceFollowedByMultipleInputOp(self): - net = self._base_test_net() - net.Relu(["Y"], ["Y"]) - net.Conv(["Y", "w", "b"], ["Y2"], stride=1, pad=0, kernel=3, order="NCHW") - net.Relu(["Y2"], ["Y2"]) - self._fuse_nnpack_convrelu(net, 2) - assert net.Proto().op[0].output[0] != net.Proto().op[0].input[0] - assert net.Proto().op[1].output[0] != net.Proto().op[1].input[0] - - @given( - size=st.integers(7, 10), - input_channels=st.integers(1, 10), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, 
max_value=1e-2), - ) - def test_transformer_FuseConvBN(self, size, input_channels, seed, order, epsilon): - workspace.ResetWorkspace() - net = core.Net("net") - c = input_channels - h = size - w = size - k = 3 - net.Conv(["X", "w", "b"], ["Y"], stride=1, pad=0, kernel=k, order=order) - net.SpatialBN( - ["Y", "scale", "bias", "mean", "var"], - ["Y2"], - is_test=True, - order=order, - epsilon=epsilon, - ) - - np.random.seed(seed) - if order == "NCHW": - tu.randBlobFloat32("X", 1, c, h, w) - tu.randBlobFloat32("w", c, c, k, k) - else: - tu.randBlobFloat32("X", 1, h, w, c) - tu.randBlobFloat32("w", c, k, k, c) - tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) - - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - tu.randBlobFloat32("var", c, offset=0.5) - workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) - transformer.FuseConvBN(net) - - # Ensure fusion - assert tu.numOps(net) == 1 - workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() - # Check that there is no numerical difference - assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=5e-02, - atol=1e-03 - ) - - @unittest.skip("Test is flaky") - @given( - size=st.integers(7, 10), - input_channels=st.integers(1, 10), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - ) - def test_transformer_FuseConvBNNoConvBias(self, size, input_channels, seed, order, epsilon): - workspace.ResetWorkspace() - net = core.Net("net") - c = input_channels - h = size - w = size - k = 3 - net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) - net.SpatialBN( - ["Y", "scale", "bias", "mean", "var"], - ["Y2"], - is_test=True, - order=order, - epsilon=epsilon, - ) - - np.random.seed(seed) - if order == "NCHW": - tu.randBlobFloat32("X", 1, c, h, w) - tu.randBlobFloat32("w", c, c, k, k) - else: - tu.randBlobFloat32("X", 1, h, w, c) - tu.randBlobFloat32("w", c, k, k, c) - tu.randBlobsFloat32(["scale", "bias", "mean"], c) - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - tu.randBlobFloat32("var", c, offset=0.5) - workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) - transformer.FuseConvBN(net) - - # Ensure fusion - assert tu.numOps(net) == 1 - workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() - # Check that there is no numerical difference - assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=5e-02, - atol=1e-03 - ) - - @given( - size=st.integers(7, 10), - input_channels=st.integers(1, 10), - seed=st.integers(0, 65535), - order=st.sampled_from(["NCHW", "NHWC"]), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - ) - def test_transformer_FuseConvBNNoConvBiasDuplicatedName(self, size, input_channels, seed, order, epsilon): - workspace.ResetWorkspace() - net = core.Net("net") - c = input_channels - h = size - w = size - k = 3 - net.Conv(["X", "w"], ["Y"], stride=1, pad=0, kernel=k, order=order) - net.SpatialBN( - ["Y", "scale", "_bias0", "mean", "var"], - ["Y2"], - is_test=True, - order=order, - epsilon=epsilon, - ) - - np.random.seed(seed) - if order == "NCHW": - tu.randBlobFloat32("X", 1, c, h, w) - tu.randBlobFloat32("w", c, c, k, k) - else: - 
tu.randBlobFloat32("X", 1, h, w, c) - tu.randBlobFloat32("w", c, k, k, c) - tu.randBlobsFloat32(["scale", "_bias0", "mean"], c) - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - tu.randBlobFloat32("var", c, offset=0.5) - workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) - transformer.FuseConvBN(net) - - # Ensure fusion - assert tu.numOps(net) == 1 - workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() - print("pre") - print(preTransformOutput) - print("after") - print(postTransformOutput) - # Check that there is no numerical difference - assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=5e-02, - atol=1e-03 - ) - - @given( - size=st.integers(7, 10), - input_channels=st.integers(1, 10), - kt=st.integers(3, 5), - kh=st.integers(3, 5), - kw=st.integers(3, 5), - seed=st.integers(0, 65535), - epsilon=st.floats(min_value=1e-5, max_value=1e-2), - ) - def test_transformer_FuseConv3DBN( - self, size, input_channels, kt, kh, kw, seed, epsilon - ): - workspace.ResetWorkspace() - net = core.Net("net") - c = input_channels - t = size - h = size - w = size - net.Conv( - ["X", "w", "b"], - ["Y"], - kernels=[kt, kh, kw], - ) - net.SpatialBN( - ["Y", "scale", "bias", "mean", "var"], - ["Y2"], - is_test=True, - epsilon=epsilon, - ) - - np.random.seed(seed) - tu.randBlobFloat32("X", 1, c, t, h, w) - tu.randBlobFloat32("w", c, c, kt, kh, kw) - tu.randBlobsFloat32(["b", "scale", "bias", "mean"], c) - # This is necessary because 1/sqrt(var) is used and if var is too small - # we get floating point artifacts that cause test failures - tu.randBlobFloat32("var", c, offset=0.5) - workspace.RunNetOnce(net) - preTransformOutput = workspace.FetchBlob("Y2").flatten() - workspace.FeedBlob("Y2", np.zeros((1, 1))) - transformer.FuseConvBN(net) - - # Ensure fusion - assert tu.numOps(net) == 1 - workspace.RunNetOnce(net) - postTransformOutput = workspace.FetchBlob("Y2").flatten() - # Check that there is no numerical difference - assert np.allclose( - preTransformOutput, - postTransformOutput, - rtol=1e-02, - atol=1e-04 - ) - - def test_converterDontEnforceUnusedInputs(self): - net = core.Net("net") - net.Relu(["X"], ["Y"]) - net.Proto().external_input.extend(["fake"]) - # This should now work - transformer.AddNNPACK(net) # just testing the converter - - def test_converterDontEnforceUnusedOutputs(self): - net = core.Net("net") - net.Relu(["X"], ["Y"]) - net.Proto().external_output.extend(["fake"]) - transformer.AddNNPACK(net) # just testing the converter diff --git a/caffe2/python/trt/__init__.py b/caffe2/python/trt/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/trt/data/binoculars.jpeg b/caffe2/python/trt/data/binoculars.jpeg deleted file mode 100644 index 4bdbacda0035..000000000000 Binary files a/caffe2/python/trt/data/binoculars.jpeg and /dev/null differ diff --git a/caffe2/python/trt/data/class_labels.txt b/caffe2/python/trt/data/class_labels.txt deleted file mode 100644 index f40829ed0fc3..000000000000 --- a/caffe2/python/trt/data/class_labels.txt +++ /dev/null @@ -1,1000 +0,0 @@ -tench -goldfish -great white shark -tiger shark -hammerhead -electric ray -stingray -cock -hen -ostrich -brambling -goldfinch -house finch -junco -indigo bunting -robin -bulbul -jay -magpie -chickadee -water ouzel -kite -bald eagle -vulture -great grey owl -European fire 
salamander -common newt -eft -spotted salamander -axolotl -bullfrog -tree frog -tailed frog -loggerhead -leatherback turtle -mud turtle -terrapin -box turtle -banded gecko -common iguana -American chameleon -whiptail -agama -frilled lizard -alligator lizard -Gila monster -green lizard -African chameleon -Komodo dragon -African crocodile -American alligator -triceratops -thunder snake -ringneck snake -hognose snake -green snake -king snake -garter snake -water snake -vine snake -night snake -boa constrictor -rock python -Indian cobra -green mamba -sea snake -horned viper -diamondback -sidewinder -trilobite -harvestman -scorpion -black and gold garden spider -barn spider -garden spider -black widow -tarantula -wolf spider -tick -centipede -black grouse -ptarmigan -ruffed grouse -prairie chicken -peacock -quail -partridge -African grey -macaw -sulphur-crested cockatoo -lorikeet -coucal -bee eater -hornbill -hummingbird -jacamar -toucan -drake -red-breasted merganser -goose -black swan -tusker -echidna -platypus -wallaby -koala -wombat -jellyfish -sea anemone -brain coral -flatworm -nematode -conch -snail -slug -sea slug -chiton -chambered nautilus -Dungeness crab -rock crab -fiddler crab -king crab -American lobster -spiny lobster -crayfish -hermit crab -isopod -white stork -black stork -spoonbill -flamingo -little blue heron -American egret -bittern -crane -limpkin -European gallinule -American coot -bustard -ruddy turnstone -red-backed sandpiper -redshank -dowitcher -oystercatcher -pelican -king penguin -albatross -grey whale -killer whale -dugong -sea lion -Chihuahua -Japanese spaniel -Maltese dog -Pekinese -Shih-Tzu -Blenheim spaniel -papillon -toy terrier -Rhodesian ridgeback -Afghan hound -basset -beagle -bloodhound -bluetick -black-and-tan coonhound -Walker hound -English foxhound -redbone -borzoi -Irish wolfhound -Italian greyhound -whippet -Ibizan hound -Norwegian elkhound -otterhound -Saluki -Scottish deerhound -Weimaraner -Staffordshire bullterrier -American Staffordshire terrier -Bedlington terrier -Border terrier -Kerry blue terrier -Irish terrier -Norfolk terrier -Norwich terrier -Yorkshire terrier -wire-haired fox terrier -Lakeland terrier -Sealyham terrier -Airedale -cairn -Australian terrier -Dandie Dinmont -Boston bull -miniature schnauzer -giant schnauzer -standard schnauzer -Scotch terrier -Tibetan terrier -silky terrier -soft-coated wheaten terrier -West Highland white terrier -Lhasa -flat-coated retriever -curly-coated retriever -golden retriever -Labrador retriever -Chesapeake Bay retriever -German short-haired pointer -vizsla -English setter -Irish setter -Gordon setter -Brittany spaniel -clumber -English springer -Welsh springer spaniel -cocker spaniel -Sussex spaniel -Irish water spaniel -kuvasz -schipperke -groenendael -malinois -briard -kelpie -komondor -Old English sheepdog -Shetland sheepdog -collie -Border collie -Bouvier des Flandres -Rottweiler -German shepherd -Doberman -miniature pinscher -Greater Swiss Mountain dog -Bernese mountain dog -Appenzeller -EntleBucher -boxer -bull mastiff -Tibetan mastiff -French bulldog -Great Dane -Saint Bernard -Eskimo dog -malamute -Siberian husky -dalmatian -affenpinscher -basenji -pug -Leonberg -Newfoundland -Great Pyrenees -Samoyed -Pomeranian -chow -keeshond -Brabancon griffon -Pembroke -Cardigan -toy poodle -miniature poodle -standard poodle -Mexican hairless -timber wolf -white wolf -red wolf -coyote -dingo -dhole -African hunting dog -hyena -red fox -kit fox -Arctic fox -grey fox -tabby -tiger cat -Persian cat -Siamese 
cat -Egyptian cat -cougar -lynx -leopard -snow leopard -jaguar -lion -tiger -cheetah -brown bear -American black bear -ice bear -sloth bear -mongoose -meerkat -tiger beetle -ladybug -ground beetle -long-horned beetle -leaf beetle -dung beetle -rhinoceros beetle -weevil -fly -bee -ant -grasshopper -cricket -walking stick -cockroach -mantis -cicada -leafhopper -lacewing -dragonfly -damselfly -admiral -ringlet -monarch -cabbage butterfly -sulphur butterfly -lycaenid -starfish -sea urchin -sea cucumber -wood rabbit -hare -Angora -hamster -porcupine -fox squirrel -marmot -beaver -guinea pig -sorrel -zebra -hog -wild boar -warthog -hippopotamus -ox -water buffalo -bison -ram -bighorn -ibex -hartebeest -impala -gazelle -Arabian camel -llama -weasel -mink -polecat -black-footed ferret -otter -skunk -badger -armadillo -three-toed sloth -orangutan -gorilla -chimpanzee -gibbon -siamang -guenon -patas -baboon -macaque -langur -colobus -proboscis monkey -marmoset -capuchin -howler monkey -titi -spider monkey -squirrel monkey -Madagascar cat -indri -Indian elephant -African elephant -lesser panda -giant panda -barracouta -eel -coho -rock beauty -anemone fish -sturgeon -gar -lionfish -puffer -abacus -abaya -academic gown -accordion -acoustic guitar -aircraft carrier -airliner -airship -altar -ambulance -amphibian -analog clock -apiary -apron -ashcan -assault rifle -backpack -bakery -balance beam -balloon -ballpoint -Band Aid -banjo -bannister -barbell -barber chair -barbershop -barn -barometer -barrel -barrow -baseball -basketball -bassinet -bassoon -bathing cap -bath towel -bathtub -beach wagon -beacon -beaker -bearskin -beer bottle -beer glass -bell cote -bib -bicycle-built-for-two -bikini -binder -binoculars -birdhouse -boathouse -bobsled -bolo tie -bonnet -bookcase -bookshop -bottlecap -bow -bow tie -brass -brassiere -breakwater -breastplate -broom -bucket -buckle -bulletproof vest -bullet train -butcher shop -cab -caldron -candle -cannon -canoe -can opener -cardigan -car mirror -carousel -carpenter's kit -carton -car wheel -cash machine -cassette -cassette player -castle -catamaran -CD player -cello -cellular telephone -chain -chainlink fence -chain mail -chain saw -chest -chiffonier -chime -china cabinet -Christmas stocking -church -cinema -cleaver -cliff dwelling -cloak -clog -cocktail shaker -coffee mug -coffeepot -coil -combination lock -computer keyboard -confectionery -container ship -convertible -corkscrew -cornet -cowboy boot -cowboy hat -cradle -crane -crash helmet -crate -crib -Crock Pot -croquet ball -crutch -cuirass -dam -desk -desktop computer -dial telephone -diaper -digital clock -digital watch -dining table -dishrag -dishwasher -disk brake -dock -dogsled -dome -doormat -drilling platform -drum -drumstick -dumbbell -Dutch oven -electric fan -electric guitar -electric locomotive -entertainment center -envelope -espresso maker -face powder -feather boa -file -fireboat -fire engine -fire screen -flagpole -flute -folding chair -football helmet -forklift -fountain -fountain pen -four-poster -freight car -French horn -frying pan -fur coat -garbage truck -gasmask -gas pump -goblet -go-kart -golf ball -golfcart -gondola -gong -gown -grand piano -greenhouse -grille -grocery store -guillotine -hair slide -hair spray -half track -hammer -hamper -hand blower -hand-held computer -handkerchief -hard disc -harmonica -harp -harvester -hatchet -holster -home theater -honeycomb -hook -hoopskirt -horizontal bar -horse cart -hourglass -iPod -iron -jack-o'-lantern -jean -jeep -jersey -jigsaw puzzle 
-jinrikisha -joystick -kimono -knee pad -knot -lab coat -ladle -lampshade -laptop -lawn mower -lens cap -letter opener -library -lifeboat -lighter -limousine -liner -lipstick -Loafer -lotion -loudspeaker -loupe -lumbermill -magnetic compass -mailbag -mailbox -maillot -maillot -manhole cover -maraca -marimba -mask -matchstick -maypole -maze -measuring cup -medicine chest -megalith -microphone -microwave -military uniform -milk can -minibus -miniskirt -minivan -missile -mitten -mixing bowl -mobile home -Model T -modem -monastery -monitor -moped -mortar -mortarboard -mosque -mosquito net -motor scooter -mountain bike -mountain tent -mouse -mousetrap -moving van -muzzle -nail -neck brace -necklace -nipple -notebook -obelisk -oboe -ocarina -odometer -oil filter -organ -oscilloscope -overskirt -oxcart -oxygen mask -packet -paddle -paddlewheel -padlock -paintbrush -pajama -palace -panpipe -paper towel -parachute -parallel bars -park bench -parking meter -passenger car -patio -pay-phone -pedestal -pencil box -pencil sharpener -perfume -Petri dish -photocopier -pick -pickelhaube -picket fence -pickup -pier -piggy bank -pill bottle -pillow -ping-pong ball -pinwheel -pirate -pitcher -plane -planetarium -plastic bag -plate rack -plow -plunger -Polaroid camera -pole -police van -poncho -pool table -pop bottle -pot -potter's wheel -power drill -prayer rug -printer -prison -projectile -projector -puck -punching bag -purse -quill -quilt -racer -racket -radiator -radio -radio telescope -rain barrel -recreational vehicle -reel -reflex camera -refrigerator -remote control -restaurant -revolver -rifle -rocking chair -rotisserie -rubber eraser -rugby ball -rule -running shoe -safe -safety pin -saltshaker -sandal -sarong -sax -scabbard -scale -school bus -schooner -scoreboard -screen -screw -screwdriver -seat belt -sewing machine -shield -shoe shop -shoji -shopping basket -shopping cart -shovel -shower cap -shower curtain -ski -ski mask -sleeping bag -slide rule -sliding door -slot -snorkel -snowmobile -snowplow -soap dispenser -soccer ball -sock -solar dish -sombrero -soup bowl -space bar -space heater -space shuttle -spatula -speedboat -spider web -spindle -sports car -spotlight -stage -steam locomotive -steel arch bridge -steel drum -stethoscope -stole -stone wall -stopwatch -stove -strainer -streetcar -stretcher -studio couch -stupa -submarine -suit -sundial -sunglass -sunglasses -sunscreen -suspension bridge -swab -sweatshirt -swimming trunks -swing -switch -syringe -table lamp -tank -tape player -teapot -teddy -television -tennis ball -thatch -theater curtain -thimble -thresher -throne -tile roof -toaster -tobacco shop -toilet seat -torch -totem pole -tow truck -toyshop -tractor -trailer truck -tray -trench coat -tricycle -trimaran -tripod -triumphal arch -trolleybus -trombone -tub -turnstile -typewriter keyboard -umbrella -unicycle -upright -vacuum -vase -vault -velvet -vending machine -vestment -viaduct -violin -volleyball -waffle iron -wall clock -wallet -wardrobe -warplane -washbasin -washer -water bottle -water jug -water tower -whiskey jug -whistle -wig -window screen -window shade -Windsor tie -wine bottle -wing -wok -wooden spoon -wool -worm fence -wreck -yawl -yurt -web site -comic book -crossword puzzle -street sign -traffic light -book jacket -menu -plate -guacamole -consomme -hot pot -trifle -ice cream -ice lolly -French loaf -bagel -pretzel -cheeseburger -hotdog -mashed potato -head cabbage -broccoli -cauliflower -zucchini -spaghetti squash -acorn squash -butternut squash -cucumber -artichoke 
-bell pepper -cardoon -mushroom -Granny Smith -strawberry -orange -lemon -fig -pineapple -banana -jackfruit -custard apple -pomegranate -hay -carbonara -chocolate sauce -dough -meat loaf -pizza -potpie -burrito -red wine -espresso -cup -eggnog -alp -bubble -cliff -coral reef -geyser -lakeside -promontory -sandbar -seashore -valley -volcano -ballplayer -groom -scuba diver -rapeseed -daisy -yellow lady's slipper -corn -acorn -hip -buckeye -coral fungus -agaric -gyromitra -stinkhorn -earthstar -hen-of-the-woods -bolete -ear -toilet tissue diff --git a/caffe2/python/trt/data/reflex_camera.jpeg b/caffe2/python/trt/data/reflex_camera.jpeg deleted file mode 100644 index fecc7616f54e..000000000000 Binary files a/caffe2/python/trt/data/reflex_camera.jpeg and /dev/null differ diff --git a/caffe2/python/trt/data/tabby_tiger_cat.jpg b/caffe2/python/trt/data/tabby_tiger_cat.jpg deleted file mode 100644 index ffcd2be2c674..000000000000 Binary files a/caffe2/python/trt/data/tabby_tiger_cat.jpg and /dev/null differ diff --git a/caffe2/python/trt/test_pt_onnx_trt.py b/caffe2/python/trt/test_pt_onnx_trt.py deleted file mode 100644 index 3ba67ea21244..000000000000 --- a/caffe2/python/trt/test_pt_onnx_trt.py +++ /dev/null @@ -1,190 +0,0 @@ -################################################################################################### -# ATTENTION! This test will most probably fail if you install TensorRT 6.0.1 only. -# That's because it's shipped with older version of ONNX parser not supporting some -# required features. To make it work please use new version: https://github.com/onnx/onnx-tensorrt -# Just clone it and do something like this: -# -# ~/pt/third_party/onnx-tensorrt$ mkdir build/ -# ~/pt/third_party/onnx-tensorrt$ cd build/ -# ~/pt/third_party/onnx-tensorrt/build$ cmake .. -# ~/pt/third_party/onnx-tensorrt/build$ make -# ~/pt/third_party/onnx-tensorrt/build$ sudo cp libnvonnxparser.so.6.0.1 /usr/lib/x86_64-linux-gnu -# -# This note is valid for 6.0.1 release only. September 18th, 2019. -################################################################################################### - -import os -import unittest - -from PIL import Image -import numpy as np -import torch -import torchvision.models as models - -import pycuda.driver as cuda -# This import causes pycuda to automatically manage CUDA context creation and cleanup. 
-import pycuda.autoinit - -import tensorrt as trt -TRT_LOGGER = trt.Logger(trt.Logger.WARNING) - -def allocate_buffers(engine): - h_input = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(0)), - dtype=trt.nptype(trt.float32)) - h_output = cuda.pagelocked_empty(trt.volume(engine.get_binding_shape(1)), - dtype=trt.nptype(trt.float32)) - d_input = cuda.mem_alloc(h_input.nbytes) - d_output = cuda.mem_alloc(h_output.nbytes) - stream = cuda.Stream() - return h_input, d_input, h_output, d_output, stream - -def load_normalized_test_case(input_shape, test_image, pagelocked_buffer, normalization_hint): - def normalize_image(image): - c, h, w = input_shape - image_arr = np.asarray(image.resize((w, h), Image.ANTIALIAS)).transpose([2, 0, 1])\ - .astype(trt.nptype(trt.float32)).ravel() - if (normalization_hint == 0): - return (image_arr / 255.0 - 0.45) / 0.225 - elif (normalization_hint == 1): - return (image_arr / 256.0 - 0.5) - np.copyto(pagelocked_buffer, normalize_image(Image.open(test_image))) - return test_image - -class Test_PT_ONNX_TRT(unittest.TestCase): - def __enter__(self): - return self - - def setUp(self): - data_path = os.path.join(os.path.dirname(__file__), 'data') - self.image_files=["binoculars.jpeg", "reflex_camera.jpeg", "tabby_tiger_cat.jpg"] - for index, f in enumerate(self.image_files): - self.image_files[index] = os.path.abspath(os.path.join(data_path, f)) - if not os.path.exists(self.image_files[index]): - raise FileNotFoundError(self.image_files[index] + " does not exist.") - with open(os.path.abspath(os.path.join(data_path, "class_labels.txt")), 'r') as f: - self.labels = f.read().split('\n') - - def build_engine_onnx(self, model_file): - with trt.Builder(TRT_LOGGER) as builder, builder.create_network(flags = 1) as network, trt.OnnxParser(network, TRT_LOGGER) as parser: - builder_config = builder.create_builder_config() - builder_config.max_workspace_size = 1 << 33 - with open(model_file, 'rb') as model: - if not parser.parse(model.read()): - for error in range(parser.num_errors): - self.fail("ERROR: {}".format(parser.get_error(error))) - return builder.build_engine(network, builder_config) - - def _test_model(self, model_name, input_shape = (3, 224, 224), normalization_hint = 0): - - model = getattr(models, model_name)(pretrained=True) - - shape = (1,) + input_shape - dummy_input = (torch.randn(shape),) - onnx_name = model_name + ".onnx" - - torch.onnx.export(model, - dummy_input, - onnx_name, - input_names = [], - output_names = [], - verbose=False, - export_params=True, - opset_version=9) - - with self.build_engine_onnx(onnx_name) as engine: - h_input, d_input, h_output, d_output, stream = allocate_buffers(engine) - with engine.create_execution_context() as context: - err_count = 0 - for index, f in enumerate(self.image_files): - test_case = load_normalized_test_case(input_shape, f,\ - h_input, normalization_hint) - cuda.memcpy_htod_async(d_input, h_input, stream) - - context.execute_async_v2(bindings=[d_input, d_output], - stream_handle=stream.handle) - cuda.memcpy_dtoh_async(h_output, d_output, stream) - stream.synchronize() - - amax = np.argmax(h_output) - pred = self.labels[amax] - if "_".join(pred.split()) not in\ - os.path.splitext(os.path.basename(test_case))[0]: - err_count = err_count + 1 - self.assertLessEqual(err_count, 1, "Too many recognition errors") - - def test_alexnet(self): - self._test_model("alexnet", (3, 227, 227)) - - def test_resnet18(self): - self._test_model("resnet18") - def test_resnet34(self): - self._test_model("resnet34") - def 
test_resnet50(self): - self._test_model("resnet50") - def test_resnet101(self): - self._test_model("resnet101") - @unittest.skip("Takes 2m") - def test_resnet152(self): - self._test_model("resnet152") - - def test_resnet50_2(self): - self._test_model("wide_resnet50_2") - @unittest.skip("Takes 2m") - def test_resnet101_2(self): - self._test_model("wide_resnet101_2") - - def test_squeezenet1_0(self): - self._test_model("squeezenet1_0") - def test_squeezenet1_1(self): - self._test_model("squeezenet1_1") - - def test_googlenet(self): - self._test_model("googlenet") - def test_inception_v3(self): - self._test_model("inception_v3") - - def test_mnasnet0_5(self): - self._test_model("mnasnet0_5", normalization_hint = 1) - def test_mnasnet1_0(self): - self._test_model("mnasnet1_0", normalization_hint = 1) - - def test_mobilenet_v2(self): - self._test_model("mobilenet_v2", normalization_hint = 1) - - def test_shufflenet_v2_x0_5(self): - self._test_model("shufflenet_v2_x0_5") - def test_shufflenet_v2_x1_0(self): - self._test_model("shufflenet_v2_x1_0") - - def test_vgg11(self): - self._test_model("vgg11") - def test_vgg11_bn(self): - self._test_model("vgg11_bn") - def test_vgg13(self): - self._test_model("vgg13") - def test_vgg13_bn(self): - self._test_model("vgg13_bn") - def test_vgg16(self): - self._test_model("vgg16") - def test_vgg16_bn(self): - self._test_model("vgg16_bn") - def test_vgg19(self): - self._test_model("vgg19") - def test_vgg19_bn(self): - self._test_model("vgg19_bn") - - @unittest.skip("Takes 13m") - def test_densenet121(self): - self._test_model("densenet121") - @unittest.skip("Takes 25m") - def test_densenet161(self): - self._test_model("densenet161") - @unittest.skip("Takes 27m") - def test_densenet169(self): - self._test_model("densenet169") - @unittest.skip("Takes 44m") - def test_densenet201(self): - self._test_model("densenet201") - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/trt/test_trt.py b/caffe2/python/trt/test_trt.py deleted file mode 100644 index 495dc27fcd5b..000000000000 --- a/caffe2/python/trt/test_trt.py +++ /dev/null @@ -1,279 +0,0 @@ - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, workspace -import onnx -import onnx.defs -from onnx.helper import make_node, make_graph, make_tensor_value_info, make_model -from onnx.backend.base import namedtupledict -from caffe2.python.models.download import ModelDownloader -import caffe2.python.onnx.backend as c2 -from caffe2.python.onnx.workspace import Workspace -from caffe2.python.trt.transform import convert_onnx_model_to_trt_op, transform_caffe2_net -from caffe2.python.onnx.tests.test_utils import TestCase -import numpy as np -import os.path -import time -import unittest -import tarfile -import tempfile -import shutil -from urllib.request import urlretrieve - -def _print_net(net): - for i in net.external_input: - print("Input: {}".format(i)) - for i in net.external_output: - print("Output: {}".format(i)) - for op in net.op: - print("Op {}".format(op.type)) - for x in op.input: - print(" input: {}".format(x)) - for y in op.output: - print(" output: {}".format(y)) - - -def _base_url(opset_version): - return 'https://s3.amazonaws.com/download.onnx/models/opset_{}'.format(opset_version) - -# TODO: This is copied from https://github.com/onnx/onnx/blob/master/onnx/backend/test/runner/__init__.py. 
Maybe we should -# expose a model retrival API from ONNX -def _download_onnx_model(model_name, opset_version): - onnx_home = os.path.expanduser(os.getenv('ONNX_HOME', os.path.join('~', '.onnx'))) - models_dir = os.getenv('ONNX_MODELS', - os.path.join(onnx_home, 'models')) - model_dir = os.path.join(models_dir, model_name) - if not os.path.exists(os.path.join(model_dir, 'model.onnx')): - if os.path.exists(model_dir): - bi = 0 - while True: - dest = '{}.old.{}'.format(model_dir, bi) - if os.path.exists(dest): - bi += 1 - continue - shutil.move(model_dir, dest) - break - os.makedirs(model_dir) - - # On Windows, NamedTemporaryFile can not be opened for a - # second time - url = '{}/{}.tar.gz'.format(_base_url(opset_version), model_name) - download_file = tempfile.NamedTemporaryFile(delete=False) - try: - download_file.close() - print('Start downloading model {} from {}'.format( - model_name, url)) - urlretrieve(url, download_file.name) - print('Done') - with tarfile.open(download_file.name) as t: - t.extractall(models_dir) - except Exception as e: - print('Failed to prepare data for model {}: {}'.format( - model_name, e)) - raise - finally: - os.remove(download_file.name) - return model_dir - -class TensorRTOpTest(TestCase): - def setUp(self): - self.opset_version = onnx.defs.onnx_opset_version() - - def _test_relu_graph(self, X, batch_size, trt_max_batch_size): - node_def = make_node("Relu", ["X"], ["Y"]) - Y_c2 = c2.run_node(node_def, {"X": X}) - graph_def = make_graph( - [node_def], - name="test", - inputs=[make_tensor_value_info("X", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])], - outputs=[make_tensor_value_info("Y", onnx.TensorProto.FLOAT, [batch_size, 1, 3, 2])]) - model_def = make_model(graph_def, producer_name='relu-test') - op_outputs = [x.name for x in model_def.graph.output] - op = convert_onnx_model_to_trt_op(model_def, max_batch_size=trt_max_batch_size) - device_option = core.DeviceOption(caffe2_pb2.CUDA, 0) - op.device_option.CopyFrom(device_option) - Y_trt = None - ws = Workspace() - with core.DeviceScope(device_option): - ws.FeedBlob("X", X) - ws.RunOperatorsOnce([op]) - output_values = [ws.FetchBlob(name) for name in op_outputs] - Y_trt = namedtupledict('Outputs', op_outputs)(*output_values) - np.testing.assert_almost_equal(Y_c2, Y_trt) - - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_relu_graph_simple(self): - X = np.random.randn(1, 1, 3, 2).astype(np.float32) - self._test_relu_graph(X, 1, 50) - - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_relu_graph_big_batch(self): - X = np.random.randn(52, 1, 3, 2).astype(np.float32) - self._test_relu_graph(X, 52, 50) - - def _test_onnx_importer(self, model_name, data_input_index, opset_version=onnx.defs.onnx_opset_version()): - model_dir = _download_onnx_model(model_name, opset_version) - model_def = onnx.load(os.path.join(model_dir, 'model.onnx')) - input_blob_dims = [int(x.dim_value) for x in model_def.graph.input[data_input_index].type.tensor_type.shape.dim] - op_inputs = [x.name for x in model_def.graph.input] - op_outputs = [x.name for x in model_def.graph.output] - print("{}".format(op_inputs)) - data = np.random.randn(*input_blob_dims).astype(np.float32) - Y_c2 = c2.run_model(model_def, {op_inputs[data_input_index]: data}) - op = convert_onnx_model_to_trt_op(model_def, verbosity=3) - device_option = core.DeviceOption(caffe2_pb2.CUDA, 0) - op.device_option.CopyFrom(device_option) - Y_trt = None - ws = Workspace() - with core.DeviceScope(device_option): - 
ws.FeedBlob(op_inputs[data_input_index], data) - if opset_version >= 5: - # Some newer models from ONNX Zoo come with pre-set "data_0" input - ws.FeedBlob("data_0", data) - ws.RunOperatorsOnce([op]) - output_values = [ws.FetchBlob(name) for name in op_outputs] - Y_trt = namedtupledict('Outputs', op_outputs)(*output_values) - np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_resnet50(self): - self._test_onnx_importer('resnet50', 0, 9) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_bvlc_alexnet(self): - self._test_onnx_importer('bvlc_alexnet', 0, 9) - - @unittest.skip("Until fixing Unsqueeze op") - def test_densenet121(self): - self._test_onnx_importer('densenet121', -1, 3) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_inception_v1(self): - self._test_onnx_importer('inception_v1', -3, 9) - - @unittest.skip("Until fixing Unsqueeze op") - def test_inception_v2(self): - self._test_onnx_importer('inception_v2', 0, 9) - - @unittest.skip('Need to revisit our ChannelShuffle exporter to avoid generating 5D tensor') - def test_shufflenet(self): - self._test_onnx_importer('shufflenet', 0) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_squeezenet(self): - self._test_onnx_importer('squeezenet', -1, 9) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_vgg16(self): - self._test_onnx_importer('vgg16', 0, 9) - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_vgg19(self): - self._test_onnx_importer('vgg19', -2, 9) - - -class TensorRTTransformTest(TestCase): - def setUp(self): - self.model_downloader = ModelDownloader() - - def _add_head_tail(self, pred_net, new_head, new_tail): - orig_head = pred_net.external_input[0] - orig_tail = pred_net.external_output[0] - - # Add head - head = caffe2_pb2.OperatorDef() - head.type = "Copy" - head.input.append(new_head) - head.output.append(orig_head) - dummy = caffe2_pb2.NetDef() - dummy.op.extend(pred_net.op) - del pred_net.op[:] - pred_net.op.extend([head]) - pred_net.op.extend(dummy.op) - pred_net.external_input[0] = new_head - - # Add tail - tail = caffe2_pb2.OperatorDef() - tail.type = "Copy" - tail.input.append(orig_tail) - tail.output.append(new_tail) - pred_net.op.extend([tail]) - pred_net.external_output[0] = new_tail - - @unittest.skipIf(not workspace.C.use_trt, "No TensortRT support") - def test_resnet50_core(self): - N = 2 - warmup = 20 - repeat = 100 - print("Batch size: {}, repeat inference {} times, warmup {} times".format(N, repeat, warmup)) - init_net, pred_net, _ = self.model_downloader.get_c2_model('resnet50') - self._add_head_tail(pred_net, 'real_data', 'real_softmax') - input_blob_dims = (N, 3, 224, 224) - input_name = "real_data" - - device_option = core.DeviceOption(caffe2_pb2.CUDA, 0) - init_net.device_option.CopyFrom(device_option) - pred_net.device_option.CopyFrom(device_option) - for op in pred_net.op: - op.device_option.CopyFrom(device_option) - op.engine = 'CUDNN' - net_outputs = pred_net.external_output - Y_c2 = None - data = np.random.randn(*input_blob_dims).astype(np.float32) - c2_time = 1 - workspace.SwitchWorkspace("gpu_test", True) - with core.DeviceScope(device_option): - workspace.FeedBlob(input_name, data) - workspace.RunNetOnce(init_net) - workspace.CreateNet(pred_net) - for _ in range(warmup): - workspace.RunNet(pred_net.name) - start = time.time() - for _ in range(repeat): - 
workspace.RunNet(pred_net.name) - end = time.time() - c2_time = end - start - output_values = [workspace.FetchBlob(name) for name in net_outputs] - Y_c2 = namedtupledict('Outputs', net_outputs)(*output_values) - workspace.ResetWorkspace() - - # Fill the workspace with the weights - with core.DeviceScope(device_option): - workspace.RunNetOnce(init_net) - - # Cut the graph - start = time.time() - pred_net_cut = transform_caffe2_net(pred_net, - {input_name: input_blob_dims}, - build_serializable_op=False) - del init_net, pred_net - pred_net_cut.device_option.CopyFrom(device_option) - for op in pred_net_cut.op: - op.device_option.CopyFrom(device_option) - #_print_net(pred_net_cut) - - Y_trt = None - input_name = pred_net_cut.external_input[0] - print("C2 runtime: {}s".format(c2_time)) - with core.DeviceScope(device_option): - workspace.FeedBlob(input_name, data) - workspace.CreateNet(pred_net_cut) - end = time.time() - print("Conversion time: {:.2f}s".format(end -start)) - - for _ in range(warmup): - workspace.RunNet(pred_net_cut.name) - start = time.time() - for _ in range(repeat): - workspace.RunNet(pred_net_cut.name) - end = time.time() - trt_time = end - start - print("TRT runtime: {}s, improvement: {}%".format(trt_time, (c2_time-trt_time)/c2_time*100)) - output_values = [workspace.FetchBlob(name) for name in net_outputs] - Y_trt = namedtupledict('Outputs', net_outputs)(*output_values) - np.testing.assert_allclose(Y_c2, Y_trt, rtol=1e-3) diff --git a/caffe2/python/trt/transform.py b/caffe2/python/trt/transform.py deleted file mode 100644 index aee27d6826fb..000000000000 --- a/caffe2/python/trt/transform.py +++ /dev/null @@ -1,108 +0,0 @@ -## @package onnx -#Module caffe2.python.trt.transform - -""" -TensorRT related transformation -Note that ONNX-TRT enforce an NCHW input! 
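For context, the main entry point of this deleted module was typically driven like the following minimal sketch, assuming a CUDA-enabled Caffe2 build and an NCHW ONNX model; the file name and batch size are hypothetical:

```python
import onnx
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python.trt.transform import convert_onnx_model_to_trt_op

# Collapse the whole ONNX graph into one TensorRT-backed Caffe2 op;
# "model.onnx" is a hypothetical path and the input layout must be NCHW.
model = onnx.load("model.onnx")
trt_op = convert_onnx_model_to_trt_op(model, max_batch_size=32)
trt_op.device_option.CopyFrom(core.DeviceOption(caffe2_pb2.CUDA, 0))
```

The resulting OperatorDef could then be run under a CUDA DeviceScope exactly as the deleted test_trt.py above does with its relu graph.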
-""" - - - - - - -from caffe2.proto import caffe2_pb2 -from caffe2.python import workspace -import caffe2.python._import_c_extension as C -import numpy as np - -def _dim_values_to_list(dim_values): - return [x.dim_value for x in dim_values] - - -def _get_output_shapes(output_value_infos): - names = [x.name for x in output_value_infos] - shapes = [_dim_values_to_list(x.type.tensor_type.shape.dim) for x in output_value_infos] - return dict(zip(names, shapes)) - - -def check_gpu_(): - try: - C.get_cuda_version() - except Exception as e: - raise Exception("TensorRT related functions require CUDA support") from e - -def convert_onnx_model_to_trt_op(onnx_model, - max_batch_size=64, - max_workspace_size=2*1024*1024, - verbosity=1, - debug_builder=False): - """ - Convert the whole ONNX model to a TensorRT C2 op - """ - check_gpu_() - trt_str = C.onnx_to_trt_op(onnx_model.SerializeToString(), - _get_output_shapes(onnx_model.graph.output), - max_batch_size, - max_workspace_size, - verbosity, - debug_builder) - op = caffe2_pb2.OperatorDef() - op.ParseFromString(trt_str) - return op - - -# Assume the workspace is already filled with init weights -def _infer_shapes(pred_net, inputs): - workspace.RunNetOnce(pred_net) - hints = {} - for op in pred_net.op: - for o in op.output: - if o not in hints: - blob = workspace.FetchBlob(o) - if hasattr(blob, 'shape'): - hints[o] = blob.shape - for i in op.input: - if i not in hints: - blob = workspace.FetchBlob(i) - if hasattr(blob, 'shape'): - hints[i] = blob.shape - - return hints - - -def transform_caffe2_net( - pred_net, - input_shapes, - populate_shapes = False, - max_batch_size=64, - max_workspace_size=2*1024*1024, - verbosity=1, - debug_builder=False, - build_serializable_op=True): - """ - Transform the caffe2_net by collapsing TRT-runnable nodes into trt c2 ops - """ - check_gpu_() - - # Hacky way to infer shapes as not all our operators have shape inference function. - # Normally this is not needed - shape_hints = {} - if populate_shapes: - input_data = {} - for k,v in input_shapes.items(): - input_data[k] = np.random.randn(*v).astype(np.float32) - shape_hints = _infer_shapes(pred_net, input_data) - - for k,v in input_shapes.items(): - shape_hints[k] = v - pred_net_str = C.transform_trt(pred_net.SerializeToString(), - shape_hints, - max_batch_size, - max_workspace_size, - verbosity, - debug_builder, - build_serializable_op) - pred_net_cut = caffe2_pb2.NetDef() - pred_net_cut.ParseFromString(pred_net_str) - return pred_net_cut diff --git a/caffe2/python/tt_core.py b/caffe2/python/tt_core.py deleted file mode 100644 index 314718b76c9d..000000000000 --- a/caffe2/python/tt_core.py +++ /dev/null @@ -1,241 +0,0 @@ -## @package tt_core -# Module caffe2.python.tt_core - - - - -import numpy as np - -""" -The following methods are various utility methods for using the Tensor-Train -decomposition, or TT-decomposition introduced by I. V. Oseledets (2011) in his -paper (http://epubs.siam.org/doi/abs/10.1137/090752286). - -Broadly speaking, these methods are used to replace fully connected layers in -neural networks with Tensor-Train layers introduced by A. Novikov et. al. (2015) -in their paper (http://arxiv.org/abs/1509.06569). More details about each of -the methods are provided in each respective docstring. -""" - - -def init_tt_cores(inp_sizes, out_sizes, tt_ranks, seed=1234): - """ - Initialize randomized orthogonalized TT-cores. - - This method should be used when a TT-layer is trained from scratch. 
The - sizes of each of the cores are specified by the inp_sizes and out_sizes, and - the respective tt_ranks will dictate the ranks of each of the cores. Note - that a larger set of tt_ranks will result in slower computation but will - result in more accurate approximations. The size of the ith core is: - - tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]. - - Note that the following relationships of lengths of each input is expected: - - len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1. - - Args: - inp_sizes: list of the input dimensions of the respective cores - out_sizes: list of the output dimensions of the respective cores - tt_ranks: list of the ranks of the respective cores - seed: integer to seed the random number generator - - Returns: - cores: One-dimensional list of cores concatentated along an axis - """ - np.random.seed(seed) - - # Assert that the sizes of each input is correct - assert(len(inp_sizes) == len(out_sizes)), \ - "The number of input dimensions (" + str(len(inp_sizes)) + \ - ") must be equal to the number of output dimensions (" + \ - str(len(out_sizes)) + ")." - - assert(len(tt_ranks) == len(inp_sizes) + 1), \ - "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \ - "one more than the number of input and output dims (" + \ - str(len(out_sizes)) + ")." - - # Convert to numpy arrays - inp_sizes = np.array(inp_sizes) - out_sizes = np.array(out_sizes) - tt_ranks = np.array(tt_ranks) - - # Initialize the cores array - cores_len = np.sum( - inp_sizes * out_sizes * tt_ranks[1:] * tt_ranks[:-1]) - cores = np.zeros(cores_len) - cores_idx = 0 - rv = 1 - - # Compute the full list of cores by computing each individual one - for i in range(inp_sizes.shape[0]): - shape = [tt_ranks[i], - inp_sizes[i], - out_sizes[i], - tt_ranks[i + 1]] - - # Precompute the shape of each core - tall_shape = (np.prod(shape[:3]), shape[3]) - - # Randomly initialize the current core using a normal distribution - curr_core = np.dot(rv, np.random.normal( - 0, 1, size=(shape[0], np.prod(shape[1:])))) - curr_core = curr_core.reshape(tall_shape) - - # Orthogonalize the initialized current core and append to cores list - if i < inp_sizes.shape[0] - 1: - curr_core, rv = np.linalg.qr(curr_core) - cores[cores_idx:cores_idx + - curr_core.size] = curr_core.flatten() - cores_idx += curr_core.size - - # Normalize the list of arrays using this Glarot trick - glarot_style = (np.prod(inp_sizes) * - np.prod(tt_ranks))**(1.0 / inp_sizes.shape[0]) - - return (0.1 / glarot_style) * np.array(cores).astype(np.float32) - - -def matrix_to_tt(W, inp_sizes, out_sizes, tt_ranks): - """ - Convert a matrix into the TT-format. - - This method will consume a 2D weight matrix such as those used in fully - connected layers in a neural network and will compute the TT-decomposition - of the weight matrix and return the TT-cores of the resulting computation. - This method should be used when converting a trained, fully connected layer, - into a TT-layer for increased speed and decreased parameter size. The size - of the ith core is: - - tt_ranks[i] * inp_sizes[i] * out_sizes[i] * tt_ranks[i + 1]. - - Note that the following relationships of lengths of each input is expected: - - len(inp_sizes) == len(out_sizes) == len(tt_ranks) - 1. - - We also require that np.prod(inp_sizes) == W.shape[0] and that - np.prod(out_sizes) == W.shape[1]. 
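To make that size relation concrete, here is a small bookkeeping sketch using the same hypothetical shapes the deleted tt_core_test.py below exercises:

```python
import numpy as np

# A 256 -> 256 FC layer factored as 4*4*4*4 on both sides, with the
# "high ranks" profile [1, 16, 256, 16, 1] from the deleted test.
inp_sizes = np.array([4, 4, 4, 4])
out_sizes = np.array([4, 4, 4, 4])
tt_ranks = np.array([1, 16, 256, 16, 1])

fc_params = np.prod(inp_sizes) * np.prod(out_sizes)                       # 65536
tt_params = np.sum(tt_ranks[:-1] * inp_sizes * out_sizes * tt_ranks[1:])  # 131584
# Full ranks reproduce W essentially exactly but do not shrink it; picking
# lower ranks (down to [1, 1, 1, 1, 1] in the sparse test) trades accuracy
# for parameter count.
print(fc_params, tt_params)
```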
- - Args: - W: two-dimensional weight matrix numpy array representing a fully - connected layer to be converted to TT-format; note that the weight - matrix is transposed before decomposed because we want to emulate the - X * W^T operation that the FC layer performs. - inp_sizes: list of the input dimensions of the respective cores - out_sizes: list of the output dimensions of the respective cores - tt_ranks: list of the ranks of the respective cores - - Returns: - new_cores: One-dimensional list of cores concatentated along an axis - """ - - # Assert that the sizes of each input is correct - assert(len(inp_sizes) == len(out_sizes)), \ - "The number of input dimensions (" + str(len(inp_sizes)) + \ - ") must be equal to the number of output dimensions (" + \ - str(len(out_sizes)) + ")." - - assert(len(tt_ranks) == len(inp_sizes) + 1), \ - "The number of tt-ranks (" + str(len(tt_ranks)) + ") must be " + \ - "one more than the number of input and output dimensions (" + \ - str(len(out_sizes)) + ")." - - assert(W.shape[0] == np.prod(inp_sizes)), \ - "The product of the input sizes (" + str(np.prod(inp_sizes)) + \ - ") must be equal to first dimension of W (" + str(W.shape[0]) + ")." - - assert(W.shape[1] == np.prod(out_sizes)), \ - "The product of the output sizes (" + str(np.prod(out_sizes)) + \ - ") must be equal to second dimension of W (" + str(W.shape[1]) + ")." - - # W is transposed so that the multiplication X * W^T can be computed, just - # as it is in the FC layer. - W = W.transpose() - - # Convert to numpy arrays - inp_sizes = np.array(inp_sizes) - out_sizes = np.array(out_sizes) - tt_ranks = np.array(tt_ranks) - - # Copy the original weight matrix in order to permute and reshape the weight - # matrix. In addition, the inp_sizes and out_sizes are combined to a single - # sizes array to use the tt_svd helper method, which only consumes a single - # sizes array. - W_copy = W.copy() - total_inp_size = inp_sizes.size - W_copy = np.reshape(W_copy, np.concatenate((inp_sizes, out_sizes))) - order = np.repeat(np.arange(0, total_inp_size), 2) + \ - np.tile([0, total_inp_size], total_inp_size) - W_copy = np.transpose(W_copy, axes=order) - W_copy = np.reshape(W_copy, inp_sizes * out_sizes) - - # Use helper method to convert the W matrix copy into the preliminary - # cores array. - cores = tt_svd(W_copy, inp_sizes * out_sizes, tt_ranks) - - # Permute the dimensions of each of the cores to be compatible with the - # TT-layer. - new_cores = np.zeros(cores.shape).astype(np.float32) - idx = 0 - for i in range(len(inp_sizes)): - shape = (tt_ranks[i], inp_sizes[i], out_sizes[i], tt_ranks[i + 1]) - current_core = cores[idx:idx + np.prod(shape)].reshape(shape) - current_core = current_core.transpose((1, 3, 0, 2)) - new_cores[new_cores.shape[0] - idx - np.prod(shape): - new_cores.shape[0] - idx] \ - = current_core.flatten() - idx += np.prod(shape) - - return new_cores - - -def tt_svd(W, sizes, tt_ranks): - """ - Helper method for the matrix_to_tt() method performing the TT-SVD - decomposition. - - Uses the TT-decomposition algorithm to convert a matrix to TT-format using - multiple reduced SVD operations. - - Args: - W: two-dimensional weight matrix representing a fully connected layer to - be converted to TT-format preprocessed by the matrix_to_tt() method. 
- sizes: list of the dimensions of each of the cores - tt_ranks: list of the ranks of the respective cores - - Returns: - cores: One-dimensional list of cores concatentated along an axis - """ - - assert(len(tt_ranks) == len(sizes) + 1) - - C = W.copy() - total_size = sizes.size - core = np.zeros(np.sum(tt_ranks[:-1] * sizes * tt_ranks[1:]), - dtype='float32') - - # Compute iterative reduced SVD operations and store each resulting U matrix - # as an individual core. - pos = 0 - for i in range(0, total_size - 1): - shape = tt_ranks[i] * sizes[i] - C = np.reshape(C, [shape, -1]) - U, S, V = np.linalg.svd(C, full_matrices=False) - U = U[:, 0:tt_ranks[i + 1]] - S = S[0:tt_ranks[i + 1]] - V = V[0:tt_ranks[i + 1], :] - - core[pos:pos + tt_ranks[i] * sizes[i] * tt_ranks[i + 1]] = U.ravel() - pos += tt_ranks[i] * sizes[i] * tt_ranks[i + 1] - C = np.dot(np.diag(S), V) - - core[pos:pos + tt_ranks[total_size - 1] * - sizes[total_size - 1] * tt_ranks[total_size]] = C.ravel() - return core - - -# TODO(Surya) Write a method to convert an entire network where all fully -# connected layers are replaced by an TT layer. -def fc_net_to_tt_net(net): - pass diff --git a/caffe2/python/tt_core_test.py b/caffe2/python/tt_core_test.py deleted file mode 100644 index 7c12fc7aaeb8..000000000000 --- a/caffe2/python/tt_core_test.py +++ /dev/null @@ -1,82 +0,0 @@ - - - - - -import numpy as np -import unittest - -from caffe2.python import core, workspace, tt_core -import caffe2.python.hypothesis_test_util as hu - - -class TestTTSVD(hu.HypothesisTestCase): - def test_full_tt_svd(self): - size = 256 - np.random.seed(1234) - X = np.expand_dims( - np.random.rand(size).astype(np.float32), axis=0) - W = np.random.rand(size, size).astype(np.float32) - b = np.zeros(size).astype(np.float32) - inp_sizes = [4, 4, 4, 4] - out_sizes = [4, 4, 4, 4] - - op_fc = core.CreateOperator( - "FC", - ["X", "W", "b"], - ["Y"], - ) - workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - workspace.FeedBlob("b", b) - workspace.RunOperatorOnce(op_fc) - Y_fc = workspace.FetchBlob("Y").flatten() - - # Testing TT-decomposition with high ranks - full_tt_ranks = [1, 16, 256, 16, 1] - full_cores = tt_core.matrix_to_tt(W, inp_sizes, out_sizes, - full_tt_ranks) - - full_op_tt = core.CreateOperator( - "TT", - ["X", "b", "cores"], - ["Y"], - inp_sizes=inp_sizes, - out_sizes=out_sizes, - tt_ranks=full_tt_ranks, - ) - workspace.FeedBlob("X", X) - workspace.FeedBlob("b", b) - workspace.FeedBlob("cores", full_cores) - workspace.RunOperatorOnce(full_op_tt) - Y_full_tt = workspace.FetchBlob("Y").flatten() - - assert(len(Y_fc) == len(Y_full_tt)) - self.assertAlmostEqual(np.linalg.norm(Y_fc - Y_full_tt), 0, delta=1e-3) - - # Testing TT-decomposition with minimal ranks - sparse_tt_ranks = [1, 1, 1, 1, 1] - sparse_cores = tt_core.matrix_to_tt(W, inp_sizes, out_sizes, - sparse_tt_ranks) - - sparse_op_tt = core.CreateOperator( - "TT", - ["X", "b", "cores"], - ["Y"], - inp_sizes=inp_sizes, - out_sizes=out_sizes, - tt_ranks=sparse_tt_ranks, - ) - workspace.FeedBlob("X", X) - workspace.FeedBlob("b", b) - workspace.FeedBlob("cores", sparse_cores) - workspace.RunOperatorOnce(sparse_op_tt) - Y_sparse_tt = workspace.FetchBlob("Y").flatten() - - assert(len(Y_fc) == len(Y_sparse_tt)) - self.assertAlmostEqual(np.linalg.norm(Y_fc - Y_sparse_tt), - 39.974, delta=1e-3) - - -if __name__ == '__main__': - unittest.main() diff --git a/caffe2/python/utils.py b/caffe2/python/utils.py deleted file mode 100644 index d0a47f0c67be..000000000000 --- a/caffe2/python/utils.py +++ /dev/null @@ 
-1,429 +0,0 @@ -# @package utils -# Module caffe2.python.utils - - - - - -from caffe2.proto import caffe2_pb2 -from google.protobuf.message import DecodeError, Message -from google.protobuf import text_format - -import sys -import collections -import copy -import functools -import numpy as np - -OPTIMIZER_ITERATION_NAME = "optimizer_iteration" -OPTIMIZER_ITERATION_LR_NAME = "optimizer_iteration_lr" -ITERATION_MUTEX_NAME = "iteration_mutex" -ITERATION_MUTEX_LR_NAME = "iteration_mutex_lr" - - -def OpAlmostEqual(op_a, op_b, ignore_fields=None): - ''' - Two ops are identical except for each field in the `ignore_fields`. - ''' - ignore_fields = ignore_fields or [] - if not isinstance(ignore_fields, list): - ignore_fields = [ignore_fields] - - assert all(isinstance(f, str) for f in ignore_fields), ( - 'Expect each field is text type, but got {}'.format(ignore_fields)) - - def clean_op(op): - op = copy.deepcopy(op) - for field in ignore_fields: - if op.HasField(field): - op.ClearField(field) - return op - - op_a = clean_op(op_a) - op_b = clean_op(op_b) - return op_a == op_b or str(op_a) == str(op_b) - - -def CaffeBlobToNumpyArray(blob): - if (blob.num != 0): - # old style caffe blob. - return (np.asarray(blob.data, dtype=np.float32) - .reshape(blob.num, blob.channels, blob.height, blob.width)) - else: - # new style caffe blob. - return (np.asarray(blob.data, dtype=np.float32) - .reshape(blob.shape.dim)) - - -def Caffe2TensorToNumpyArray(tensor): - if tensor.data_type == caffe2_pb2.TensorProto.FLOAT: - return np.asarray( - tensor.float_data, dtype=np.float32).reshape(tensor.dims) - elif tensor.data_type == caffe2_pb2.TensorProto.DOUBLE: - return np.asarray( - tensor.double_data, dtype=np.float64).reshape(tensor.dims) - elif tensor.data_type == caffe2_pb2.TensorProto.INT64: - return np.asarray( - tensor.int64_data, dtype=np.int64).reshape(tensor.dims) - elif tensor.data_type == caffe2_pb2.TensorProto.INT32: - return np.asarray( - tensor.int32_data, dtype=int).reshape(tensor.dims) # pb.INT32=>int use int32_data - elif tensor.data_type == caffe2_pb2.TensorProto.INT16: - return np.asarray( - tensor.int32_data, dtype=np.int16).reshape(tensor.dims) # pb.INT16=>np.int16 use int32_data - elif tensor.data_type == caffe2_pb2.TensorProto.UINT16: - return np.asarray( - tensor.int32_data, dtype=np.uint16).reshape(tensor.dims) # pb.UINT16=>np.uint16 use int32_data - elif tensor.data_type == caffe2_pb2.TensorProto.INT8: - return np.asarray( - tensor.int32_data, dtype=np.int8).reshape(tensor.dims) # pb.INT8=>np.int8 use int32_data - elif tensor.data_type == caffe2_pb2.TensorProto.UINT8: - return np.asarray( - tensor.int32_data, dtype=np.uint8).reshape(tensor.dims) # pb.UINT8=>np.uint8 use int32_data - else: - # TODO: complete the data type: bool, float16, byte, int64, string - raise RuntimeError( - "Tensor data type not supported yet: " + str(tensor.data_type)) - - -def NumpyArrayToCaffe2Tensor(arr, name=None): - tensor = caffe2_pb2.TensorProto() - tensor.dims.extend(arr.shape) - if name: - tensor.name = name - if arr.dtype == np.float32: - tensor.data_type = caffe2_pb2.TensorProto.FLOAT - tensor.float_data.extend(list(arr.flatten().astype(float))) - elif arr.dtype == np.float64: - tensor.data_type = caffe2_pb2.TensorProto.DOUBLE - tensor.double_data.extend(list(arr.flatten().astype(np.float64))) - elif arr.dtype == np.int64: - tensor.data_type = caffe2_pb2.TensorProto.INT64 - tensor.int64_data.extend(list(arr.flatten().astype(np.int64))) - elif arr.dtype == int or arr.dtype == np.int32: - tensor.data_type = 
caffe2_pb2.TensorProto.INT32 - tensor.int32_data.extend(arr.flatten().astype(int).tolist()) - elif arr.dtype == np.int16: - tensor.data_type = caffe2_pb2.TensorProto.INT16 - tensor.int32_data.extend(list(arr.flatten().astype(np.int16))) # np.int16=>pb.INT16 use int32_data - elif arr.dtype == np.uint16: - tensor.data_type = caffe2_pb2.TensorProto.UINT16 - tensor.int32_data.extend(list(arr.flatten().astype(np.uint16))) # np.uint16=>pb.UNIT16 use int32_data - elif arr.dtype == np.int8: - tensor.data_type = caffe2_pb2.TensorProto.INT8 - tensor.int32_data.extend(list(arr.flatten().astype(np.int8))) # np.int8=>pb.INT8 use int32_data - elif arr.dtype == np.uint8: - tensor.data_type = caffe2_pb2.TensorProto.UINT8 - tensor.int32_data.extend(list(arr.flatten().astype(np.uint8))) # np.uint8=>pb.UNIT8 use int32_data - else: - # TODO: complete the data type: bool, float16, byte, string - raise RuntimeError( - "Numpy data type not supported yet: " + str(arr.dtype)) - return tensor - - -def MakeArgument(key, value): - """Makes an argument based on the value type.""" - argument = caffe2_pb2.Argument() - argument.name = key - iterable = isinstance(value, collections.abc.Iterable) - - # Fast tracking common use case where a float32 array of tensor parameters - # needs to be serialized. The entire array is guaranteed to have the same - # dtype, so no per-element checking necessary and no need to convert each - # element separately. - if isinstance(value, np.ndarray) and value.dtype.type is np.float32: - argument.floats.extend(value.flatten().tolist()) - return argument - - if isinstance(value, np.ndarray): - value = value.flatten().tolist() - elif isinstance(value, np.generic): - # convert numpy scalar to native python type - value = value.item() - - if type(value) is float: - argument.f = value - elif type(value) in [bool, int]: - # We make a relaxation that a boolean variable will also be stored as - # int. - argument.i = value - elif isinstance(value, bytes): - argument.s = value - elif isinstance(value, str): - argument.s = value.encode('utf-8') - elif isinstance(value, caffe2_pb2.NetDef): - argument.n.CopyFrom(value) - elif isinstance(value, Message): - argument.s = value.SerializeToString() - elif iterable and all(type(v) in [float, np.float_] for v in value): - argument.floats.extend( - v.item() if type(v) is np.float_ else v for v in value - ) - elif iterable and all( - type(v) in [bool, int, np.int_] for v in value - ): - argument.ints.extend( - v.item() if type(v) is np.int_ else v for v in value - ) - elif iterable and all( - isinstance(v, bytes) or isinstance(v, str) for v in value - ): - argument.strings.extend( - v.encode('utf-8') if isinstance(v, str) else v - for v in value - ) - elif iterable and all(isinstance(v, caffe2_pb2.NetDef) for v in value): - argument.nets.extend(value) - elif iterable and all(isinstance(v, Message) for v in value): - argument.strings.extend(v.SerializeToString() for v in value) - else: - if iterable: - raise ValueError( - "Unknown iterable argument type: key={} value={}, value " - "type={}[{}]".format( - key, value, type(value), set(type(v) for v in value) - ) - ) - else: - raise ValueError( - "Unknown argument type: key={} value={}, value type={}".format( - key, value, type(value) - ) - ) - return argument - - -def TryReadProtoWithClass(cls, s): - """Reads a protobuffer with the given proto class. - - Inputs: - cls: a protobuffer class. - s: a string of either binary or text protobuffer content. 
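As a short illustration of the text-first, binary-fallback parse described here, a sketch with a hypothetical single-op net:

```python
from caffe2.proto import caffe2_pb2
from caffe2.python import utils

# A hypothetical single-op NetDef in text protobuf form; if text parsing
# failed, TryReadProtoWithClass would fall back to binary ParseFromString.
txt = 'name: "n" op { type: "Relu" input: "X" output: "Y" }'
net = utils.TryReadProtoWithClass(caffe2_pb2.NetDef, txt)
assert net.op[0].type == "Relu"
```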
- - Outputs: - proto: the protobuffer of cls - - Throws: - google.protobuf.message.DecodeError: if we cannot decode the message. - """ - obj = cls() - try: - text_format.Parse(s, obj) - return obj - except (text_format.ParseError, UnicodeDecodeError): - obj.ParseFromString(s) - return obj - - -def GetContentFromProto(obj, function_map): - """Gets a specific field from a protocol buffer that matches the given class - """ - for cls, func in function_map.items(): - if type(obj) is cls: - return func(obj) - - -def GetContentFromProtoString(s, function_map): - for cls, func in function_map.items(): - try: - obj = TryReadProtoWithClass(cls, s) - return func(obj) - except DecodeError: - continue - else: - raise DecodeError("Cannot find a fit protobuffer class.") - - -def ConvertProtoToBinary(proto_class, filename, out_filename): - """Convert a text file of the given protobuf class to binary.""" - with open(filename) as f: - proto = TryReadProtoWithClass(proto_class, f.read()) - with open(out_filename, 'w') as fid: - fid.write(proto.SerializeToString()) - - -def GetGPUMemoryUsageStats(): - """Get GPU memory usage stats from CUDAContext/HIPContext. This requires flag - --caffe2_gpu_memory_tracking to be enabled""" - from caffe2.python import workspace, core - workspace.RunOperatorOnce( - core.CreateOperator( - "GetGPUMemoryUsage", - [], - ["____mem____"], - device_option=core.DeviceOption(workspace.GpuDeviceType, 0), - ), - ) - b = workspace.FetchBlob("____mem____") - return { - 'total_by_gpu': b[0, :], - 'max_by_gpu': b[1, :], - 'total': np.sum(b[0, :]), - 'max_total': np.sum(b[1, :]) - } - - -def ResetBlobs(blobs): - from caffe2.python import workspace, core - workspace.RunOperatorOnce( - core.CreateOperator( - "Free", - list(blobs), - list(blobs), - device_option=core.DeviceOption(caffe2_pb2.CPU), - ), - ) - - -class DebugMode: - ''' - This class allows to drop you into an interactive debugger - if there is an unhandled exception in your python script - - Example of usage: - - def main(): - # your code here - pass - - if __name__ == '__main__': - from caffe2.python.utils import DebugMode - DebugMode.run(main) - ''' - - @classmethod - def run(cls, func): - try: - return func() - except KeyboardInterrupt: - raise - except Exception: - import pdb - - print( - 'Entering interactive debugger. Type "bt" to print ' - 'the full stacktrace. Type "help" to see command listing.') - print(sys.exc_info()[1]) - print - - pdb.post_mortem() - sys.exit(1) - raise - - -def raiseIfNotEqual(a, b, msg): - if a != b: - raise Exception("{}. {} != {}".format(msg, a, b)) - - -def debug(f): - ''' - Use this method to decorate your function with DebugMode's functionality - - Example: - - @debug - def test_foo(self): - raise Exception("Bar") - - ''' - - @functools.wraps(f) - def wrapper(*args, **kwargs): - def func(): - return f(*args, **kwargs) - return DebugMode.run(func) - - return wrapper - - -def BuildUniqueMutexIter( - init_net, - net, - iter=None, - iter_mutex=None, - iter_val=0 -): - ''' - Often, a mutex guarded iteration counter is needed. This function creates a - mutex iter in the net uniquely (if the iter already existing, it does - nothing) - - This function returns the iter blob - ''' - iter = iter if iter is not None else OPTIMIZER_ITERATION_NAME - iter_mutex = iter_mutex if iter_mutex is not None else ITERATION_MUTEX_NAME - from caffe2.python import core - if not init_net.BlobIsDefined(iter): - # Add training operators. 
- with core.DeviceScope( - core.DeviceOption(caffe2_pb2.CPU, - extra_info=["device_type_override:cpu"]) - ): - iteration = init_net.ConstantFill( - [], - iter, - shape=[1], - value=iter_val, - dtype=core.DataType.INT64, - ) - iter_mutex = init_net.CreateMutex([], [iter_mutex]) - net.AtomicIter([iter_mutex, iteration], [iteration]) - else: - iteration = init_net.GetBlobRef(iter) - return iteration - - -def EnumClassKeyVals(cls): - # cls can only be derived from object - assert type(cls) == type - # Enum attribute keys are all capitalized and values are strings - enum = {} - for k in dir(cls): - if k == k.upper(): - v = getattr(cls, k) - if isinstance(v, str): - assert v not in enum.values(), ( - "Failed to resolve {} as Enum: " - "duplicate entries {}={}, {}={}".format( - cls, k, v, [key for key in enum if enum[key] == v][0], v - ) - ) - enum[k] = v - return enum - - -def ArgsToDict(args): - """ - Convert a list of arguments to a name, value dictionary. Assumes that - each argument has a name. Otherwise, the argument is skipped. - """ - ans = {} - for arg in args: - if not arg.HasField("name"): - continue - for d in arg.DESCRIPTOR.fields: - if d.name == "name": - continue - if d.label == d.LABEL_OPTIONAL and arg.HasField(d.name): - ans[arg.name] = getattr(arg, d.name) - break - elif d.label == d.LABEL_REPEATED: - list_ = getattr(arg, d.name) - if len(list_) > 0: - ans[arg.name] = list_ - break - else: - ans[arg.name] = None - return ans - - -def NHWC2NCHW(tensor): - assert tensor.ndim >= 1 - return tensor.transpose((0, tensor.ndim - 1) + tuple(range(1, tensor.ndim - 1))) - - -def NCHW2NHWC(tensor): - assert tensor.ndim >= 2 - return tensor.transpose((0,) + tuple(range(2, tensor.ndim)) + (1,)) diff --git a/caffe2/python/utils_test.py b/caffe2/python/utils_test.py deleted file mode 100644 index ef809bfd8154..000000000000 --- a/caffe2/python/utils_test.py +++ /dev/null @@ -1,40 +0,0 @@ - - - - - -from caffe2.python import core, utils, test_util - -import numpy as np - - -class TestUtils(test_util.TestCase): - def testArgsToDict(self): - args = [utils.MakeArgument("int1", 3), - utils.MakeArgument("float1", 4.0), - utils.MakeArgument("string1", "foo"), - utils.MakeArgument("intlist1", np.array([3, 4])), - utils.MakeArgument("floatlist1", np.array([5.0, 6.0])), - utils.MakeArgument("stringlist1", np.array(["foo", "bar"]))] - dict_ = utils.ArgsToDict(args) - expected = {"int1" : 3, - "float1" : 4.0, - "string1" : b"foo", - "intlist1" : [3, 4], - "floatlist1" : [5.0, 6.0], - "stringlist1" : [b"foo", b"bar"]} - self.assertEqual(dict_, expected, "dictionary version of arguments " - "doesn't match original") - - def testBuildUniqueMutexIter(self): - init_net = core.Net("init_net") - net = core.Net("net") - utils.BuildUniqueMutexIter(init_net, net) - - for op in init_net.Proto().op: - self.assertEqual(op.device_option.extra_info[0], - "device_type_override:cpu") - - for op in net.Proto().op: - self.assertEqual(op.device_option.extra_info[0], - "device_type_override:cpu") diff --git a/caffe2/python/visualize.py b/caffe2/python/visualize.py deleted file mode 100644 index 92190d1e62a0..000000000000 --- a/caffe2/python/visualize.py +++ /dev/null @@ -1,175 +0,0 @@ -## @package visualize -# Module caffe2.python.visualize -"""Functions that could be used to visualize Tensors. - -This is adapted from the old-time iceberk package that Yangqing wrote... Oh gold -memories. Before decaf and caffe. Why iceberk? Because I was at Berkeley, -bears are vegetarian, and iceberg lettuce has layers of leaves. 
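Both the layout helpers deleted above and the NHWC/NCHW visualizer classes below hinge on the same axis permutation; a minimal pure-numpy sketch mirroring the deleted NHWC2NCHW and NCHW2NHWC:

```python
import numpy as np

def nhwc2nchw(t):
    # Move the trailing channel axis to position 1: (N, ..., C) -> (N, C, ...).
    return t.transpose((0, t.ndim - 1) + tuple(range(1, t.ndim - 1)))

def nchw2nhwc(t):
    # Inverse permutation: (N, C, ...) -> (N, ..., C).
    return t.transpose((0,) + tuple(range(2, t.ndim)) + (1,))

x = np.zeros((8, 32, 32, 3), dtype=np.float32)
assert nhwc2nchw(x).shape == (8, 3, 32, 32)
assert nchw2nhwc(nhwc2nchw(x)).shape == x.shape
```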
- -(This joke is so lame.) -""" - -import numpy as np -from matplotlib import cm, pyplot - - -def ChannelFirst(arr): - """Convert a HWC array to CHW.""" - ndim = arr.ndim - return arr.swapaxes(ndim - 1, ndim - 2).swapaxes(ndim - 2, ndim - 3) - - -def ChannelLast(arr): - """Convert a CHW array to HWC.""" - ndim = arr.ndim - return arr.swapaxes(ndim - 3, ndim - 2).swapaxes(ndim - 2, ndim - 1) - - -class PatchVisualizer: - """PatchVisualizer visualizes patches. - """ - - def __init__(self, gap=1): - self.gap = gap - - def ShowSingle(self, patch, cmap=None): - """Visualizes one single patch. - - The input patch could be a vector (in which case we try to infer the shape - of the patch), a 2-D matrix, or a 3-D matrix whose 3rd dimension has 3 - channels. - """ - if len(patch.shape) == 1: - patch = patch.reshape(self.get_patch_shape(patch)) - elif len(patch.shape) > 2 and patch.shape[2] != 3: - raise ValueError("The input patch shape isn't correct.") - # determine color - if len(patch.shape) == 2 and cmap is None: - cmap = cm.gray - pyplot.imshow(patch, cmap=cmap) - return patch - - def ShowMultiple(self, patches, ncols=None, cmap=None, bg_func=np.mean): - """Visualize multiple patches. - - In the passed in patches matrix, each row is a patch, in the shape of either - n*n, n*n*1 or n*n*3, either in a flattened format (so patches would be a - 2-D array), or a multi-dimensional tensor. We will try our best to figure - out automatically the patch size. - """ - num_patches = patches.shape[0] - if ncols is None: - ncols = int(np.ceil(np.sqrt(num_patches))) - nrows = int(np.ceil(num_patches / float(ncols))) - if len(patches.shape) == 2: - patches = patches.reshape( - (patches.shape[0], ) + self.get_patch_shape(patches[0]) - ) - patch_size_expand = np.array(patches.shape[1:3]) + self.gap - image_size = patch_size_expand * np.array([nrows, ncols]) - self.gap - if len(patches.shape) == 4: - if patches.shape[3] == 1: - # gray patches - patches = patches.reshape(patches.shape[:-1]) - image_shape = tuple(image_size) - if cmap is None: - cmap = cm.gray - elif patches.shape[3] == 3: - # color patches - image_shape = tuple(image_size) + (3, ) - else: - raise ValueError("The input patch shape isn't expected.") - else: - image_shape = tuple(image_size) - if cmap is None: - cmap = cm.gray - image = np.ones(image_shape) * bg_func(patches) - for pid in range(num_patches): - row = pid // ncols * patch_size_expand[0] - col = pid % ncols * patch_size_expand[1] - image[row:row+patches.shape[1], col:col+patches.shape[2]] = \ - patches[pid] - pyplot.imshow(image, cmap=cmap, interpolation='nearest') - pyplot.axis('off') - return image - - def ShowImages(self, patches, *args, **kwargs): - """Similar to ShowMultiple, but always normalize the values between 0 and 1 - for better visualization of image-type data. - """ - patches = patches - np.min(patches) - patches /= np.max(patches) + np.finfo(np.float64).eps - return self.ShowMultiple(patches, *args, **kwargs) - - def ShowChannels(self, patch, cmap=None, bg_func=np.mean): - """ This function shows the channels of a patch. - - The incoming patch should have shape [w, h, num_channels], and each channel - will be visualized as a separate gray patch. - """ - if len(patch.shape) != 3: - raise ValueError("The input patch shape isn't correct.") - patch_reordered = np.swapaxes(patch.T, 1, 2) - return self.ShowMultiple(patch_reordered, cmap=cmap, bg_func=bg_func) - - def get_patch_shape(self, patch): - """Gets the shape of a single patch. 
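A hedged usage sketch of the visualizer this hunk deletes (it only runs against a pre-removal checkout and assumes an interactive matplotlib backend): tile a batch of gray patches into a single grid image.

```python
import numpy as np
from matplotlib import pyplot

from caffe2.python.visualize import PatchVisualizer  # removed by this diff

patches = np.random.rand(16, 8, 8)  # 16 gray 8x8 patches
viz = PatchVisualizer(gap=2)        # 2-pixel gap between tiles
viz.ShowMultiple(patches)           # ncols defaults to ceil(sqrt(16)) == 4
pyplot.show()
```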
- - Basically it tries to interpret the patch as a square, and also check if it - is in color (3 channels) - """ - edgeLen = np.sqrt(patch.size) - if edgeLen != np.floor(edgeLen): - # we are given color patches - edgeLen = np.sqrt(patch.size / 3.) - if edgeLen != np.floor(edgeLen): - raise ValueError("I can't figure out the patch shape.") - return (edgeLen, edgeLen, 3) - else: - edgeLen = int(edgeLen) - return (edgeLen, edgeLen) - - -_default_visualizer = PatchVisualizer() -"""Utility functions that directly point to functions in the default visualizer. - -These functions don't return anything, so you won't see annoying printouts of -the visualized images. If you want to save the images for example, you should -explicitly instantiate a patch visualizer, and call those functions. -""" - - -class NHWC: - @staticmethod - def ShowSingle(*args, **kwargs): - _default_visualizer.ShowSingle(*args, **kwargs) - - @staticmethod - def ShowMultiple(*args, **kwargs): - _default_visualizer.ShowMultiple(*args, **kwargs) - - @staticmethod - def ShowImages(*args, **kwargs): - _default_visualizer.ShowImages(*args, **kwargs) - - @staticmethod - def ShowChannels(*args, **kwargs): - _default_visualizer.ShowChannels(*args, **kwargs) - - -class NCHW: - @staticmethod - def ShowSingle(patch, *args, **kwargs): - _default_visualizer.ShowSingle(ChannelLast(patch), *args, **kwargs) - - @staticmethod - def ShowMultiple(patch, *args, **kwargs): - _default_visualizer.ShowMultiple(ChannelLast(patch), *args, **kwargs) - - @staticmethod - def ShowImages(patch, *args, **kwargs): - _default_visualizer.ShowImages(ChannelLast(patch), *args, **kwargs) - - @staticmethod - def ShowChannels(patch, *args, **kwargs): - _default_visualizer.ShowChannels(ChannelLast(patch), *args, **kwargs) diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py deleted file mode 100644 index b4f47896b2ff..000000000000 --- a/caffe2/python/workspace.py +++ /dev/null @@ -1,783 +0,0 @@ -## @package workspace -# Module caffe2.python.workspace - - - - -import collections -import contextlib -from google.protobuf.message import Message -from multiprocessing import Process -import os -from collections import defaultdict -import logging -import numpy as np -from past.builtins import basestring -import shutil -import socket -import tempfile - -from caffe2.proto import caffe2_pb2 -from caffe2.python import scope, utils -from caffe2.python.lazy import TriggerLazyImport - -import caffe2.python._import_c_extension as C - -logger = logging.getLogger(__name__) - -Blobs = C.blobs -ResetBlob = C.reset_blob -CreateBlob = C.create_blob -CurrentWorkspace = C.current_workspace -DeserializeBlob = C.deserialize_blob -GlobalInit = C.global_init -HasBlob = C.has_blob -RegisteredOperators = C.registered_operators -SerializeBlob = C.serialize_blob -SwitchWorkspace = C.switch_workspace -RootFolder = C.root_folder -Workspaces = C.workspaces -BenchmarkNet = C.benchmark_net -BenchmarkNetOnce = C.benchmark_net_once -GetStats = C.get_stats -CreateOfflineTensor = C.create_offline_tensor - -operator_tracebacks = defaultdict(dict) - -is_asan = C.is_asan -has_fbgemm = C.has_fbgemm -has_cuda_support = C.has_cuda_support -has_hip_support = C.has_hip_support -has_gpu_support = C.has_gpu_support -if has_cuda_support: - GpuDeviceType = caffe2_pb2.CUDA - NumCudaDevices = C.num_cuda_devices - # This is a duplicate of NumCudaDevices. 
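The module-level aliases above make device introspection a one-liner, and the CPU-only stubs defined just below keep the calls total; a short sketch against the pre-removal `workspace` module:

```python
from caffe2.python import workspace

if workspace.has_gpu_support:
    print("visible GPUs:", workspace.NumGpuDevices())
    print("peer-access matrix:\n", workspace.GetGpuPeerAccessPattern())
else:
    # CPU-only builds fall back to the lambda stubs, so this stays safe.
    print("no GPU support; NumGpuDevices() ->", workspace.NumGpuDevices())
```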
Remove - # NumCudaDevices once replaced everywhere in the code - NumGpuDevices = C.num_cuda_devices - GetCUDAVersion = C.get_cuda_version - GetCuDNNVersion = C.get_cudnn_version - - def GetGpuPeerAccessPattern(): - return np.asarray(C.get_cuda_peer_access_pattern()) - - GetDeviceProperties = C.get_device_properties - GetGPUMemoryInfo = C.get_gpu_memory_info -else: - # pyre-fixme[9]: incompatible type assignment - NumCudaDevices = lambda: 0 # noqa - # pyre-fixme[9]: incompatible type assignment - GetCUDAVersion = lambda: 0 # noqa - # pyre-fixme[9]: incompatible type assignment - GetCuDNNVersion = lambda: 0 # noqa - -if has_hip_support: - GpuDeviceType = caffe2_pb2.HIP - # pyre-fixme[9]: incompatible type assignment - NumGpuDevices = C.num_hip_devices - GetHIPVersion = C.get_hip_version - - def GetGpuPeerAccessPattern(): - return np.asarray(C.get_hip_peer_access_pattern()) - GetDeviceProperties = C.get_device_properties - GetGPUMemoryInfo = C.get_gpu_memory_info - -if not has_gpu_support: - # setting cuda as the default GpuDeviceType as some tests - # like core, scope tests use GpuDeviceType even without gpu support - GpuDeviceType = caffe2_pb2.CUDA - # pyre-fixme[9]: incompatible type assignment - NumGpuDevices = lambda: 0 # noqa - GetDeviceProperties = lambda x: None # noqa - GetGpuPeerAccessPattern = lambda: np.array([]) # noqa - # pyre-fixme[9]: incompatible type assignment - GetGPUMemoryInfo = lambda: None # noqa - -IsNUMAEnabled = C.is_numa_enabled -GetNumNUMANodes = C.get_num_numa_nodes -GetBlobNUMANode = C.get_blob_numa_node -GetBlobSizeBytes = C.get_blob_size_bytes - - -def FillRandomNetworkInputs(net, input_dims, input_types): - C.fill_random_network_inputs(net.Proto().SerializeToString(), input_dims, input_types) - - -def _GetFreeFlaskPort(): - """Get a free flask port.""" - # We will prefer to use 5000. If not, we will then pick a random port. - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - result = sock.connect_ex(('127.0.0.1', 5000)) - if result == 0: - return 5000 - else: - s = socket.socket() - s.bind(('', 0)) - port = s.getsockname()[1] - s.close() - # Race condition: between the interval we close the socket and actually - # start a mint process, another process might have occupied the port. We - # don't do much here as this is mostly for convenience in research - # rather than 24x7 service. - return port - -def StartMint(root_folder=None, port=None): - """Start a mint instance. - - TODO(Yangqing): this does not work well under ipython yet. According to - https://github.com/ipython/ipython/issues/5862 - writing up some fix is a todo item. - """ - from caffe2.python.mint import app - if root_folder is None: - # Get the root folder from the current workspace - root_folder = C.root_folder() - if port is None: - port = _GetFreeFlaskPort() - process = Process( - target=app.main, - args=( - ['-p', str(port), '-r', root_folder], - ) - ) - process.start() - print('Mint running at http://{}:{}'.format(socket.getfqdn(), port)) - return process - - -def StringifyProto(obj): - """Stringify a protocol buffer object. - - Inputs: - obj: a protocol buffer object, or a Pycaffe2 object that has a Proto() - function. - Outputs: - string: the output protobuf string. - Raises: - AttributeError: if the passed in object does not have the right attribute. - """ - if isinstance(obj, basestring): - return obj - else: - if isinstance(obj, Message): - # First, see if this object is a protocol buffer, which we can - # simply serialize with the SerializeToString() call. 
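One nit in `_GetFreeFlaskPort` above: `connect_ex` returns 0 when the connection succeeds, i.e. when something is already listening, so the deleted helper appears to report port 5000 as free exactly when it is busy. A corrected standalone sketch:

```python
import socket

def get_free_port(preferred=5000):
    # A nonzero connect_ex result means nothing accepted the connection,
    # so the preferred port looks free.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        if probe.connect_ex(("127.0.0.1", preferred)) != 0:
            return preferred
    # Otherwise let the OS hand out an ephemeral port. Still racy: another
    # process may bind it between this probe and the eventual listen().
    with socket.socket() as s:
        s.bind(("", 0))
        return s.getsockname()[1]
```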
- return obj.SerializeToString() - elif hasattr(obj, 'Proto'): - return obj.Proto().SerializeToString() - else: - raise ValueError("Unexpected argument to StringifyProto of type " + - type(obj).__name__) - - -def ResetWorkspace(root_folder=None): - if root_folder is None: - # Reset the workspace, but keep the current root folder setting. - return C.reset_workspace(C.root_folder()) - else: - if not os.path.exists(root_folder): - os.makedirs(root_folder) - return C.reset_workspace(root_folder) - - -def CreateNet(net, overwrite=False, input_blobs=None): - TriggerLazyImport() - if input_blobs is None: - input_blobs = [] - for input_blob in input_blobs: - C.create_blob(input_blob) - return CallWithExceptionIntercept( - C.create_net, - C.Workspace.current._last_failed_op_net_position, - GetNetName(net), - StringifyProto(net), overwrite, - ) - - -def Predictor(init_net, predict_net): - return C.Predictor(StringifyProto(init_net), StringifyProto(predict_net)) - - -def GetOperatorCost(operator, blobs): - return C.get_operator_cost(StringifyProto(operator), blobs) - - -def RunOperatorOnce(operator): - return C.run_operator_once(StringifyProto(operator)) - - -def RunOperatorMultiple(operator, num_runs): - return C.run_operator_multiple(StringifyProto(operator), num_runs) - - -def RunOperatorsOnce(operators): - for op in operators: - success = RunOperatorOnce(op) - if not success: - return False - return True - - -def ClearGlobalNetObserver(): - return C.clear_global_net_observer() - - -def CallWithExceptionIntercept(func, op_id_fetcher, net_name, *args, **kwargs): - try: - return func(*args, **kwargs) - except Exception: - op_id = op_id_fetcher() - net_tracebacks = operator_tracebacks.get(net_name, None) - logger.warning( - 'Original python traceback for operator `{}` in network ' - '`{}` in exception above (most recent call last):'.format( - op_id, net_name)) - if net_tracebacks and op_id in net_tracebacks: - tb = net_tracebacks[op_id] - for line in reversed(tb): - logger.warning(' File "{}", line {}, in {}'.format( - line[0], line[1], line[2])) - raise - - -def RunNetOnce(net): - return CallWithExceptionIntercept( - C.run_net_once, - C.Workspace.current._last_failed_op_net_position, - GetNetName(net), - StringifyProto(net), - ) - - -def RunNet(name, num_iter=1, allow_fail=False): - """Runs a given net. - - Inputs: - name: the name of the net, or a reference to the net. - num_iter: number of iterations to run - allow_fail: if True, does not assert on net exec failure but returns False - Returns: - True or an exception. - """ - return CallWithExceptionIntercept( - C.run_net, - C.Workspace.current._last_failed_op_net_position, - GetNetName(name), - StringifyNetName(name), num_iter, allow_fail, - ) - - -def RunPlan(plan_or_step): - # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps - import caffe2.python.core as core - if isinstance(plan_or_step, core.ExecutionStep): - plan_or_step = core.Plan(plan_or_step) - return C.run_plan(StringifyProto(plan_or_step)) - - -def RunPlanInBackground(plan_or_step): - # TODO(jiayq): refactor core.py/workspace.py to avoid circular deps - import caffe2.python.core as core - if isinstance(plan_or_step, core.ExecutionStep): - plan_or_step = core.Plan(plan_or_step) - return C.run_plan_in_background(StringifyProto(plan_or_step)) - - -def InferShapesAndTypes(nets, blob_dimensions=None, nets_proto=False, - blob_types=None): - """Infers the shapes and types for the specified nets. 
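A minimal end-to-end sketch of the execution entry points deleted here (valid only against a pre-removal checkout): build a one-op net, register it once with `CreateNet`, then run it cheaply by name.

```python
from caffe2.python import core, workspace

net = core.Net("demo")
net.ConstantFill([], ["x"], shape=[2, 2], value=1.0)

workspace.ResetWorkspace()
workspace.CreateNet(net)                        # compile/register once
workspace.RunNet(net.Proto().name, num_iter=3)  # then re-run by name
print(workspace.FetchBlob("x"))                 # [[1. 1.] [1. 1.]]
```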
- - Inputs: - nets: the list of nets - blob_dimensions (optional): a dictionary of blobs and their dimensions. - If not specified, the workspace blobs are used. - nets_proto (optional): a boolean flag indicating whether the protobuffer - representation is passed to the routine. - Returns: - A tuple of (shapes, types) dictionaries keyed by blob name. - """ - if nets_proto: - net_protos = [StringifyProto(n) for n in nets] - else: - net_protos = [StringifyProto(n.Proto()) for n in nets] - if blob_dimensions is None: - assert blob_types is None - blobdesc_prototxt = C.infer_shapes_and_types_from_workspace(net_protos) - elif blob_types is None: - blobdesc_prototxt = C.infer_shapes_and_types_from_map( - net_protos, blob_dimensions - ) - else: - blobdesc_prototxt = C.infer_shapes_and_types_from_map( - net_protos, blob_dimensions, blob_types - ) - blobdesc_proto = caffe2_pb2.TensorShapes() - blobdesc_proto.ParseFromString(blobdesc_prototxt) - shapes = {} - types = {} - for ts in blobdesc_proto.shapes: - if not ts.unknown_shape: - shapes[ts.name] = list(ts.dims) - types[ts.name] = ts.data_type - - return (shapes, types) - - -def _StringifyName(name, expected_type): - if isinstance(name, basestring): - return name - assert type(name).__name__ == expected_type, \ - "Expected a string or %s" % expected_type - return str(name) - - -def StringifyBlobName(name): - return _StringifyName(name, "BlobReference") - - -def StringifyNetName(name): - return _StringifyName(name, "Net") - - -def GetNetName(net): - if isinstance(net, basestring): - return net - if type(net).__name__ == "Net" or type(net).__name__ == "NetWithShapeInference": - return net.Name() - if isinstance(net, caffe2_pb2.NetDef): - return net.name - raise Exception("Not a Net object: {}".format(str(net))) - - -def FeedBlob(name, arr, device_option=None): - """Feeds a blob into the workspace. - - Inputs: - name: the name of the blob. - arr: either a TensorProto object or a numpy array object to be fed into - the workspace. - device_option (optional): the device option to feed the data with. - Returns: - True or False, stating whether the feed is successful. - """ - ws = C.Workspace.current - return _Workspace_feed_blob(ws, name, arr, device_option) - - -def FetchBlobs(names): - """Fetches a list of blobs from the workspace. - - Inputs: - names: list of names of blobs - strings or BlobReferences - Returns: - list of fetched blobs - """ - return [FetchBlob(name) for name in names] - - -def FetchBlob(name): - """Fetches a blob from the workspace. - - Inputs: - name: the name of the blob - a string or a BlobReference - Returns: - Fetched blob (numpy array or string) if successful - """ - result = C.fetch_blob(StringifyBlobName(name)) - if isinstance(result, tuple): - raise TypeError( - "Use FetchInt8Blob to fetch Int8 Blob {}".format( - StringifyBlobName(name) - ) - ) - return result - - -def FetchTorch(name): - ws = C.Workspace.current - return ws.blobs[name].to_torch() - - -Int8Tensor = collections.namedtuple( - 'Int8Tensor', ['data', 'scale', 'zero_point'] -) - - -def FetchInt8Blob(name): - """Fetches an Int8 blob from the workspace. 
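The feed/fetch pair above copies data across the Python/C++ boundary, so a fetched array is detached from the blob; a quick round-trip sketch:

```python
import numpy as np
from caffe2.python import workspace

arr = np.arange(6, dtype=np.float32).reshape(2, 3)
assert workspace.FeedBlob("demo_blob", arr)  # True on success
out = workspace.FetchBlob("demo_blob")
np.testing.assert_array_equal(out, arr)

out[0, 0] = 99.0  # mutates the fetched copy only
assert workspace.FetchBlob("demo_blob")[0, 0] == 0.0
```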
It shared backend implementation - with FetchBlob but it is recommended when fetching Int8 Blobs - - Inputs: - name: the name of the Int8 blob - a string or a BlobReference - Returns: - data: int8 numpy array, data - scale: float, fake quantization scale - zero_point: int, fake quantization offset - """ - result = C.fetch_blob(StringifyBlobName(name)) - assert isinstance(result, tuple), \ - 'You are not fetching an Int8Blob {}. Please use FetchBlob'.format( - StringifyBlobName(name)) - return Int8Tensor(*result) - - -def FetchInt8BlobRealVal(name): - """Fetches an Int8 blob from the workspace and return its real value representation. - - Inputs: - name: the name of the Int8 blob - a string or a BlobReference - Returns: - real value representation of int8 numpy array - """ - result = C.fetch_blob(StringifyBlobName(name)) - assert isinstance(result, tuple), \ - 'You are not fetching an Int8Blob {}. Please use FetchBlob'.format( - StringifyBlobName(name)) - int8_blob = Int8Tensor(*result) - return (int8_blob.data.astype(np.int32) - int(int8_blob.zero_point)).astype( - np.float32) * int8_blob.scale - - -def RemoveBlob(name) -> None: - ws = C.Workspace.current - _Workspace_remove_blob(ws, name) - -def _Workspace_fetch_int8_blob(ws, name): - """Fetches an Int8 blob from the workspace. It shared backend implementation - with FetchBlob but it is recommended when fetching Int8 Blobs - - Inputs: - name: the name of the Int8 blob - a string or a BlobReference - Returns: - data: int8 numpy array, data - scale: float, fake quantization scale - zero_point: int, fake quantization offset - """ - result = ws.fetch_blob(name) - assert isinstance(result, tuple), \ - 'You are not fetching an Int8Blob {}. Please use fetch_blob'.format( - StringifyBlobName(name)) - return Int8Tensor(*result) - - -C.Workspace.fetch_int8_blob = _Workspace_fetch_int8_blob - - -def ApplyTransform(transform_key, net): - """Apply a Transform to a NetDef protobuf object, and returns the new - transformed NetDef. - - Inputs: - transform_key: the name of the transform, as it is stored in the registry - net: a NetDef protobuf object - Returns: - Transformed NetDef protobuf object. - """ - transformed_net = caffe2_pb2.NetDef() - transformed_str = C.apply_transform( - str(transform_key).encode('utf-8'), - net.SerializeToString(), - ) - transformed_net.ParseFromString(transformed_str) - return transformed_net - - -def ApplyTransformIfFaster(transform_key, net, init_net, **kwargs): - """Apply a Transform to a NetDef protobuf object, and returns the new - transformed NetDef, only if it runs faster than the original. - - The runs are performed on the current active workspace (gWorkspace). - You should initialize that workspace before making a call to this function. - - Inputs: - transform_key: the name of the transform, as it is stored in the registry - net: a NetDef protobuf object - init_net: The net to initialize the workspace. - warmup_runs (optional): - Determines how many times the net is run before testing. - Will be 5 by default. - main_runs (optional): - Determines how many times the net is run during testing. - Will be 10 by default. - improvement_threshold (optional): - Determines the factor which the new net needs to be faster - in order to replace the old. Will be 1.01 by default. - - Returns: - Either a Transformed NetDef protobuf object, or the original netdef. 
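A hedged usage sketch of the benchmark-gated variant described above. `net` and `init_net` are placeholders for nets built elsewhere (the init net must fill the conv parameters, and the workspace must be initialized first, as the docstring notes); "ConvToNNPack" is the transform the tests further down exercise.

```python
from caffe2.python import workspace

# Assumes `net` (a core.Net containing Conv ops) and `init_net` were
# built elsewhere; both names are placeholders for this sketch.
workspace.RunNetOnce(init_net.Proto().SerializeToString())
proto = workspace.ApplyTransformIfFaster(
    "ConvToNNPack",
    net.Proto(),
    init_net.Proto(),
    warmup_runs=5,              # defaults documented above
    main_runs=20,
    improvement_threshold=1.5,  # keep the transform only if >= 1.5x faster
)
workspace.RunNetOnce(proto.SerializeToString())
```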
- """ - - warmup_runs = kwargs['warmup_runs'] if 'warmup_runs' in kwargs else 5 - main_runs = kwargs['main_runs'] if 'main_runs' in kwargs else 10 - improvement_threshold = kwargs['improvement_threshold'] \ - if 'improvement_threshold' in kwargs else 1.01 - - transformed_net = caffe2_pb2.NetDef() - transformed_str = C.apply_transform_if_faster( - str(transform_key).encode('utf-8'), - net.SerializeToString(), - init_net.SerializeToString(), - warmup_runs, - main_runs, - float(improvement_threshold), - ) - transformed_net.ParseFromString(transformed_str) - return transformed_net - - -def GetNameScope(): - """Return the current namescope string. To be used to fetch blobs""" - return scope.CurrentNameScope() - - -class _BlobDict: - """Provides python dict compatible way to do fetching and feeding""" - - def __getitem__(self, key): - return FetchBlob(key) - - def __setitem__(self, key, value): - return FeedBlob(key, value) - - def __len__(self): - return len(C.blobs()) - - def __iter__(self): - return C.blobs().__iter__() - - def __contains__(self, item): - return C.has_blob(item) - - -blobs = _BlobDict() - - -################################################################################ -# Utilities for immediate mode -# -# Caffe2's immediate mode implements the following behavior: between the two -# function calls StartImmediate() and StopImmediate(), for any operator that is -# called through CreateOperator(), we will also run that operator in a workspace -# that is specific to the immediate mode. The user is explicitly expected to -# make sure that these ops have proper inputs and outputs, i.e. one should not -# run an op where an external input is not created or fed. -# -# Users can use FeedImmediate() and FetchImmediate() to interact with blobs -# in the immediate workspace. -# -# Once StopImmediate() is called, all contents in the immediate workspace is -# freed up so one can continue using normal runs. -# -# The immediate mode is solely for debugging purposes and support will be very -# sparse. -################################################################################ - -_immediate_mode = False -_immediate_workspace_name = "_CAFFE2_IMMEDIATE" -_immediate_root_folder = '' - - -def IsImmediate(): - return _immediate_mode - - -@contextlib.contextmanager -def WorkspaceGuard(workspace_name): - current = CurrentWorkspace() - SwitchWorkspace(workspace_name, True) - yield - SwitchWorkspace(current) - - -def StartImmediate(i_know=False): - global _immediate_mode - global _immediate_root_folder - if IsImmediate(): - # already in immediate mode. We will kill the previous one - # and start from fresh. - StopImmediate() - _immediate_mode = True - with WorkspaceGuard(_immediate_workspace_name): - _immediate_root_folder = tempfile.mkdtemp() - ResetWorkspace(_immediate_root_folder) - if i_know: - # if the user doesn't want to see the warning message, sure... - return - print(""" - Enabling immediate mode in caffe2 python is an EXTREMELY EXPERIMENTAL - feature and may very easily go wrong. This is because Caffe2 uses a - declarative way of defining operators and models, which is essentially - not meant to run things in an interactive way. Read the following carefully - to make sure that you understand the caveats. - - (1) You need to make sure that the sequences of operators you create are - actually runnable sequentially. For example, if you create an op that takes - an input X, somewhere earlier you should have already created X. 
- - (2) Caffe2 immediate uses one single workspace, so if the set of operators - you run are intended to be under different workspaces, they will not run. - To create boundaries between such use cases, you can call FinishImmediate() - and StartImmediate() manually to flush out everything no longer needed. - - (3) Underlying objects held by the immediate mode may interfere with your - normal run. For example, if there is a leveldb that you opened in immediate - mode and did not close, your main run will fail because leveldb does not - support double opening. Immediate mode may also occupy a lot of memory esp. - on GPUs. Call FinishImmediate() as soon as possible when you no longer - need it. - - (4) Immediate is designed to be slow. Every immediate call implicitly - creates a temp operator object, runs it, and destroys the operator. This - slow-speed run is by design to discourage abuse. For most use cases other - than debugging, do NOT turn on immediate mode. - - (5) If there is anything FATAL happening in the underlying C++ code, the - immediate mode will immediately (pun intended) cause the runtime to crash. - - Thus you should use immediate mode with extra care. If you still would - like to, have fun [https://xkcd.com/149/]. - """) - - -def StopImmediate(): - """Stops an immediate mode run.""" - # Phew, that was a dangerous ride. - global _immediate_mode - global _immediate_root_folder - if not IsImmediate(): - return - with WorkspaceGuard(_immediate_workspace_name): - ResetWorkspace() - shutil.rmtree(_immediate_root_folder) - _immediate_root_folder = '' - _immediate_mode = False - - -def ImmediateBlobs(): - with WorkspaceGuard(_immediate_workspace_name): - return Blobs() - - -def RunOperatorImmediate(op): - with WorkspaceGuard(_immediate_workspace_name): - RunOperatorOnce(op) - - -def FetchImmediate(*args, **kwargs): - with WorkspaceGuard(_immediate_workspace_name): - return FetchBlob(*args, **kwargs) - - -def FeedImmediate(*args, **kwargs): - with WorkspaceGuard(_immediate_workspace_name): - return FeedBlob(*args, **kwargs) - - -# C.Workspace methods. - -def _Workspace_create_net_with_exception_intercept(ws, net, overwrite=False): - return CallWithExceptionIntercept( - ws._create_net, - ws._last_failed_op_net_position, - GetNetName(net), - StringifyProto(net), overwrite, - ) - - -def _Workspace_run(ws, obj): - if hasattr(obj, 'Proto'): - obj = obj.Proto() - if isinstance(obj, caffe2_pb2.PlanDef): - return ws._run_plan(obj.SerializeToString()) - if isinstance(obj, caffe2_pb2.NetDef): - return CallWithExceptionIntercept( - ws._run_net, - ws._last_failed_op_net_position, - GetNetName(obj), - obj.SerializeToString(), - ) - # return ws._run_net(obj.SerializeToString()) - if isinstance(obj, caffe2_pb2.OperatorDef): - return ws._run_operator(obj.SerializeToString()) - raise ValueError( - "Don't know how to do Workspace.run() on {}".format(type(obj))) - - -def _Workspace_feed_blob(ws, name, arr, device_option=None): - if type(arr) is caffe2_pb2.TensorProto: - arr = utils.Caffe2TensorToNumpyArray(arr) - if type(arr) is np.ndarray and arr.dtype.kind in 'SU': - # Plain NumPy strings are weird, let's use objects instead - arr = arr.astype(object) - - if device_option is None: - device_option = scope.CurrentDeviceScope() - - if device_option and device_option.device_type == caffe2_pb2.CUDA: - if arr.dtype == np.dtype('float64'): - logger.warning( - "CUDA operators do not support 64-bit doubles, " + - "please use arr.astype(np.float32) or np.int32 for ints." 
+ - " Blob: {}".format(name) + - " type: {}".format(str(arr.dtype)) - ) - - name = StringifyBlobName(name) - if device_option is not None: - return ws.create_blob(name).feed(arr, device_option) - else: - return ws.create_blob(name).feed(arr) - - -def _Workspace_remove_blob(ws, blob): - ws._remove_blob(str(blob)) - - -Workspace = C.Workspace -Workspace.create_net = _Workspace_create_net_with_exception_intercept -Workspace.run = _Workspace_run -Workspace.feed_blob = _Workspace_feed_blob -Workspace.remove_blob = _Workspace_remove_blob - -# C.Blob methods. - - -def _Blob_feed(blob, arg, device_option=None): - # conservative type check to avoid unnecessary import - if type(arg).__name__ == 'Tensor' and type(arg).__module__ == 'torch': - import torch - if isinstance(arg, torch.Tensor): - assert device_option is None, \ - "device_option doesn't make sense with PyTorch tensors" - handle = torch._C._tensor_impl_raw_handle(arg) - blob._wrap_tensor_impl(handle) - return True # _feed() returns True for some reason - if device_option is not None: - device_option = StringifyProto(device_option) - return blob._feed(arg, device_option) - - -C.Blob.feed = _Blob_feed - - -def _Tensor_to_torch(tensor): - """ - PyTorch tensor interop (TensorCPU methods) - - Can be accessed as: - workspace.Workspace.current.blobs['foo'].tensor().to_torch() - """ - # avoiding circular dependency - import torch - handle = tensor._tensor_impl_raw_handle() - return torch._C._wrap_tensor_impl(handle) - -C.TensorCPU.to_torch = _Tensor_to_torch - - -def _Blob_to_torch(blob): - if not blob.is_tensor(): - raise RuntimeError("Blob has to be a tensor") - return blob.as_tensor().to_torch() - -C.Blob.to_torch = _Blob_to_torch diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py deleted file mode 100644 index a860bec116be..000000000000 --- a/caffe2/python/workspace_test.py +++ /dev/null @@ -1,933 +0,0 @@ -import errno -import os -import shutil -import tempfile -import unittest -from collections import namedtuple -from typing import List - -import caffe2.python.hypothesis_test_util as htu -import hypothesis.strategies as st -import numpy as np -import torch -from torch import Tensor -from caffe2.proto import caffe2_pb2 -from caffe2.python import core, test_util, workspace, model_helper, brew -from hypothesis import given, settings - - -class TestWorkspace(unittest.TestCase): - def setUp(self): - self.net = core.Net("test-net") - self.testblob_ref = self.net.ConstantFill( - [], "testblob", shape=[1, 2, 3, 4], value=1.0 - ) - workspace.ResetWorkspace() - - def testWorkspaceHasBlobWithNonexistingName(self): - self.assertEqual(workspace.HasBlob("non-existing"), False) - - def testRunOperatorOnce(self): - self.assertEqual( - workspace.RunOperatorOnce(self.net.Proto().op[0].SerializeToString()), True - ) - self.assertEqual(workspace.HasBlob("testblob"), True) - blobs = workspace.Blobs() - self.assertEqual(len(blobs), 1) - self.assertEqual(blobs[0], "testblob") - - def testGetOperatorCost(self): - op = core.CreateOperator( - "Conv2D", - ["X", "W"], - ["Y"], - stride_h=1, - stride_w=1, - pad_t=1, - pad_l=1, - pad_b=1, - pad_r=1, - kernel=3, - ) - X = np.zeros((1, 8, 8, 8)) - W = np.zeros((1, 1, 3, 3)) - workspace.FeedBlob("X", X) - workspace.FeedBlob("W", W) - op_cost = workspace.GetOperatorCost(op.SerializeToString(), ["X", "W"]) - self.assertTupleEqual( - op_cost, - namedtuple("Cost", ["flops", "bytes_written", "bytes_read"])( - 1152, 256, 4168 - ), - ) - - def testRunNetOnce(self): - self.assertEqual( - 
workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - self.assertEqual(workspace.HasBlob("testblob"), True) - - def testCurrentWorkspaceWrapper(self): - self.assertNotIn("testblob", workspace.C.Workspace.current.blobs) - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - self.assertEqual(workspace.HasBlob("testblob"), True) - self.assertIn("testblob", workspace.C.Workspace.current.blobs) - workspace.ResetWorkspace() - self.assertNotIn("testblob", workspace.C.Workspace.current.blobs) - - def testRunPlan(self): - plan = core.Plan("test-plan") - plan.AddStep(core.ExecutionStep("test-step", self.net)) - self.assertEqual(workspace.RunPlan(plan.Proto().SerializeToString()), True) - self.assertEqual(workspace.HasBlob("testblob"), True) - - def testRunPlanInBackground(self): - plan = core.Plan("test-plan") - plan.AddStep(core.ExecutionStep("test-step", self.net)) - background_plan = workspace.RunPlanInBackground(plan) - while not background_plan.is_done(): - pass - self.assertEqual(background_plan.is_succeeded(), True) - self.assertEqual(workspace.HasBlob("testblob"), True) - - def testConstructPlanFromSteps(self): - step = core.ExecutionStep("test-step-as-plan", self.net) - self.assertEqual(workspace.RunPlan(step), True) - self.assertEqual(workspace.HasBlob("testblob"), True) - - def testResetWorkspace(self): - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - self.assertEqual(workspace.HasBlob("testblob"), True) - self.assertEqual(workspace.ResetWorkspace(), True) - self.assertEqual(workspace.HasBlob("testblob"), False) - - def testTensorAccess(self): - ws = workspace.C.Workspace() - - """ test in-place modification """ - ws.create_blob("tensor").feed(np.array([1.1, 1.2, 1.3])) - tensor = ws.blobs["tensor"].tensor() - tensor.data[0] = 3.3 - val = np.array([3.3, 1.2, 1.3]) - np.testing.assert_array_equal(tensor.data, val) - np.testing.assert_array_equal(ws.blobs["tensor"].fetch(), val) - - """ test in-place initialization """ - tensor.init([2, 3], core.DataType.INT32) - for x in range(2): - for y in range(3): - tensor.data[x, y] = 0 - tensor.data[1, 1] = 100 - val = np.zeros([2, 3], dtype=np.int32) - val[1, 1] = 100 - np.testing.assert_array_equal(tensor.data, val) - np.testing.assert_array_equal(ws.blobs["tensor"].fetch(), val) - - """ strings cannot be initialized from python """ - with self.assertRaises(RuntimeError): - tensor.init([3, 4], core.DataType.STRING) - - """ feed (copy) data into tensor """ - val = np.array([[b"abc", b"def"], [b"ghi", b"jkl"]], dtype=object) - tensor.feed(val) - self.assertEqual(tensor.data[0, 0], b"abc") - np.testing.assert_array_equal(ws.blobs["tensor"].fetch(), val) - - val = np.array([1.1, 10.2]) - tensor.feed(val) - val[0] = 5.2 - self.assertEqual(tensor.data[0], 1.1) - - """ fetch (copy) data from tensor """ - val = np.array([1.1, 1.2]) - tensor.feed(val) - val2 = tensor.fetch() - tensor.data[0] = 5.2 - val3 = tensor.fetch() - np.testing.assert_array_equal(val, val2) - self.assertEqual(val3[0], 5.2) - - def testFetchFeedBlob(self): - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - fetched = workspace.FetchBlob("testblob") - # check if fetched is correct. 
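The tensor-access test above pins down the aliasing rules: `tensor.data` views the blob's storage in place, while `fetch()` returns a detached copy. Condensed:

```python
import numpy as np
from caffe2.python import workspace

ws = workspace.C.Workspace()
ws.create_blob("t").feed(np.array([1.1, 1.2, 1.3]))
tensor = ws.blobs["t"].tensor()

tensor.data[0] = 3.3  # in-place write, visible to the workspace
np.testing.assert_array_equal(ws.blobs["t"].fetch(), [3.3, 1.2, 1.3])

snapshot = tensor.fetch()  # a copy: later writes don't show up in it
tensor.data[0] = 5.2
assert snapshot[0] == 3.3
```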
- self.assertEqual(fetched.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched, 1.0) - fetched[:] = 2.0 - self.assertEqual(workspace.FeedBlob("testblob", fetched), True) - fetched_again = workspace.FetchBlob("testblob") - self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched_again, 2.0) - - def testFetchFeedBlobViaBlobReference(self): - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - fetched = workspace.FetchBlob(self.testblob_ref) - # check if fetched is correct. - self.assertEqual(fetched.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched, 1.0) - fetched[:] = 2.0 - self.assertEqual(workspace.FeedBlob(self.testblob_ref, fetched), True) - fetched_again = workspace.FetchBlob("testblob") # fetch by name now - self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched_again, 2.0) - - def testFetchFeedBlobTypes(self): - for dtype in [ - np.float16, - np.float32, - np.float64, - bool, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - ]: - try: - rng = np.iinfo(dtype).max * 2 - except ValueError: - rng = 1000 - data = ((np.random.rand(2, 3, 4) - 0.5) * rng).astype(dtype) - self.assertEqual(workspace.FeedBlob("testblob_types", data), True) - fetched_back = workspace.FetchBlob("testblob_types") - self.assertEqual(fetched_back.shape, (2, 3, 4)) - self.assertEqual(fetched_back.dtype, dtype) - np.testing.assert_array_equal(fetched_back, data) - - def testFetchFeedBlobBool(self): - """Special case for bool to ensure coverage of both true and false.""" - data = np.zeros((2, 3, 4)).astype(bool) - data.flat[::2] = True - self.assertEqual(workspace.FeedBlob("testblob_types", data), True) - fetched_back = workspace.FetchBlob("testblob_types") - self.assertEqual(fetched_back.shape, (2, 3, 4)) - self.assertEqual(fetched_back.dtype, bool) - np.testing.assert_array_equal(fetched_back, data) - - def testGetBlobSizeBytes(self): - for dtype in [ - np.float16, - np.float32, - np.float64, - bool, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - ]: - data = np.random.randn(2, 3).astype(dtype) - self.assertTrue(workspace.FeedBlob("testblob_sizeBytes", data), True) - self.assertEqual( - workspace.GetBlobSizeBytes("testblob_sizeBytes"), - 6 * np.dtype(dtype).itemsize, - ) - strs1 = np.array([b"Hello World!", b"abcd"]) - strs2 = np.array([b"element1", b"element2"]) - strs1_len, strs2_len = 0, 0 - for str in strs1: - strs1_len += len(str) - for str in strs2: - strs2_len += len(str) - self.assertTrue(workspace.FeedBlob("testblob_str1", strs1), True) - self.assertTrue(workspace.FeedBlob("testblob_str2", strs2), True) - # size of blob "testblob_str1" = size_str1 * meta_.itemsize() + strs1_len - # size of blob "testblob_str2" = size_str2 * meta_.itemsize() + strs2_len - self.assertEqual( - workspace.GetBlobSizeBytes("testblob_str1") - - workspace.GetBlobSizeBytes("testblob_str2"), - strs1_len - strs2_len, - ) - - def testFetchFeedBlobZeroDim(self): - data = np.empty(shape=(2, 0, 3), dtype=np.float32) - self.assertEqual(workspace.FeedBlob("testblob_empty", data), True) - fetched_back = workspace.FetchBlob("testblob_empty") - self.assertEqual(fetched_back.shape, (2, 0, 3)) - self.assertEqual(fetched_back.dtype, np.float32) - - def testFetchFeedLongStringTensor(self): - # long strings trigger array of object creation - strs = np.array( - [ - b" ".join(10 * [b"long string"]), - b" ".join(128 * [b"very long string"]), - b"small \0\1\2 string", - 
b"Hello, world! I have special \0 symbols \1!", - ] - ) - workspace.FeedBlob("my_str_tensor", strs) - strs2 = workspace.FetchBlob("my_str_tensor") - self.assertEqual(strs.shape, strs2.shape) - for i in range(0, strs.shape[0]): - self.assertEqual(strs[i], strs2[i]) - - def testFetchFeedShortStringTensor(self): - # small strings trigger NPY_STRING array - strs = np.array([b"elem1", b"elem 2", b"element 3"]) - workspace.FeedBlob("my_str_tensor_2", strs) - strs2 = workspace.FetchBlob("my_str_tensor_2") - self.assertEqual(strs.shape, strs2.shape) - for i in range(0, strs.shape[0]): - self.assertEqual(strs[i], strs2[i]) - - def testFetchFeedPlainString(self): - # this is actual string, not a tensor of strings - s = b"Hello, world! I have special \0 symbols \1!" - workspace.FeedBlob("my_plain_string", s) - s2 = workspace.FetchBlob("my_plain_string") - self.assertEqual(s, s2) - - def testFetchBlobs(self): - s1 = b"test1" - s2 = b"test2" - workspace.FeedBlob("s1", s1) - workspace.FeedBlob("s2", s2) - fetch1, fetch2 = workspace.FetchBlobs(["s1", "s2"]) - self.assertEqual(s1, fetch1) - self.assertEqual(s2, fetch2) - - def testFetchFeedViaBlobDict(self): - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - fetched = workspace.blobs["testblob"] - # check if fetched is correct. - self.assertEqual(fetched.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched, 1.0) - fetched[:] = 2.0 - workspace.blobs["testblob"] = fetched - fetched_again = workspace.blobs["testblob"] - self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched_again, 2.0) - - self.assertTrue("testblob" in workspace.blobs) - self.assertFalse("non_existant" in workspace.blobs) - self.assertEqual(len(workspace.blobs), 1) - for key in workspace.blobs: - self.assertEqual(key, "testblob") - - def testTorchInterop(self): - workspace.RunOperatorOnce( - core.CreateOperator( - "ConstantFill", [], "foo", shape=(4,), value=2, dtype=10 - ) - ) - t = workspace.FetchTorch("foo") - t.resize_(5) - t[4] = t[2] = 777 - np.testing.assert_array_equal(t.numpy(), np.array([2, 2, 777, 2, 777])) - np.testing.assert_array_equal( - workspace.FetchBlob("foo"), np.array([2, 2, 777, 2, 777]) - ) - - z = torch.ones((4,), dtype=torch.int64) - workspace.FeedBlob("bar", z) - workspace.RunOperatorOnce( - core.CreateOperator("Reshape", ["bar"], ["bar", "_"], shape=(2, 2)) - ) - z[0, 1] = 123 - np.testing.assert_array_equal( - workspace.FetchBlob("bar"), np.array([[1, 123], [1, 1]]) - ) - np.testing.assert_array_equal(z, np.array([[1, 123], [1, 1]])) - - -class TestMultiWorkspaces(unittest.TestCase): - def setUp(self): - workspace.SwitchWorkspace("default") - workspace.ResetWorkspace() - - def testCreateWorkspace(self): - self.net = core.Net("test-net") - self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - self.assertEqual(workspace.HasBlob("testblob"), True) - self.assertEqual(workspace.SwitchWorkspace("test", True), None) - self.assertEqual(workspace.HasBlob("testblob"), False) - self.assertEqual(workspace.SwitchWorkspace("default"), None) - self.assertEqual(workspace.HasBlob("testblob"), True) - - try: - # The following should raise an error. - workspace.SwitchWorkspace("non-existing") - # so this should never happen. 
- self.assertEqual(True, False) - except RuntimeError: - pass - - workspaces = workspace.Workspaces() - self.assertTrue("default" in workspaces) - self.assertTrue("test" in workspaces) - - -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") -class TestWorkspaceGPU(test_util.TestCase): - def setUp(self): - workspace.ResetWorkspace() - self.net = core.Net("test-net") - self.net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) - self.net.RunAllOnGPU() - - def testFetchBlobGPU(self): - self.assertEqual( - workspace.RunNetOnce(self.net.Proto().SerializeToString()), True - ) - fetched = workspace.FetchBlob("testblob") - # check if fetched is correct. - self.assertEqual(fetched.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched, 1.0) - fetched[:] = 2.0 - self.assertEqual(workspace.FeedBlob("testblob", fetched), True) - fetched_again = workspace.FetchBlob("testblob") - self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) - np.testing.assert_array_equal(fetched_again, 2.0) - - def testGetGpuPeerAccessPattern(self): - pattern = workspace.GetGpuPeerAccessPattern() - self.assertEqual(type(pattern), np.ndarray) - self.assertEqual(pattern.ndim, 2) - self.assertEqual(pattern.shape[0], pattern.shape[1]) - self.assertEqual(pattern.shape[0], workspace.NumGpuDevices()) - - @unittest.skipIf( - not workspace.has_cuda_support, "Tensor interop doesn't yet work on ROCm" - ) - def testTorchInterop(self): - # CUDA has convenient mem stats, let's use them to make sure we didn't - # leak memory - initial_mem = torch.cuda.memory_allocated() - workspace.RunOperatorOnce( - core.CreateOperator( - "ConstantFill", - [], - "foo", - shape=(4,), - value=2, - dtype=10, - device_option=core.DeviceOption(workspace.GpuDeviceType), - ) - ) - t = workspace.FetchTorch("foo") - t.resize_(5) - self.assertTrue(t.is_cuda) - t[4] = t[2] = 777 - np.testing.assert_array_equal(t.cpu().numpy(), np.array([2, 2, 777, 2, 777])) - np.testing.assert_array_equal( - workspace.FetchBlob("foo"), np.array([2, 2, 777, 2, 777]) - ) - - z = torch.ones((4,), dtype=torch.int64, device="cuda") - workspace.FeedBlob("bar", z) - workspace.RunOperatorOnce( - core.CreateOperator( - "Reshape", - ["bar"], - ["bar", "_"], - shape=(2, 2), - device_option=core.DeviceOption(workspace.GpuDeviceType), - ) - ) - z[0, 1] = 123 - np.testing.assert_array_equal( - workspace.FetchBlob("bar"), np.array([[1, 123], [1, 1]]) - ) - np.testing.assert_array_equal(z.cpu(), np.array([[1, 123], [1, 1]])) - - self.assertGreater(torch.cuda.memory_allocated(), initial_mem) - # clean up everything - del t - del z - workspace.ResetWorkspace() - self.assertEqual(torch.cuda.memory_allocated(), initial_mem) - - -@unittest.skipIf(not workspace.C.use_mkldnn, "No MKLDNN support.") -class TestWorkspaceIDEEP(test_util.TestCase): - def testFeedFetchBlobIDEEP(self): - arr = np.random.randn(2, 3).astype(np.float32) - workspace.FeedBlob("testblob_ideep", arr, core.DeviceOption(caffe2_pb2.IDEEP)) - fetched = workspace.FetchBlob("testblob_ideep") - np.testing.assert_array_equal(arr, fetched) - - -class TestImmedibate(test_util.TestCase): - def testImmediateEnterExit(self): - workspace.StartImmediate(i_know=True) - self.assertTrue(workspace.IsImmediate()) - workspace.StopImmediate() - self.assertFalse(workspace.IsImmediate()) - - def testImmediateRunsCorrectly(self): - workspace.StartImmediate(i_know=True) - net = core.Net("test-net") - net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) - self.assertEqual(workspace.ImmediateBlobs(), ["testblob"]) - content 
= workspace.FetchImmediate("testblob") - # Also, the immediate mode should not invade the original namespace, - # so we check if this is so. - with self.assertRaises(RuntimeError): - workspace.FetchBlob("testblob") - np.testing.assert_array_equal(content, 1.0) - content[:] = 2.0 - self.assertTrue(workspace.FeedImmediate("testblob", content)) - np.testing.assert_array_equal(workspace.FetchImmediate("testblob"), 2.0) - workspace.StopImmediate() - with self.assertRaises(RuntimeError): - content = workspace.FetchImmediate("testblob") - - def testImmediateRootFolder(self): - workspace.StartImmediate(i_know=True) - # for testing we will look into the _immediate_root_folder variable - # but in normal usage you should not access that. - self.assertTrue(len(workspace._immediate_root_folder) > 0) - root_folder = workspace._immediate_root_folder - self.assertTrue(os.path.isdir(root_folder)) - workspace.StopImmediate() - self.assertTrue(len(workspace._immediate_root_folder) == 0) - # After termination, immediate mode should have the root folder - # deleted. - self.assertFalse(os.path.exists(root_folder)) - - -class TestCppEnforceAsException(test_util.TestCase): - def testEnforce(self): - op = core.CreateOperator("Relu", ["X"], ["Y"]) - with self.assertRaises(RuntimeError): - workspace.RunOperatorOnce(op) - - -class TestCWorkspace(htu.HypothesisTestCase): - def test_net_execution(self): - ws = workspace.C.Workspace() - self.assertEqual(ws.nets, {}) - self.assertEqual(ws.blobs, {}) - net = core.Net("test-net") - net.ConstantFill([], "testblob", shape=[1, 2, 3, 4], value=1.0) - ws.create_net(net) - # If we do not specify overwrite, this should raise an error. - with self.assertRaises(RuntimeError): - ws.create_net(net) - # But, if we specify overwrite, this should pass. - ws.create_net(net, True) - # Overwrite can also be a kwarg. 
- ws.create_net(net, overwrite=True) - self.assertIn("testblob", ws.blobs) - self.assertEqual(len(ws.nets), 1) - net_name = net.Proto().name - self.assertIn("test-net", net_name) - net = ws.nets[net_name].run() - blob = ws.blobs["testblob"] - np.testing.assert_array_equal( - np.ones((1, 2, 3, 4), dtype=np.float32), blob.fetch() - ) - - @given(name=st.text(), value=st.floats(min_value=-1, max_value=1.0)) - def test_operator_run(self, name, value): - ws = workspace.C.Workspace() - op = core.CreateOperator("ConstantFill", [], [name], shape=[1], value=value) - ws.run(op) - self.assertIn(name, ws.blobs) - np.testing.assert_allclose( - [value], ws.blobs[name].fetch(), atol=1e-4, rtol=1e-4 - ) - - @given( - blob_name=st.text(), - net_name=st.text(), - value=st.floats(min_value=-1, max_value=1.0), - ) - def test_net_run(self, blob_name, net_name, value): - ws = workspace.C.Workspace() - net = core.Net(net_name) - net.ConstantFill([], [blob_name], shape=[1], value=value) - ws.run(net) - self.assertIn(blob_name, ws.blobs) - self.assertNotIn(net_name, ws.nets) - np.testing.assert_allclose( - [value], ws.blobs[blob_name].fetch(), atol=1e-4, rtol=1e-4 - ) - - @given( - blob_name=st.text(), - net_name=st.text(), - plan_name=st.text(), - value=st.floats(min_value=-1, max_value=1.0), - ) - def test_plan_run(self, blob_name, plan_name, net_name, value): - ws = workspace.C.Workspace() - plan = core.Plan(plan_name) - net = core.Net(net_name) - net.ConstantFill([], [blob_name], shape=[1], value=value) - - plan.AddStep(core.ExecutionStep("step", nets=[net], num_iter=1)) - - ws.run(plan) - self.assertIn(blob_name, ws.blobs) - self.assertIn(net.Name(), ws.nets) - np.testing.assert_allclose( - [value], ws.blobs[blob_name].fetch(), atol=1e-4, rtol=1e-4 - ) - - @given( - blob_name=st.text(), - net_name=st.text(), - value=st.floats(min_value=-1, max_value=1.0), - ) - def test_net_create(self, blob_name, net_name, value): - ws = workspace.C.Workspace() - net = core.Net(net_name) - net.ConstantFill([], [blob_name], shape=[1], value=value) - ws.create_net(net).run() - self.assertIn(blob_name, ws.blobs) - self.assertIn(net.Name(), ws.nets) - np.testing.assert_allclose( - [value], ws.blobs[blob_name].fetch(), atol=1e-4, rtol=1e-4 - ) - - @given( - name=st.text(), - value=htu.tensor(), - device_option=st.sampled_from(htu.device_options), - ) - def test_array_serde(self, name, value, device_option): - ws = workspace.C.Workspace() - ws.create_blob(name).feed(value, device_option=device_option) - self.assertIn(name, ws.blobs) - blob = ws.blobs[name] - np.testing.assert_equal(value, ws.blobs[name].fetch()) - serde_blob = ws.create_blob("{}_serde".format(name)) - serde_blob.deserialize(blob.serialize(name)) - np.testing.assert_equal(value, serde_blob.fetch()) - - @given(name=st.text(), value=st.text()) - def test_string_serde(self, name, value): - value = value.encode("ascii", "ignore") - ws = workspace.C.Workspace() - ws.create_blob(name).feed(value) - self.assertIn(name, ws.blobs) - blob = ws.blobs[name] - self.assertEqual(value, ws.blobs[name].fetch()) - serde_blob = ws.create_blob("{}_serde".format(name)) - serde_blob.deserialize(blob.serialize(name)) - self.assertEqual(value, serde_blob.fetch()) - - def test_exception(self): - ws = workspace.C.Workspace() - - with self.assertRaises(TypeError): - ws.create_net("...") - - -class TestPredictor(unittest.TestCase): - def _create_model(self): - m = model_helper.ModelHelper() - y = brew.fc( - m, - "data", - "y", - dim_in=4, - dim_out=2, - weight_init=("ConstantFill", 
dict(value=1.0)), - bias_init=("ConstantFill", dict(value=0.0)), - axis=0, - ) - m.net.AddExternalOutput(y) - return m - - # Use this test with a bigger model to see how using Predictor allows to - # avoid issues with low protobuf size limit in Python - # - # def test_predictor_predefined(self): - # workspace.ResetWorkspace() - # path = 'caffe2/caffe2/test/assets/' - # with open(path + 'squeeze_predict_net.pb') as f: - # self.predict_net = f.read() - # with open(path + 'squeeze_init_net.pb') as f: - # self.init_net = f.read() - # self.predictor = workspace.Predictor(self.init_net, self.predict_net) - - # inputs = [np.zeros((1, 3, 256, 256), dtype='f')] - # outputs = self.predictor.run(inputs) - # self.assertEqual(len(outputs), 1) - # self.assertEqual(outputs[0].shape, (1, 1000, 1, 1)) - # self.assertAlmostEqual(outputs[0][0][0][0][0], 5.19026289e-05) - - def test_predictor_memory_model(self): - workspace.ResetWorkspace() - m = self._create_model() - workspace.FeedBlob("data", np.zeros([4], dtype="float32")) - self.predictor = workspace.Predictor( - workspace.StringifyProto(m.param_init_net.Proto()), - workspace.StringifyProto(m.net.Proto()), - ) - - inputs = np.array([1, 3, 256, 256], dtype="float32") - outputs = self.predictor.run([inputs]) - np.testing.assert_array_almost_equal( - np.array([[516, 516]], dtype="float32"), outputs - ) - - -class TestTransform(htu.HypothesisTestCase): - @given( - input_dim=st.integers(min_value=1, max_value=10), - output_dim=st.integers(min_value=1, max_value=10), - batch_size=st.integers(min_value=1, max_value=10), - ) - def test_simple_transform(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - fc1 = brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - fc2 = brew.fc(m, fc1, "fc2", dim_in=output_dim, dim_out=output_dim) - conv = brew.conv( - m, - fc2, - "conv", - dim_in=output_dim, - dim_out=output_dim, - use_cudnn=True, - engine="CUDNN", - kernel=3, - ) - - conv.Relu([], conv).Softmax([], "pred").LabelCrossEntropy( - ["label"], ["xent"] - ).AveragedLoss([], "loss") - - transformed_net_proto = workspace.ApplyTransform("ConvToNNPack", m.net.Proto()) - - self.assertEqual(transformed_net_proto.op[2].engine, "NNPACK") - - @given( - input_dim=st.integers(min_value=1, max_value=10), - output_dim=st.integers(min_value=1, max_value=10), - batch_size=st.integers(min_value=1, max_value=10), - ) - @settings(deadline=10000) - def test_registry_invalid(self, input_dim, output_dim, batch_size): - m = model_helper.ModelHelper() - brew.fc(m, "data", "fc1", dim_in=input_dim, dim_out=output_dim) - with self.assertRaises(RuntimeError): - workspace.ApplyTransform("definitely_not_a_real_transform", m.net.Proto()) - - @given(value=st.floats(min_value=-1, max_value=1)) - @settings(deadline=10000) - def test_apply_transform_if_faster(self, value): - - init_net = core.Net("init_net") - init_net.ConstantFill([], ["data"], shape=[5, 5, 5, 5], value=value) - init_net.ConstantFill([], ["conv_w"], shape=[5, 5, 3, 3], value=value) - init_net.ConstantFill([], ["conv_b"], shape=[5], value=value) - - self.assertEqual( - workspace.RunNetOnce(init_net.Proto().SerializeToString()), True - ) - - m = model_helper.ModelHelper() - conv = brew.conv( - m, - "data", - "conv", - dim_in=5, - dim_out=5, - kernel=3, - use_cudnn=True, - engine="CUDNN", - ) - - conv.Relu([], conv).Softmax([], "pred").AveragedLoss([], "loss") - - self.assertEqual(workspace.RunNetOnce(m.net.Proto().SerializeToString()), True) - - proto = workspace.ApplyTransformIfFaster( - 
"ConvToNNPack", m.net.Proto(), init_net.Proto() - ) - self.assertEqual(workspace.RunNetOnce(proto.SerializeToString()), True) - proto = workspace.ApplyTransformIfFaster( - "ConvToNNPack", - m.net.Proto(), - init_net.Proto(), - warmup_runs=10, - main_runs=100, - improvement_threshold=2.0, - ) - self.assertEqual(workspace.RunNetOnce(proto.SerializeToString()), True) - - -class MyModule(torch.jit.ScriptModule): - def __init__(self): - super().__init__() - self.mult = torch.nn.Parameter(torch.tensor([[1, 2, 3, 4, 5.0]])) - - @torch.jit.script_method - def forward(self, x): - return self.mult.mm(x) - - @torch.jit.script_method - def multi_input(self, x: torch.Tensor, y: torch.Tensor, z: int = 2) -> torch.Tensor: - return x + y + z - - @torch.jit.script_method - def multi_input_tensor_list(self, tensor_list: List[Tensor]) -> Tensor: - return tensor_list[0] + tensor_list[1] + tensor_list[2] - - @torch.jit.script_method - def multi_output(self, x): - return (x, x + 1) - - -@unittest.skipIf( - "ScriptModule" not in core._REGISTERED_OPERATORS, - "Script module integration in Caffe2 is not enabled", -) -class TestScriptModule(test_util.TestCase): - def _createFeedModule(self): - workspace.FeedBlob("m", MyModule()) - - def testCreation(self): - m = MyModule() - workspace.FeedBlob("module", m) - m2 = workspace.FetchBlob("module") - self.assertTrue(m2 is not None) - - def testForward(self): - self._createFeedModule() - val = np.random.rand(5, 5).astype(np.float32) - param = np.array([[1, 2, 3, 4, 5]]).astype(np.float32) - workspace.FeedBlob("w", val) - workspace.RunOperatorOnce( - core.CreateOperator("ScriptModule", ["m", "w"], ["y"]) - ) - np.testing.assert_almost_equal( - workspace.FetchBlob("y"), np.matmul(param, val), decimal=5 - ) - - def testMultiInputOutput(self): - self._createFeedModule() - val = np.random.rand(5, 5).astype(np.float32) - workspace.FeedBlob("w", val) - val2 = np.random.rand(5, 5).astype(np.float32) - workspace.FeedBlob("w2", val2) - workspace.RunOperatorOnce( - core.CreateOperator( - "ScriptModule", ["m", "w", "w2"], ["y"], method="multi_input" - ) - ) - workspace.RunOperatorOnce( - core.CreateOperator( - "ScriptModule", ["m", "w"], ["y1", "y2"], method="multi_output" - ) - ) - np.testing.assert_almost_equal( - workspace.FetchBlob("y"), val + val2 + 2, decimal=5 - ) - np.testing.assert_almost_equal(workspace.FetchBlob("y1"), val, decimal=5) - np.testing.assert_almost_equal(workspace.FetchBlob("y2"), val + 1, decimal=5) - - def testMultiTensorListInput(self): - self._createFeedModule() - val = np.random.rand(5, 5).astype(np.float32) - workspace.FeedBlob("w", val) - val2 = np.random.rand(5, 5).astype(np.float32) - workspace.FeedBlob("w2", val2) - val3 = np.random.rand(5, 5).astype(np.float32) - workspace.FeedBlob("w3", val3) - - workspace.RunOperatorOnce( - core.CreateOperator( - "ScriptModule", - ["m", "w", "w2", "w3"], - ["y"], - method="multi_input_tensor_list", - pass_inputs_as_tensor_list=True, - ) - ) - np.testing.assert_almost_equal( - workspace.FetchBlob("y"), val + val2 + val3, decimal=5 - ) - - def testSerialization(self): - tmpdir = tempfile.mkdtemp() - try: - self._createFeedModule() - workspace.RunOperatorOnce( - core.CreateOperator( - "Save", - ["m"], - [], - absolute_path=1, - db=os.path.join(tmpdir, "db"), - db_type="minidb", - ) - ) - workspace.ResetWorkspace() - - self.assertFalse(workspace.HasBlob("m")) - workspace.RunOperatorOnce( - core.CreateOperator( - "Load", - [], - [], - absolute_path=1, - db=os.path.join(tmpdir, "db"), - db_type="minidb", - load_all=1, - 
) - ) - self.assertTrue(workspace.HasBlob("m")) - # TODO: make caffe2 side load return python-sided module - # right now it returns the base class (torch._C.ScriptModule) - # self.assertTrue(isinstance(workspace.FetchBlob('m'), torch.jit.ScriptModule)) - - # do something with the module - val = np.random.rand(5, 5).astype(np.float32) - param = np.array([[1, 2, 3, 4, 5]]).astype(np.float32) - workspace.FeedBlob("w", val) - workspace.RunOperatorOnce( - core.CreateOperator("ScriptModule", ["m", "w"], ["y"]) - ) - np.testing.assert_almost_equal( - workspace.FetchBlob("y"), np.matmul(param, val), decimal=5 - ) - finally: - # clean up temp folder. - try: - shutil.rmtree(tmpdir) - except OSError as e: - if e.errno != errno.ENOENT: - raise - - -class TestScriptModuleFromString(TestScriptModule): - def _createFeedModule(self): - workspace.RunOperatorOnce( - core.CreateOperator( - "ScriptModuleLoad", - [], - ["m"], - serialized_binary=self._get_modules_bytes(MyModule()), - ) - ) - - def _get_modules_bytes(self, the_module): - import io - - buffer = io.BytesIO() - torch.jit.save(the_module, buffer) - return buffer.getvalue() - - -if __name__ == "__main__": - unittest.main() diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 177bd31b9c97..8bd80f167a5b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1300,11 +1300,10 @@ endif() # ---[ CUB if(USE_CUDA) find_package(CUB) - if(CUB_FOUND) - include_directories(SYSTEM ${CUB_INCLUDE_DIRS}) - else() - include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/cub) + if(NOT CUB_FOUND) + message(FATAL_ERROR "Cannot find CUB.") endif() + include_directories(SYSTEM ${CUB_INCLUDE_DIRS}) endif() if(USE_DISTRIBUTED AND USE_TENSORPIPE) @@ -1426,25 +1425,6 @@ if(USE_NNAPI AND NOT ANDROID) caffe2_update_option(USE_NNAPI OFF) endif() -if(USE_ZSTD) - if(USE_SYSTEM_ZSTD) - find_package(zstd REQUIRED) - if(TARGET zstd::libzstd_shared) - set(ZSTD_TARGET zstd::libzstd_shared) - else() - set(ZSTD_TARGET zstd::libzstd_static) - endif() - list(APPEND Caffe2_DEPENDENCY_LIBS ${ZSTD_TARGET}) - get_property(ZSTD_INCLUDE_DIR TARGET ${ZSTD_TARGET} PROPERTY INTERFACE_INCLUDE_DIRECTORIES) - include_directories(SYSTEM ${ZSTD_INCLUDE_DIR}) - else() - list(APPEND Caffe2_DEPENDENCY_LIBS libzstd_static) - include_directories(SYSTEM ${CMAKE_CURRENT_LIST_DIR}/../third_party/zstd/lib) - add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/../third_party/zstd/build/cmake) - set_property(TARGET libzstd_static PROPERTY POSITION_INDEPENDENT_CODE ON) - endif() -endif() - # ---[ Onnx if(CAFFE2_CMAKE_BUILDING_WITH_MAIN_REPO AND NOT INTERN_DISABLE_ONNX) if(EXISTS "${CAFFE2_CUSTOM_PROTOC_EXECUTABLE}") @@ -1597,23 +1577,24 @@ if(NOT INTERN_BUILD_MOBILE) set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE OFF) - if(USE_MAGMA) - find_package(MAGMA) - endif() - if((USE_CUDA OR USE_ROCM) AND MAGMA_FOUND) - set(USE_MAGMA 1) - message(STATUS "Compiling with MAGMA support") - message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") - message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") - message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + if(USE_CUDA OR USE_ROCM) + if(USE_MAGMA) + find_package(MAGMA) + if(MAGMA_FOUND) + message(STATUS "Compiling with MAGMA support") + message(STATUS "MAGMA INCLUDE DIRECTORIES: ${MAGMA_INCLUDE_DIR}") + message(STATUS "MAGMA LIBRARIES: ${MAGMA_LIBRARIES}") + message(STATUS "MAGMA V2 check: ${MAGMA_V2}") + else() + message(STATUS "MAGMA not found. 
Compiling without MAGMA support") + caffe2_update_option(USE_MAGMA OFF) + endif() + endif() elseif(USE_MAGMA) message(WARNING "Not compiling with MAGMA. Suppress this warning with " "-DUSE_MAGMA=OFF.") caffe2_update_option(USE_MAGMA OFF) - else() - message(STATUS "MAGMA not found. Compiling without MAGMA support") - caffe2_update_option(USE_MAGMA OFF) endif() # ARM specific flags diff --git a/defs.bzl b/defs.bzl index 6c32f5f9c8b4..d2978f3bfb97 100644 --- a/defs.bzl +++ b/defs.bzl @@ -1,7 +1,7 @@ def get_blas_gomp_arch_deps(): return [ ("x86_64", [ - "third-party//IntelComposerXE:{}".format(native.read_config("fbcode", "mkl_lp64", "mkl_lp64_omp")), + "fbsource//third-party/mkl:{}".format(native.read_config("fbcode", "mkl_lp64", "mkl_lp64_omp")), ]), ("aarch64", [ "third-party//OpenBLAS:OpenBLAS", diff --git a/docs/source/community/design.rst b/docs/source/community/design.rst index 73ed7e1447b8..16b1500afcdd 100644 --- a/docs/source/community/design.rst +++ b/docs/source/community/design.rst @@ -119,7 +119,7 @@ This principle began as **Python First**: PyTorch is not a Python binding into a monolithic C++ framework. It is built to be deeply integrated into Python. You can use it naturally like you would use `NumPy `__, - `SciPy `__, `scikit-learn <(https://scikit-learn.org/>`__, + `SciPy `__, `scikit-learn <https://scikit-learn.org/>`__, or other Python libraries. You can write your new neural network layers in Python itself, using your favorite libraries and use packages such as `Cython `__ and diff --git a/docs/source/conf.py b/docs/source/conf.py index 0f89d2799fa5..fe548737b313 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -2796,6 +2796,7 @@ "ConstraintViolationError", "DynamicDimConstraintPrinter", "GuardOnDataDependentSymNode", + "PendingUnbackedSymbolNotFound", "LoggingShapeGuardPrinter", "RelaxedUnspecConstraint", "RuntimeAssert", diff --git a/docs/source/notes/serialization.rst b/docs/source/notes/serialization.rst index 09fd9e858b87..225486cdedac 100644 --- a/docs/source/notes/serialization.rst +++ b/docs/source/notes/serialization.rst @@ -394,3 +394,6 @@ The following utility functions are related to serialization: .. autofunction:: set_default_load_endianness .. autofunction:: get_default_mmap_options .. autofunction:: set_default_mmap_options +.. autofunction:: add_safe_globals +.. autofunction:: clear_safe_globals +.. autofunction:: get_safe_globals diff --git a/docs/source/torch.compiler_troubleshooting.rst b/docs/source/torch.compiler_troubleshooting.rst index f98a4dc779b6..7158149c09e1 100644 --- a/docs/source/torch.compiler_troubleshooting.rst +++ b/docs/source/torch.compiler_troubleshooting.rst @@ -727,3 +727,11 @@ and C++ backtrace whenever this symbol was created. ``TORCHDYNAMO_EXTENDED_DEBUG_CPP`` - provides extended debug information (C++ backtrace) for all extended debug settings as well as errors. For example, set this to "1". The C++ backtrace is slow and very spammy so it is not included by default with extended debugging. + +Cold Start Timing and Cache Corruption Debugging +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to measure the cold start compilation time or debug a cache corruption, +it is possible to pass ``TORCHINDUCTOR_FORCE_DISABLE_CACHES=1`` or set +``torch._inductor.config.force_disable_caches = True``, which will override any +other caching config option and disable all compile time caching.
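For concreteness, the new troubleshooting knob can be exercised in a few lines; the snippet below is a minimal sketch (the toy function and the timing of the first call are illustrative assumptions; only ``TORCHINDUCTOR_FORCE_DISABLE_CACHES`` and ``torch._inductor.config.force_disable_caches`` come from the documentation above)::

    import os

    # Assumption: the env var must be set before the first compile happens.
    os.environ["TORCHINDUCTOR_FORCE_DISABLE_CACHES"] = "1"

    import time

    import torch
    import torch._inductor.config as inductor_config

    # Equivalent in-process switch; overrides any other caching config option.
    inductor_config.force_disable_caches = True

    @torch.compile
    def f(x):
        return (x.sin() + x.cos()).sum()

    t0 = time.perf_counter()
    f(torch.randn(1024))  # the first call pays the full cold-start compile
    print(f"cold start compile+run: {time.perf_counter() - t0:.2f}s")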
diff --git a/functorch/compile/__init__.py b/functorch/compile/__init__.py index 96b853cd2e27..e7548a5ff6b9 100644 --- a/functorch/compile/__init__.py +++ b/functorch/compile/__init__.py @@ -25,7 +25,6 @@ from torch._functorch.partitioners import ( default_partition, draw_graph, - draw_joint_graph, min_cut_rematerialization_partition, ) from torch._functorch.python_key import pythonkey_decompose diff --git a/pyproject.toml b/pyproject.toml index 3ff4b94447f9..07f075082097 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -51,7 +51,6 @@ ignore = [ "B019", "B023", "B028", # No explicit `stacklevel` keyword argument found - "B904", # Migrate from TRY200 "E402", "C408", # C408 ignored because we like the dict keyword argument syntax "E501", # E501 is not flexible enough, we're using B950 instead @@ -90,6 +89,7 @@ ignore = [ ] select = [ "B", + "B904", # Re-raised error without specifying the cause via the from keyword "C4", "G", "E", @@ -133,7 +133,6 @@ select = [ "RUF017", "RUF018", # no assignment in assert "TRY002", # ban vanilla raise (todo fix NOQAs) - "TRY200", # TODO: migrate from deprecated alias "TRY302", "TRY401", # verbose-log-message "UP", diff --git a/setup.py b/setup.py index ce2798847161..93245d971be8 100644 --- a/setup.py +++ b/setup.py @@ -151,9 +151,6 @@ # USE_REDIS # Whether to use Redis for distributed workflows (Linux only) # -# USE_ZSTD -# Enables use of ZSTD, if the libraries are found -# # USE_ROCM_KERNEL_ASSERT=1 # Enable kernel assert in ROCm platform # @@ -1311,6 +1308,7 @@ def main(): "include/torch/csrc/onnx/*.h", "include/torch/csrc/profiler/*.h", "include/torch/csrc/profiler/orchestration/*.h", + "include/torch/csrc/profiler/standalone/*.h", "include/torch/csrc/profiler/stubs/*.h", "include/torch/csrc/profiler/unwind/*.h", "include/torch/csrc/profiler/python/*.h", diff --git a/test/allowlist_for_publicAPI.json b/test/allowlist_for_publicAPI.json index 52f623ba87d6..ac77325188ee 100644 --- a/test/allowlist_for_publicAPI.json +++ b/test/allowlist_for_publicAPI.json @@ -46,6 +46,9 @@ "scatter", "gather" ], + "torch.csrc.jit.tensorexpr.scripts.bisect": [ + "bisect" + ], "torch.cuda.nccl": [ "init_rank", "is_available", @@ -902,7 +905,9 @@ "cast", "closing", "contextmanager", - "get_source_lines_and_file" + "get_source_lines_and_file", + "get_default_mmap_options", + "set_default_mmap_options" ], "torch.sparse": [ "BFloat16Tensor", @@ -1104,6 +1109,7 @@ "_add_relu", "_add_relu_", "_addmm_activation", + "_aminmax", "_amp_foreach_non_finite_check_and_unscale_", "_amp_update_scale_", "_assert_async", @@ -1313,7 +1319,6 @@ "_values_copy", "_weight_norm", "_weight_norm_interface", - "aminmax", "autocast", "broadcast_shapes", "candidate", @@ -1714,7 +1719,12 @@ "env", "get_logger", "macros", - "record" + "record", + "DefaultLogsSpecs", + "LogsSpecs", + "Optional", + "Set", + "Type" ], "torch.fx.annotate": [ "Proxy", @@ -1932,7 +1942,8 @@ "track_tensor", "track_tensor_tree", "wrap_key", - "wrapper_and_args_for_make_fx" + "wrapper_and_args_for_make_fx", + "TorchFunctionMetadataMode" ], "torch.fx.experimental.rewriter": [ "Any", @@ -2013,7 +2024,14 @@ "parallel_and", "parallel_or", "safe_expand", - "uninteresting_files" + "uninteresting_files", + "CallMethodKey", + "DivideByKey", + "PropagateUnbackedSymInts", + "ShapeEnvSettings", + "log_lru_cache_stats", + "PendingUnbackedSymbolNotFound", + "lru_cache" ], "torch.fx.experimental.unification.match": [ "first", @@ -2603,5 +2621,135 @@ ], "torch.version": [ "get_file_path" + ], + "torch.ao.nn.intrinsic.modules": [ + 
"_FusedModule" + ], + "torch.distributed.benchmarks.benchmark_ddp_rpc": [ + "BackendType", + "DDP", + "DistributedOptimizer", + "RRef", + "TensorPipeRpcBackendOptions" + ], + "torch.distributed.pipelining": [ + "ArgsChunkSpec", + "KwargsChunkSpec", + "Pipe", + "PipelineStage", + "SplitPoint", + "annotate_split_points", + "pipe_split", + "pipeline" + ], + "torch.distributed.pipelining.PipelineSchedule": [ + "ABC", + "Any", + "Callable", + "Dict", + "List", + "Optional", + "Pipe", + "PipelineStageBase", + "Tuple", + "Union", + "abstractmethod", + "defaultdict", + "merge_chunks", + "record_function", + "split_args_kwargs_into_chunks" + ], + "torch.distributed.pipelining.microbatch": [ + "Any", + "Dict", + "List", + "Optional", + "Tuple", + "tree_flatten", + "tree_unflatten" + ], + "torch.export": [ + "Constraint", + "ShapesCollection" + ], + "torch.export.dynamic_shapes": [ + "Constraint", + "ShapesCollection" + ], + "torch.export.graph_signature": [ + "TokenArgument" + ], + "torch.fx.experimental.shape_inference.infer_shape": [ + "DimDynamic", + "FakeTensorMode", + "LocalSource", + "ShapeEnv", + "defaultdict", + "infer_symbol_values", + "make_fx" + ], + "torch.fx.experimental.shape_inference.infer_symbol_values": [ + "Any", + "DefaultDict", + "Dict", + "List", + "Tuple", + "Union" + ], + "torch.fx.passes.runtime_assert": [ + "Any", + "Dict", + "GraphModule", + "Optional", + "Set", + "ShapeEnv", + "SymNode", + "compatibility", + "lazy_format_graph_code" + ], + "torch.library": [ + "opcheck", + "register_autograd", + "register_kernel" + ], + "torch.mtia": [ + "DeferredMtiaCallError", + "StreamContext" + ], + "torch.onnx.symbolic_helper": [ + "Any", + "Callable", + "List", + "Literal", + "NoReturn", + "Number", + "Optional", + "Sequence", + "Set", + "Tuple", + "Union" + ], + "torch.onnx.symbolic_opset18": [ + "amax", + "amin", + "aminmax", + "embedding_bag", + "linalg_vector_norm", + "max", + "maximum", + "min", + "minimum" + ], + "torch.onnx.symbolic_opset20": [ + "_affine_grid_generator", + "_grid_sampler", + "convert_grid_sample_mode" + ], + "torch.utils.data.datapipes.dataframe.dataframe_wrapper": [ + "Any", + "Optional" + ], + "torch.utils.hipify.hipify_python": [ + "TrieNode" ] -} +} \ No newline at end of file diff --git a/test/cpp/api/CMakeLists.txt b/test/cpp/api/CMakeLists.txt index 42b67d8cb25c..b0e296ad2309 100644 --- a/test/cpp/api/CMakeLists.txt +++ b/test/cpp/api/CMakeLists.txt @@ -10,6 +10,7 @@ set(TORCH_API_TEST_SOURCES ${TORCH_API_TEST_DIR}/functional.cpp ${TORCH_API_TEST_DIR}/init.cpp ${TORCH_API_TEST_DIR}/integration.cpp + ${TORCH_API_TEST_DIR}/ivalue.cpp ${TORCH_API_TEST_DIR}/jit.cpp ${TORCH_API_TEST_DIR}/memory.cpp ${TORCH_API_TEST_DIR}/meta_tensor.cpp diff --git a/test/cpp/api/ivalue.cpp b/test/cpp/api/ivalue.cpp new file mode 100644 index 000000000000..fa8dcc25cd4d --- /dev/null +++ b/test/cpp/api/ivalue.cpp @@ -0,0 +1,63 @@ +#include + +#include + +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include +#include + +using namespace torch::test; +using namespace torch::nn; +using namespace torch::optim; + +TEST(IValueTest, DeepcopyTensors) { + torch::Tensor t0 = torch::randn({2, 3}); + torch::Tensor t1 = torch::randn({3, 4}); + torch::Tensor t2 = t0.detach(); + torch::Tensor t3 = t0; + torch::Tensor t4 = t1.as_strided({2, 3}, {3, 1}, 2); + std::vector tensor_vector = {t0, t1, t2, t3, t4}; + c10::List tensor_list(tensor_vector); + torch::IValue tensor_list_ivalue(tensor_list); + + c10::IValue::CompIdentityIValues ivalue_compare; + + 
// Make sure our setup configuration is correct + ASSERT_TRUE(ivalue_compare(tensor_list[0].get(), tensor_list[3].get())); + ASSERT_FALSE(ivalue_compare(tensor_list[0].get(), tensor_list[1].get())); + ASSERT_FALSE(ivalue_compare(tensor_list[0].get(), tensor_list[2].get())); + ASSERT_FALSE(ivalue_compare(tensor_list[1].get(), tensor_list[4].get())); + ASSERT_TRUE(tensor_list[0].get().isAliasOf(tensor_list[2].get())); + + c10::IValue copied_ivalue = tensor_list_ivalue.deepcopy(); + c10::List copied_list = copied_ivalue.toList(); + + // Make sure the deepcopied values keep the same identity relations + ASSERT_TRUE(ivalue_compare(copied_list[0].get(), copied_list[3].get())); + ASSERT_FALSE(ivalue_compare(copied_list[0].get(), copied_list[1].get())); + ASSERT_FALSE(ivalue_compare(copied_list[0].get(), copied_list[2].get())); + ASSERT_FALSE(ivalue_compare(copied_list[1].get(), copied_list[4].get())); + // NOTE: this is actually incorrect. Ideally, these _should_ be aliases. + ASSERT_FALSE(copied_list[0].get().isAliasOf(copied_list[2].get())); + + ASSERT_TRUE(copied_list[0].get().toTensor().allclose( + tensor_list[0].get().toTensor())); + ASSERT_TRUE(copied_list[1].get().toTensor().allclose( + tensor_list[1].get().toTensor())); + ASSERT_TRUE(copied_list[2].get().toTensor().allclose( + tensor_list[2].get().toTensor())); + ASSERT_TRUE(copied_list[3].get().toTensor().allclose( + tensor_list[3].get().toTensor())); + ASSERT_TRUE(copied_list[4].get().toTensor().allclose( + tensor_list[4].get().toTensor())); +} diff --git a/test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py b/test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py index 4c22ea347156..9139b62f1367 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py @@ -67,13 +67,13 @@ def _test_clip_grad_norm( ) comm_mode = CommDebugMode() with comm_mode: + # foreach defaults to on, so we don't need to specify it.
total_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), max_norm=max_norm, norm_type=norm_type, - foreach=True, ) - self.assertEqual(ref_total_norm, total_norm) + self.assertEqual(ref_total_norm, total_norm.full_tensor()) # Expect one all-reduce per mesh dim for partial -> replicate expected_all_reduces = len(total_norm.placements) self.assertEqual( diff --git a/test/distributed/_composable/fsdp/test_fully_shard_comm.py b/test/distributed/_composable/fsdp/test_fully_shard_comm.py index 115c1f93227c..283b8ab2b944 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_comm.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_comm.py @@ -244,7 +244,7 @@ def _test_reduce_scatter( group = fsdp_param_group.mesh_info.shard_process_group self.assertEqual(group.size(), self.world_size) all_reduce_stream = torch.cuda.Stream() - view_out_event = foreach_reduce( + post_reduce_event, _ = foreach_reduce( fsdp_params, unsharded_grads, group, @@ -254,8 +254,10 @@ def _test_reduce_scatter( device=self.device, all_reduce_group=None, all_reduce_stream=all_reduce_stream, + all_reduce_grads=True, + partial_reduce_output=None, ) - torch.cuda.current_stream().wait_event(view_out_event) + torch.cuda.current_stream().wait_event(post_reduce_event) # Check reduce-scatter correctness predivide_factor, postdivide_factor = _get_gradient_divide_factors( diff --git a/test/distributed/_composable/fsdp/test_fully_shard_init.py b/test/distributed/_composable/fsdp/test_fully_shard_init.py index 958f375fe2c3..73e078c0b2f2 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_init.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_init.py @@ -24,6 +24,10 @@ Shard, ) from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp._init_utils import ( + _init_inter_node_process_group, + _init_intra_node_process_group, +) from torch.distributed.tensor.parallel import ( ColwiseParallel, parallelize_module, @@ -672,7 +676,7 @@ def world_size(self) -> int: return 4 @unittest.skipIf(not TEST_CUDA, "no cuda") - def test_process_group_init(self): + def test_1d_process_group_init(self): assert self.world_size == 4, f"{self.world_size}" # For convenience, use device mesh's infra to construct the DP PG # (in practice, the trainer would do it manually via `new_group()`) @@ -684,10 +688,10 @@ def test_process_group_init(self): dp_pg = ref_dp_mesh.get_group(0) # Check the `from_group()` API for correctness - dp_mesh = DeviceMesh.from_group(dp_pg, "cuda") + dp_mesh = DeviceMesh.from_group(dp_pg, "cuda", mesh_dim_names=("dp",)) + # Only compare the mesh tensors, not `DeviceMesh` objects themselves, + # since the ref has a parent mesh, while the `from_group` one does not self.assertEqual(dp_mesh.mesh, ref_dp_mesh.mesh) - self.assertEqual(dp_mesh, ref_dp_mesh) - # self.assertFalse(hasattr(dp_mesh, "_coordinate_on_dim")) self.assertEqual(dp_mesh._coordinate_on_dim, ref_dp_mesh._coordinate_on_dim) self.assertEqual(dp_mesh._dim_group_infos, ref_dp_mesh._dim_group_infos) @@ -715,6 +719,90 @@ def test_process_group_init(self): elif self.rank in (2, 3): dist.broadcast(inp, src=2, group=tp_mesh.get_group(0)) + ref_loss = ref_model(inp).sum() + ref_loss.backward() + loss = model(inp).sum() + loss.backward() + self.assertEqual(loss, ref_loss) + for param, ref_param in zip(model.parameters(), ref_model.parameters()): + # Cannot compare `DTensor`s directly since their meshes are not + # equal due to the ref parameter's mesh having a parent mesh while + # the other's mesh does not + 
self.assertEqual(param.to_local(), ref_param.to_local()) + self.assertEqual(param.device_mesh.mesh, ref_param.device_mesh.mesh) + self.assertEqual(param.grad.to_local(), ref_param.grad.to_local()) + self.assertEqual( + param.grad.device_mesh.mesh, ref_param.grad.device_mesh.mesh + ) + + @unittest.skipIf(not TEST_CUDA, "no cuda") + def test_2d_process_group_init(self): + shard_mesh_dim_size = 2 + assert ( + self.world_size % shard_mesh_dim_size == 0 + ), f"Expects {self.world_size} to be divisible by {shard_mesh_dim_size}" + replicate_mesh_dim_size = self.world_size // shard_mesh_dim_size + mesh_dim_names = ("replicate", "shard") + ref_mesh = init_device_mesh( + "cuda", + (replicate_mesh_dim_size, shard_mesh_dim_size), + mesh_dim_names=mesh_dim_names, + ) + + # Use the global PG as the parent group (in practice, this could be a + # subgroup of the global PG) + dp_group = dist.distributed_c10d._get_default_group() + dp_shard_group = _init_intra_node_process_group(shard_mesh_dim_size) + dp_replicate_group = _init_inter_node_process_group( + dp_group, replicate_mesh_dim_size + ) + mesh_tensor = torch.tensor( + dist.get_process_group_ranks(dp_group), dtype=torch.int + ).view(replicate_mesh_dim_size, shard_mesh_dim_size) + + # Check the `from_group()` API for correctness + mesh = DeviceMesh.from_group( + [dp_replicate_group, dp_shard_group], + "cuda", + mesh_dim_names=mesh_dim_names, + mesh=mesh_tensor, + ) + self.assertEqual(mesh.mesh, ref_mesh.mesh) + self.assertEqual(mesh._coordinate_on_dim, ref_mesh._coordinate_on_dim) + for (tag, ranks, group_name), (ref_tag, ref_ranks, ref_group_name) in zip( + mesh._dim_group_infos, ref_mesh._dim_group_infos + ): + # Since we manually constructed new subgroups, the test and ref + # groups are not the same + self.assertEqual(ranks, ref_ranks) + for mesh_dim_name in mesh_dim_names: + child_mesh = mesh[mesh_dim_name] + ref_child_mesh = ref_mesh[mesh_dim_name] + self.assertEqual(child_mesh, ref_child_mesh) + child_ranks = dist.distributed_c10d.get_process_group_ranks( + child_mesh.get_group() + ) + ref_child_ranks = dist.distributed_c10d.get_process_group_ranks( + ref_child_mesh.get_group() + ) + self.assertEqual(child_ranks, ref_child_ranks) + + # Check HSDP forward/backward parity + torch.manual_seed(42) + mlp_dim = 8 + ref_model = MLP(mlp_dim) + for param in ref_model.parameters(): + dist.broadcast(param.detach(), src=0) + model = copy.deepcopy(ref_model) + + # Parallelize the ref model with the ref mesh + for module in (ref_model.in_proj, ref_model.out_proj, ref_model): + fully_shard(module, mesh=ref_mesh) + # Parallelize the test model with the new mesh from the PG + for module in (model.in_proj, model.out_proj, model): + fully_shard(module, mesh=mesh) + + inp = torch.randn((4, mlp_dim), device="cuda") ref_loss = ref_model(inp).sum() ref_loss.backward() loss = model(inp).sum() diff --git a/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py b/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py index fde705bdd069..cab4b2496385 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_state_dict.py @@ -8,8 +8,8 @@ import torch import torch.nn as nn -from torch.distributed._composable.fsdp import fully_shard -from torch.distributed._tensor import DTensor +from torch.distributed._composable.fsdp import CPUOffloadPolicy, fully_shard +from torch.distributed._tensor import distribute_tensor, DTensor from torch.distributed.device_mesh import DeviceMesh,
init_device_mesh from torch.distributed.tensor.parallel import ( ColwiseParallel, @@ -68,6 +68,42 @@ def _test_1d_state_dict_save_load(self, mlp_dim: int): for key, value in ref_sharded_sd.items(): self.assertEqual(value, sharded_sd[key]) + @skip_if_lt_x_gpu(2) + def test_1d_state_dict_cpu_offload(self): + mlp_dim = 4 + offload_policy = CPUOffloadPolicy(pin_memory=True) + torch.manual_seed(42) + with torch.device("meta"): + model = nn.Sequential( + nn.Linear(mlp_dim, mlp_dim, bias=False), + nn.Linear(mlp_dim, mlp_dim, bias=False), + ) + for module in model: + fully_shard(module, offload_policy=offload_policy) + fully_shard(model, offload_policy=offload_policy) + + # split full sd into multiple pieces + # to test loading with `strict=False` + state_dicts = [] + for name, dtensor in model.named_parameters(): + full_tensor = torch.randn(dtensor.size()) + sharded_tensor = distribute_tensor( + full_tensor, dtensor.device_mesh, dtensor.placements + ) + state_dicts.append({name: sharded_tensor}) + + # check that we can load with some parameters still on meta device + for sd in state_dicts: + model.load_state_dict(sd, assign=True, strict=False) + + # lazy init without error + inp = torch.rand((mlp_dim, mlp_dim), device="cuda") + model(inp) + + state_dict = model.state_dict() + for name, dtensor in state_dict.items(): + self.assertEqual(dtensor.device.type, "cpu") + @skip_if_lt_x_gpu(2) def test_2d_state_dict_save_load(self): dp_size = 2 diff --git a/test/distributed/_composable/fsdp/test_fully_shard_training.py b/test/distributed/_composable/fsdp/test_fully_shard_training.py index f834cced1eee..392596549d77 100644 --- a/test/distributed/_composable/fsdp/test_fully_shard_training.py +++ b/test/distributed/_composable/fsdp/test_fully_shard_training.py @@ -672,6 +672,12 @@ def test_gradient_accumulation(self): "mode": ["all", "root_only", "some_mlps"], "reshard_after_backward": [False, True], "offload_policy": [OffloadPolicy(), CPUOffloadPolicy()], + # For HSDP only: + # `True`: reduce-scatter only (no all-reduce) each microbatch + # until the last microbatch + # `False`: neither reduce-scatter nor all-reduce each + # microbatch until the last microbatch + "reduce_scatter_only": [False, True], }, self._test_gradient_accumulation, ) @@ -683,15 +689,20 @@ def _test_gradient_accumulation( mode: str, reshard_after_backward: bool, offload_policy: OffloadPolicy, + reduce_scatter_only: bool, # for HSDP ): if ( - not reshard_after_backward - and (reshard_after_forward is not False or mode == "some_mlps") - ) or ( - isinstance(offload_policy, CPUOffloadPolicy) - and reshard_after_forward is not True + ( + not reshard_after_backward + and (reshard_after_forward is not False or mode == "some_mlps") + ) + or ( + isinstance(offload_policy, CPUOffloadPolicy) + and reshard_after_forward is not True + ) + or (mesh.ndim != 2 and reduce_scatter_only) ): - return # skip since not common + return # skip since not common or applicable torch.manual_seed(42) batch_size, lin_dim, num_mlps, num_microbatches = (2, 32, 3, 3) @@ -713,29 +724,35 @@ def _test_gradient_accumulation( ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) optim = torch.optim.Adam(model.parameters(), lr=1e-2) + def set_grad_sync_flag( + module: nn.Module, is_last_microbatch: bool, recurse: bool = True + ): + if reduce_scatter_only: + module.set_requires_all_reduce(is_last_microbatch, recurse=recurse) + else: + module.set_requires_gradient_sync(is_last_microbatch, recurse=recurse) + + def set_backward_flags(_model: nn.Module, 
is_last_microbatch: bool): + if mode == "all": + set_grad_sync_flag(_model, is_last_microbatch) + if not reshard_after_backward: + _model.set_reshard_after_backward(is_last_microbatch) + elif mode == "some_mlps": + for mlp in model[1 : 1 + num_mlps_to_disable_reduce_scatter]: + set_grad_sync_flag(mlp, is_last_microbatch) + if not reshard_after_backward: + mlp.set_reshard_after_backward(is_last_microbatch) + elif mode == "root_only": + set_grad_sync_flag(model, is_last_microbatch, recurse=False) + if not reshard_after_backward: + model.set_reshard_after_backward(is_last_microbatch, recurse=False) + torch.manual_seed(42 + self.rank + 1) for iter_idx in range(5): with CommDebugMode() as comm_mode: for microbatch_idx in range(num_microbatches): is_last_microbatch = microbatch_idx == num_microbatches - 1 - if mode == "all": - model.set_requires_gradient_sync(is_last_microbatch) - if not reshard_after_backward: - model.set_reshard_after_backward(is_last_microbatch) - elif mode == "some_mlps": - for mlp in model[1 : 1 + num_mlps_to_disable_reduce_scatter]: - mlp.set_requires_gradient_sync(is_last_microbatch) - if not reshard_after_backward: - mlp.set_reshard_after_backward(is_last_microbatch) - elif mode == "root_only": - model.set_requires_gradient_sync( - is_last_microbatch, recurse=False - ) - if not reshard_after_backward: - model.set_reshard_after_backward( - is_last_microbatch, recurse=False - ) - + set_backward_flags(model, is_last_microbatch) inp = torch.randn(batch_size, lin_dim, device="cuda") losses: List[torch.Tensor] = [] for _model in (ref_model, model): @@ -760,10 +777,15 @@ def _test_gradient_accumulation( elif mode == "root_only": # Expect additional reduce-scatters for all MLPs expected_reduce_scatter_count += (num_mlps) * (num_microbatches - 1) - self.assertEqual(reduce_scatter_count, expected_reduce_scatter_count) expected_all_reduce_count = ( expected_reduce_scatter_count if mesh.ndim == 2 else 0 ) + if reduce_scatter_only: + # Specifically for HSDP: if only reduce-scattering but not + # all-reducing until the last microbatch, expect one + # reduce-scatter per MLP plus one for the root per microbatch + expected_reduce_scatter_count = (num_mlps + 1) * num_microbatches + self.assertEqual(reduce_scatter_count, expected_reduce_scatter_count) self.assertEqual(all_reduce_count, expected_all_reduce_count) # Expect one all-gather per MLP plus one for the root's linear in @@ -903,11 +925,11 @@ def _test_train_parity_2d_mlp( model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=False) model.parallelize( tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=False) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") @@ -935,6 +957,8 @@ def test_train_parity_2d_transformer_checkpoint_resume(self): # else construct new ones (requiring eager optim state init) "reuse_model_optim": [False, True], "optimizer_class": [torch.optim.Adam, torch.optim.AdamW], + # TODO: need to update `parallelize` before including foreach=True for testing + "foreach": [False], }, self._test_train_parity_2d_transformer_checkpoint_resume, ) @@ -944,6 +968,7 @@ def _test_train_parity_2d_transformer_checkpoint_resume( self, use_seq_parallel: bool, reuse_model_optim: 
bool, optimizer_class: Type[torch.optim.Optimizer], + foreach: bool, ): def train_step( _model: nn.Module, _optim: torch.optim.Optimizer, _inp: torch.Tensor @@ -969,7 +994,9 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_no_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_no_cp = optimizer_class(model_no_cp.parameters(), lr=1e-2) + optim_no_cp = optimizer_class( + model_no_cp.parameters(), lr=1e-2, foreach=foreach + ) torch.manual_seed(42 + global_mesh["dp"].get_local_rank() + 1) inp = torch.randint(0, model_args.vocab_size, (3, 16), device="cuda") @@ -980,7 +1007,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): # model/optimizer, load checkpoint, and run another iteration torch.manual_seed(seed) model_cp = parallelize(Transformer(model_args), global_mesh, use_seq_parallel) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) loss_cp1 = train_step(model_cp, optim_cp, inp) self.assertEqual(loss_no_cp1, loss_cp1) @@ -1009,7 +1036,7 @@ def parallelize(_model: Transformer, mesh: DeviceMesh, use_seq_parallel: bool): model_cp = parallelize( Transformer(model_args), global_mesh, use_seq_parallel ) - optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2) + optim_cp = optimizer_class(model_cp.parameters(), lr=1e-2, foreach=foreach) self.assertNotEqual(loss_no_cp2, train_step(model_cp, optim_cp, inp)) sharded_sd = { @@ -1049,6 +1076,7 @@ def test_2d_mlp_with_nd_mesh(self): "reshard_after_forward": [False, True], "use_activation_checkpointing": [False, True], "mlp_dim": [3, 16, 17], + "foreach": [False], }, functools.partial(self._test_2d_mlp_with_nd_mesh, global_mesh), ) @@ -1059,6 +1087,7 @@ def _test_2d_mlp_with_nd_mesh( reshard_after_forward: bool, use_activation_checkpointing: bool, mlp_dim: int, + foreach: bool, ): global_mesh = self.init_global_mesh() pp_mesh, dp_mesh, tp_mesh = ( @@ -1072,11 +1101,11 @@ def _test_2d_mlp_with_nd_mesh( model = MLPStack(mlp_dim) ref_model = copy.deepcopy(model).cuda() replicate(ref_model, device_ids=[self.rank], process_group=dp_pg) - ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2) + ref_optim = torch.optim.Adam(ref_model.parameters(), lr=1e-2, foreach=foreach) model.parallelize( tp_mesh, dp_mesh, use_activation_checkpointing, reshard_after_forward ) - optim = torch.optim.Adam(model.parameters(), lr=1e-2) + optim = torch.optim.Adam(model.parameters(), lr=1e-2, foreach=foreach) torch.manual_seed(42 + dp_pg.rank() + 1) device = torch.device("cuda") diff --git a/test/distributed/_tensor/test_optimizers.py b/test/distributed/_tensor/test_optimizers.py index e7ce18eefa63..512b5c97ce6a 100644 --- a/test/distributed/_tensor/test_optimizers.py +++ b/test/distributed/_tensor/test_optimizers.py @@ -84,23 +84,26 @@ def _assert_optimizer( # Default 'rtol' and 'atol' for attr:`~torch.float32` are ``1.3e-6`` and ``1e-5`` self.assertEqual(p1, p2, atol=atol, rtol=rtol) + def test_optimizer_foreach_supported_types_include_DTensor(self): + from torch.optim.optimizer import _foreach_supported_types + + self.assertTrue(DTensor in _foreach_supported_types) + @with_comms def test_adam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) # TODO: add fused_adam support adam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, 
"foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, - {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "foreach": True}, + {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True}, { "lr": 0.1, "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "fused": True}, {"lr": 0.1, "weight_decay": 0.05, "amsgrad": True, "fused": True}, @@ -132,16 +135,15 @@ def test_adamw_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamw_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, "amsgrad": True, - "foreach": True, }, { "lr": 0.1, @@ -150,7 +152,6 @@ def test_adamw_1d_sharding(self): "weight_decay": 0.05, "maximize": True, "amsgrad": True, - "foreach": True, }, {"lr": 0.1, "weight_decay": 0.05, "fused": True}, { @@ -191,16 +192,17 @@ def test_sgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) sgd_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "momentum": 0.05, "foreach": False}, {"lr": 0.1, "momentum": 0.05}, - {"lr": 0.1, "momentum": 0.05, "foreach": True}, - {"lr": 0.1, "momentum": 0.06, "dampening": 0.07, "foreach": True}, + {"lr": 0.1, "momentum": 0.06, "dampening": 0.07}, { "lr": 0.1, "momentum": 0.08, "weight_decay": 0.05, "nesterov": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -208,7 +210,6 @@ def test_sgd_1d_sharding(self): "weight_decay": 0.05, "nesterov": True, "maximize": True, - "foreach": True, }, ] @@ -231,14 +232,15 @@ def test_adagrad_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adagrad_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lr_decay": 0.05}, - {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.05, "foreach": False}, + {"lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "foreach": False}, { "lr": 0.1, "lr_decay": 0.02, "weight_decay": 0.05, "initial_accumulator_value": 0.03, + "foreach": False, }, { "lr": 0.1, @@ -246,6 +248,7 @@ def test_adagrad_1d_sharding(self): "weight_decay": 0.05, "initial_accumulator_value": 0.03, "eps": 1e-6, + "foreach": False, }, { "lr": 0.1, @@ -254,6 +257,7 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 0.03, "eps": 1e-6, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -262,7 +266,6 @@ def test_adagrad_1d_sharding(self): "initial_accumulator_value": 0.03, "eps": 1e-6, "maximize": True, - "foreach": True, }, ] @@ -285,16 +288,23 @@ def test_RMSprop_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) RMSprop_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "alpha": 0.85}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6}, - {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "alpha": 0.88, "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "alpha": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "alpha": 0.88, "eps": 1e-6, "weight_decay": 0.05, "momentum": 0.9, + "foreach": False, }, { "lr": 0.1, @@ -303,6 +313,7 @@ def test_RMSprop_1d_sharding(self): "weight_decay": 0.05, "momentum": 0.9, "centered": True, + "foreach": False, }, { "lr": 0.1, @@ -312,6 +323,7 @@ def 
test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, + "foreach": False, }, { "lr": 0.1, @@ -321,7 +333,6 @@ def test_RMSprop_1d_sharding(self): "momentum": 0.9, "centered": True, "maximize": True, - "foreach": True, }, ] @@ -344,23 +355,27 @@ def test_adadelta_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adadelta_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "rho": 0.85}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-5}, - {"lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "rho": 0.85, "foreach": False}, + {"lr": 0.1, "rho": 0.88, "eps": 1e-5, "foreach": False}, + { + "lr": 0.1, + "rho": 0.88, + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "rho": 0.88, "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -384,15 +399,14 @@ def test_nadam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) nadam_configs = [ - {"lr": 0.1}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -400,7 +414,6 @@ def test_nadam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -423,15 +436,17 @@ def test_radam_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) radam_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "weight_decay": 0.05}, - {"lr": 0.1, "weight_decay": 0.05, "foreach": True}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "weight_decay": 0.05, "foreach": False}, + { + "lr": 0.1, + "weight_decay": 0.05, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, @@ -439,7 +454,6 @@ def test_radam_1d_sharding(self): "eps": 1e-6, "weight_decay": 0.05, "decoupled_weight_decay": True, - "foreach": True, }, ] @@ -462,23 +476,27 @@ def test_adamax_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) adamax_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "betas": (0.6, 0.66)}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6}, - {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "foreach": False}, + {"lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "foreach": False}, + { + "lr": 0.1, + "betas": (0.6, 0.66), + "eps": 1e-6, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, }, { "lr": 0.1, "betas": (0.6, 0.66), "eps": 1e-6, "weight_decay": 0.05, - "foreach": True, "maximize": True, }, ] @@ -502,11 +520,18 @@ def test_asgd_1d_sharding(self): mesh = DeviceMesh(self.device_type, list(range(self.world_size))) asgd_configs = [ - {"lr": 0.1}, - {"lr": 0.1, "lambd": 0.001}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5}, - {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "weight_decay": 0.05}, + {"lr": 0.1, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "foreach": False}, + {"lr": 0.1, "lambd": 0.001, "alpha": 0.85, "t0": 1e5, "foreach": False}, + { + 
"lr": 0.1, + "lambd": 0.001, + "alpha": 0.85, + "t0": 1e5, + "weight_decay": 0.05, + "foreach": False, + }, { "lr": 0.1, "lambd": 0.001, diff --git a/test/distributed/_tensor/test_view_ops.py b/test/distributed/_tensor/test_view_ops.py index 429e62588651..2ea89e34789b 100644 --- a/test/distributed/_tensor/test_view_ops.py +++ b/test/distributed/_tensor/test_view_ops.py @@ -11,9 +11,9 @@ from torch.distributed._tensor.debug import CommDebugMode from torch.distributed._tensor.ops.view_ops import ( Broadcast, + dim_maps, Flatten, InputDim, - ops, Repeat, Singleton, Split, @@ -130,8 +130,8 @@ def world_size(self) -> int: return 6 def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh): - spec = ops[op] - rules = spec.dim_map(*args, **kwargs) + dim_map = dim_maps[op] + rules = dim_map(*args, **kwargs) outputs = op(*args, **kwargs) flat_args = pytree.arg_tree_leaves(*args) in_shape = flat_args[0].shape @@ -163,7 +163,6 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh): ) for in_shard in all_sharding_choices: - # print(f' |--- {in_shard}') in_dt = distribute_tensor(args[0], device_mesh, in_shard) comm_mode = CommDebugMode() @@ -180,7 +179,7 @@ def call_dt_test(self, op, args, kwargs, device_mesh: DeviceMesh): self.assertEqual(outputs, full_out) def dimmap_test(self, op, args, expected_rule_output): - rules = ops[op].dim_map(*args) + rules = dim_maps[op](*args) self.assertEqual(rules, expected_rule_output) self.call_dt_test(op, args, {}, self.device_mesh) @@ -229,7 +228,7 @@ def test_view_ops(self): ) with self.assertRaises(AssertionError): - ops[torch.broadcast_to].dim_map(randn(24, 36), (1, 2, 4)) + dim_maps[torch.broadcast_to](randn(24, 36), (1, 2, 4)) self.dimmap_test( torch.broadcast_to, @@ -495,14 +494,14 @@ def test_complex_view_ops(self): InputDim(0), Flatten((InputDim(1), InputDim(2))), ) - view_as_complex_rule = ops[torch.view_as_complex].dim_map(inp) + view_as_complex_rule = dim_maps[torch.view_as_complex](inp) self.assertEqual(view_as_complex_rule, expected_view_as_complex_rule) expected_view_as_real_rule = ( InputDim(0), Split(InputDim(1), (13, 2), 0), Split(InputDim(1), (13, 2), 1), ) - view_as_real_rule = ops[torch.view_as_real].dim_map(intermediate) + view_as_real_rule = dim_maps[torch.view_as_real](intermediate) self.assertEqual(view_as_real_rule, expected_view_as_real_rule) # test sharded computation correctness diff --git a/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py b/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py index f24bb131667d..d3fa1851b90d 100644 --- a/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py +++ b/test/distributed/checkpoint/e2e/test_e2e_save_and_load.py @@ -21,6 +21,7 @@ get_state_dict, ) from torch.distributed.checkpoint.state_dict_loader import _load_state_dict_from_keys +from torch.distributed.checkpoint.utils import CheckpointException from torch.distributed.distributed_c10d import ReduceOp from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.api import ShardingStrategy @@ -377,6 +378,30 @@ def test_partial_load(self): loaded_optim_state[k][optim_key], v[optim_key], offload_to_cpu=True ) + @with_comms + @skip_if_lt_x_gpu(4) + @with_temp_dir + def test_overwrite(self): + t1, t2 = torch.randn(10), torch.randn(10) + DCP.save({"random": t1}, checkpoint_id=self.temp_dir) + DCP.save( + {"random": t2}, + storage_writer=DCP.FileSystemWriter(self.temp_dir, overwrite=True), + ) + + sd = {"random": torch.zeros(10)} + DCP.load(sd, checkpoint_id=self.temp_dir) + + 
self.assertTrue(torch.allclose(sd["random"], t2)) + + with self.assertRaisesRegex( + CheckpointException, ".*Checkpoint already exists.*" + ): + DCP.save( + {"random": t2}, + storage_writer=DCP.FileSystemWriter(self.temp_dir, overwrite=False), + ) + class TestNoCPU(DTensorTestBase): @property diff --git a/test/distributed/checkpoint/test_fsspec.py b/test/distributed/checkpoint/test_fsspec.py index b5d41959dc32..e7ca4d65fd6e 100644 --- a/test/distributed/checkpoint/test_fsspec.py +++ b/test/distributed/checkpoint/test_fsspec.py @@ -11,6 +11,7 @@ import torch.nn as nn from torch.distributed.checkpoint._fsspec_filesystem import FsspecReader, FsspecWriter from torch.distributed.checkpoint.optimizer import load_sharded_optimizer_state_dict +from torch.distributed.checkpoint.utils import CheckpointException from torch.distributed.fsdp import FullyShardedDataParallel as FSDP from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType from torch.testing._internal.common_distributed import requires_nccl, skip_if_lt_x_gpu @@ -176,6 +177,32 @@ def opt_at(opt, idx): opt_at(optim, 0)["exp_avg_sq"], opt_at(optim_2, 0)["exp_avg_sq"] ) + @with_comms(init_rpc=False) + @skip_if_lt_x_gpu(2) + @requires_nccl() + @with_temp_dir + def test_overwrite(self): + t1, t2 = torch.randn(10), torch.randn(10) + + dcp.save( + {"random": t1}, storage_writer=FsspecWriter(self.temp_dir, overwrite=False) + ) + dcp.save( + {"random": t2}, storage_writer=FsspecWriter(self.temp_dir, overwrite=True) + ) + + sd = {"random": torch.zeros(10)} + dcp.load(sd, checkpoint_id=self.temp_dir) + self.assertTrue(torch.allclose(sd["random"], t2)) + + with self.assertRaisesRegex( + CheckpointException, ".*Checkpoint already exists.*" + ): + dcp.save( + {"random": t2}, + storage_writer=FsspecWriter(self.temp_dir, overwrite=False), + ) + if __name__ == "__main__": run_tests() diff --git a/test/distributed/elastic/multiprocessing/api_test.py b/test/distributed/elastic/multiprocessing/api_test.py index 9658ed087ab0..75e903807ff9 100644 --- a/test/distributed/elastic/multiprocessing/api_test.py +++ b/test/distributed/elastic/multiprocessing/api_test.py @@ -465,6 +465,30 @@ def test_function_raise(self): self.assertTrue(pc._stderr_tail.stopped()) self.assertTrue(pc._stdout_tail.stopped()) + def test_wait_for_all_child_procs_to_exit(self): + """ + Tests that MultiprocessingContext actually waits for + the child process to exit (not just that the entrypoint fn has + finished running). 
+ """ + + mpc = MultiprocessContext( + name="echo", + entrypoint=echo0, + args={}, + envs={}, + start_method="spawn", + logs_specs=DefaultLogsSpecs(log_dir=self.log_dir()), + ) + + with mock.patch.object( + mpc, "_is_done", return_value=True + ), mock.patch.object(mpc, "_pc"), mock.patch.object( + mpc._pc, "join", side_effect=[True, False, False, True] + ) as mock_join: + mpc._poll() + self.assertEqual(4, mock_join.call_count) + ######################################## # start_processes as binary tests ######################################## diff --git a/test/distributed/pipelining/test_unflatten.py b/test/distributed/pipelining/test_unflatten.py index 9c388d279cdf..37eaf599e4d8 100644 --- a/test/distributed/pipelining/test_unflatten.py +++ b/test/distributed/pipelining/test_unflatten.py @@ -45,7 +45,6 @@ def test_unflatten(self): constant = torch.ones(1, 16, 256, 256) mod = M() - print("Original model:\n", mod) pipe = pipeline( mod, @@ -58,21 +57,19 @@ def test_unflatten(self): orig_state_dict = mod.state_dict() # Check qualnames - print("\nParameters of each stage:") for stage_idx in range(pipe.num_stages): - print(f"\nStage {stage_idx}:") stage_mod = pipe.get_stage_module(stage_idx) for param_name, param in stage_mod.named_parameters(): assert ( param_name in orig_state_dict ), f"{param_name} not in original state dict" - print(f"{param_name}: {param.size()}") + print("Param qualname test passed") # Check equivalence ref = mod(x, constant) out = pipe(x, constant)[0] torch.testing.assert_close(out, ref) - print(f"\nEquivalence test passed {torch.sum(out)} ref {torch.sum(ref)}") + print(f"Equivalence test passed {torch.sum(out)} ref {torch.sum(ref)}") if __name__ == "__main__": diff --git a/test/distributed/tensor/parallel/test_tp_examples.py b/test/distributed/tensor/parallel/test_tp_examples.py index cd038dbbb273..eb5e6b5e5a1d 100644 --- a/test/distributed/tensor/parallel/test_tp_examples.py +++ b/test/distributed/tensor/parallel/test_tp_examples.py @@ -262,8 +262,11 @@ def test_transformer_training(self, is_seq_parallel=False): # Ensure model weights are still the same after update. 
optim.step() - with CommDebugMode() as comm_mode: - optim_tp.step() + from torch.distributed._tensor.experimental import implicit_replication + + with implicit_replication(): + with CommDebugMode() as comm_mode: + optim_tp.step() self._check_module(model, model_tp) if is_seq_parallel: self.assertDictEqual( diff --git a/test/distributed/test_c10d_functional_native.py b/test/distributed/test_c10d_functional_native.py index 54030d1f1d42..775d3f9cc03d 100644 --- a/test/distributed/test_c10d_functional_native.py +++ b/test/distributed/test_c10d_functional_native.py @@ -195,6 +195,18 @@ def test_all_gather_into_tensor_single(self) -> None: assert torch.allclose(output, expect) assert output.eq(expect).all() + # Test out-variant of all_gather_into_tensor + output = torch.empty(expect.shape, device=self.device) + output = torch.ops._c10d_functional.all_gather_into_tensor_out( + input, + self.world_size, + "default", + out=output, + ) + output = torch.ops._c10d_functional.wait_tensor(output) + assert torch.allclose(output, expect) + assert output.eq(expect).all() + # Test Python API and AsyncCollectiveTensor output = all_gather_tensor( input, diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index ebc56588ed57..5a958acdbdd7 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -226,6 +226,14 @@ def opts(self, high_priority_stream=False): def setUp(self): super().setUp() + # Need to skip return code checking for these tests since the child + # processes don't exit cleanly in some cuda versions + self.skip_return_code_checks = [ + self.test_nan_assert_float16.__wrapped__, + self.test_nan_assert_float32.__wrapped__, + self.test_nan_assert_float64.__wrapped__, + ] + # TORCH_NCCL_BLOCKING_WAIT overrides TORCH_NCCL_ASYNC_ERROR_HANDLING hence tests # that use TORCH_NCCL_BLOCKING_WAIT will test it as expected. 
os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "1" @@ -325,6 +333,27 @@ def test_close_pg(self): del pg + @requires_nccl() + @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") + @parametrize("type", [torch.float16, torch.float32, torch.float64]) + @skip_if_rocm + def test_nan_assert(self, type): + os.environ["TORCH_NCCL_NAN_CHECK"] = "1" + store = c10d.FileStore(self.file_name, self.world_size) + pg = self._create_process_group_nccl(store, self.opts()) + device = self.rank_to_GPU[self.rank][0] + size = (10, 10) + nan_tensor = torch.full(size, self.rank, dtype=type, device=device) + # randomly pick an nan element + i = random.randint(0, nan_tensor.size(0) - 1) + j = random.randint(0, nan_tensor.size(1) - 1) + nan_tensor[i, j] = float("nan") + with self.assertRaises(RuntimeError): + pg.allreduce(nan_tensor) + dist.destroy_process_group() + # reset env + os.environ["TORCH_NCCL_NAN_CHECK"] = "0" + @requires_nccl() @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs") def test_destruct_before_terminate_pg(self): @@ -2548,6 +2577,27 @@ def test_all_reduce_coalesced_nccl(self): ), ) + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_all_reduce_coalesced_nccl_float8_errors(self): + store = c10d.FileStore(self.file_name, self.world_size) + c10d.init_process_group( + backend="nccl", store=store, rank=self.rank, world_size=self.world_size + ) + process_group = c10d.distributed_c10d._get_default_group() + device = torch.device("cuda:%d" % self.rank) + tensors = [ + torch.full( + (60 + i,), self.rank + 1 + i, device=device, dtype=torch.float + ).to(torch.float8_e4m3fn) + for i in range(5) + ] + with self.assertRaisesRegex( + RuntimeError, + "Float8 dtypes are not currenlty supported for NCCL reductions", + ): + torch.distributed.all_reduce_coalesced(tensors, group=process_group) + @requires_nccl() @skip_if_lt_x_gpu(2) def test_all_reduce_coalesced_manager_nccl(self): @@ -2911,6 +2961,56 @@ def test_reduce_scatter_tensor_coalesced(self): dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) self.assertEqual(output_tensors, input_tensors[self.rank] * self.world_size) + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_reduce_scatter_base_k_float8_errors(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + output_tensor = ( + torch.zeros(2, dtype=torch.float32).to(torch.float8_e4m3fn).to(self.rank) + ) + input_tensors = ( + torch.arange(self.world_size * 2, dtype=torch.float32) + .to(torch.float8_e4m3fn) + .to(self.rank) + ) + input_tensors = torch.reshape(input_tensors, (self.world_size, 2)) + with self.assertRaisesRegex( + RuntimeError, + "Float8 dtypes are not currenlty supported for NCCL reductions", + ): + dist.reduce_scatter_tensor(output_tensor, input_tensors) + + @requires_nccl() + @skip_if_lt_x_gpu(2) + def test_reduce_scatter_tensor_coalesced_float8_errors(self): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + output_tensors = torch.zeros(2, 2).to(torch.float8_e5m2).to(self.rank) + input_tensors = [ + torch.ones(2, 2).to(torch.float8_e5m2).to(self.rank) + for _ in range(self.world_size) + ] + + with self.assertRaisesRegex( + RuntimeError, + "Float8 dtypes are not currenlty supported for NCCL reductions", + ): + with dist._coalescing_manager(): + for i in range(self.world_size): + 
dist.reduce_scatter_tensor(output_tensors[i], input_tensors[i]) + self.assertEqual(output_tensors, input_tensors[self.rank]) + class SetDeviceMethod(Enum): TORCH_CUDA_SET = auto() # torch.cuda.set_device @@ -2951,6 +3051,28 @@ def test_allgather_base(self): dist.all_gather_into_tensor(output_tensor, tensor) self.assertEqual(output_tensor, tensor) + @requires_nccl() + @skip_if_lt_x_gpu(1) + @parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2]) + def test_allgather_float8(self, float8_dtype): + store = dist.FileStore(self.file_name, self.world_size) + dist.init_process_group( + "nccl", + world_size=self.world_size, + rank=self.rank, + store=store, + ) + device = "cuda" + tensor = torch.ones(10, 16, device=torch.device(device)).to(float8_dtype) + output_tensor = torch.zeros(10, 16, device=torch.device(device)).to( + float8_dtype + ) + dist.all_gather_into_tensor(output_tensor, tensor) + self.assertEqual(output_tensor.view(torch.float32), tensor.view(torch.float32)) + + +instantiate_parametrized_tests(NcclProcessGroupWithDispatchedCollectivesTests) + class LargeCommTest(test_c10d_common.AbstractLargeCommTest, MultiProcessTestCase): def setUp(self): diff --git a/test/distributed/test_control_collectives.py b/test/distributed/test_control_collectives.py new file mode 100644 index 000000000000..fb0067f2dd2e --- /dev/null +++ b/test/distributed/test_control_collectives.py @@ -0,0 +1,189 @@ +# Owner(s): ["oncall: distributed"] + +from datetime import timedelta +from multiprocessing.pool import ThreadPool + +import torch +import torch.distributed as dist +from torch.testing._internal.common_utils import run_tests, TestCase + + +class TestCollectives(TestCase): + def test_barrier(self) -> None: + store = dist.HashStore() + + world_size = 2 + + def f(rank: int) -> None: + collectives = dist._StoreCollectives(store, rank, world_size) + collectives.barrier("foo", timedelta(seconds=10), True) + + with ThreadPool(world_size) as pool: + pool.map(f, range(world_size)) + + def test_broadcast(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(seconds=10) + + def f(rank: int) -> None: + collectives = dist._StoreCollectives(store, rank, world_size) + if rank == 2: + collectives.broadcast_send("foo", b"data", timeout) + else: + out = collectives.broadcast_recv("foo", timeout) + self.assertEqual(out, b"data") + + with ThreadPool(world_size) as pool: + pool.map(f, range(world_size)) + + def test_gather(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(seconds=10) + + def f(rank: int) -> None: + collectives = dist._StoreCollectives(store, rank, world_size) + if rank == 2: + out = collectives.gather_recv("foo", str(rank), timeout) + self.assertEqual(out, [b"0", b"1", b"2", b"3"]) + else: + collectives.gather_send("foo", str(rank), timeout) + + with ThreadPool(world_size) as pool: + pool.map(f, range(world_size)) + + def test_scatter(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(seconds=10) + + def f(rank: int) -> None: + collectives = dist._StoreCollectives(store, rank, world_size) + if rank == 2: + out = collectives.scatter_send( + "foo", [str(i) for i in range(world_size)], timeout + ) + else: + out = collectives.scatter_recv("foo", timeout) + self.assertEqual(out, str(rank).encode()) + + with ThreadPool(world_size) as pool: + pool.map(f, range(world_size)) + + def test_all_sum(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(seconds=10) + + def f(rank: int) -> 
None: + collectives = dist._StoreCollectives(store, rank, world_size) + out = collectives.all_sum("foo", rank, timeout) + self.assertEqual(out, sum(range(world_size))) + + with ThreadPool(world_size) as pool: + pool.map(f, range(world_size)) + + def test_broadcast_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex(Exception, "Wait timeout"): + collectives.broadcast_recv("foo", timeout) + + def test_gather_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex( + Exception, "gather failed -- missing ranks: 0, 2, 3" + ): + collectives.gather_recv("foo", "data", timeout) + + def test_scatter_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex(Exception, "Wait timeout"): + collectives.scatter_recv("foo", timeout) + + def test_all_gather_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex( + Exception, "all_gather failed -- missing ranks: 0, 2, 3" + ): + collectives.all_gather("foo", "data", timeout) + + def test_barrier_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex( + Exception, "barrier failed -- missing ranks: 0, 2, 3" + ): + collectives.barrier("foo", timeout, True) + + def test_all_sum_timeout(self) -> None: + store = dist.HashStore() + + world_size = 4 + timeout = timedelta(milliseconds=1) + collectives = dist._StoreCollectives(store, 1, world_size) + with self.assertRaisesRegex( + Exception, "barrier failed -- missing ranks: 0, 2, 3" + ): + collectives.all_sum("foo", 1, timeout) + + def test_unique(self) -> None: + store = dist.HashStore() + + collectives = dist._StoreCollectives(store, 1, 1) + collectives.broadcast_send("foo", "bar") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.broadcast_send("foo", "bar") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.broadcast_recv("foo") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.gather_send("foo", "bar") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.gather_recv("foo", "asdf") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.scatter_send("foo", ["asdf"]) + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.scatter_recv("foo") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.all_gather("foo", "bar") + + with self.assertRaisesRegex(Exception, "Key foo has already been used"): + collectives.all_sum("foo", 2) + + +if __name__ == "__main__": + assert ( + not torch.cuda._initialized + ), "test_distributed must not have initialized CUDA context on main process" + + run_tests() diff --git a/test/distributed/test_device_mesh.py b/test/distributed/test_device_mesh.py index d04fcf938c42..8f70ee2f0b7d 100644 --- 
a/test/distributed/test_device_mesh.py +++ b/test/distributed/test_device_mesh.py @@ -23,10 +23,10 @@ is_nccl_available, ProcessGroup, ) +from torch.testing._internal.common_distributed import skip_if_lt_x_gpu from torch.testing._internal.common_utils import run_tests from torch.testing._internal.distributed._tensor.common_dtensor import ( DTensorTestBase, - skip_unless_torch_gpu, with_comms, ) from torch.testing._internal.distributed.fake_pg import FakeStore @@ -66,7 +66,7 @@ def test_init_process_group(self): self.destroy_pg() @with_comms - @skip_unless_torch_gpu + @skip_if_lt_x_gpu(4) def test_assert_invalid_mesh_tensor(self): mesh = torch.arange(self.world_size).to(self.rank) with self.assertRaises(ValueError): @@ -168,7 +168,7 @@ def test_fake_pg_device_mesh(self): self.assertEqual(global_tensor.shape, (self.world_size * 2, 8)) @with_comms - def test_from_group(self): + def test_from_group_with_global_pg(self): # Simple test: check `from_group` for a global PG vs. directly # initializing via `init_device_mesh` global_pg = _get_default_group() @@ -180,6 +180,23 @@ def test_from_group(self): ref_global_mesh._coordinate_on_dim, global_mesh._coordinate_on_dim ) + @with_comms + def test_from_group_with_invalid_mesh(self): + global_pg = _get_default_group() + global_pg_size = global_pg.size() + assert global_pg_size == 4, "Test assumes global world size of 4" + invalid_mesh = [[0, 1], [2, 3]] # 2D mesh when we need 1D + regex = r"Invalid mesh \[\[0, 1\], \[2, 3\]\] for ProcessGroup with ranks \[0, 1, 2, 3\]" + with self.assertRaisesRegex(ValueError, regex): + DeviceMesh.from_group(global_pg, "cuda", invalid_mesh) + + device_mesh = init_device_mesh(self.device_type, (2, 2)) + groups = device_mesh.get_group() + invalid_mesh = (0, 1, 2, 3) # 1D mesh when we need 2D + regex = r"Expects mesh with ndim equal to number of ProcessGroups but got mesh \[0, 1, 2, 3\] and 2 ProcessGroups" + with self.assertRaisesRegex(ValueError, regex): + DeviceMesh.from_group(groups, self.device_type, invalid_mesh) + def test_raises_invalid_device_type(self): with self.assertRaisesRegex( RuntimeError, @@ -225,7 +242,7 @@ def test_device_mesh_hash(self): mesh_tensor_2d = torch.arange(8).reshape(4, 2) mesh = DeviceMesh(self.device_type, mesh_tensor_2d) mesh2 = DeviceMesh(self.device_type, mesh_tensor_2d) - self.assertNotEqual(hash(mesh), hash(mesh2)) + self.assertEqual(hash(mesh), hash(mesh2)) mesh_tensor_3d = torch.arange(8).reshape(2, 2, 2) mesh3 = DeviceMesh(self.device_type, mesh_tensor_3d) self.assertNotEqual(hash(mesh), hash(mesh3)) @@ -269,6 +286,85 @@ def test_get_local_rank_3d(self): expected_dp_rank = self.rank // 4 self.assertEqual(dp_rank, expected_dp_rank) + @with_comms + def test_device_mesh_parent_child_hash(self): + mesh_2d = init_device_mesh( + self.device_type, (2, self.world_size // 2), mesh_dim_names=("DP", "TP") + ) + + mesh_group_1 = torch.arange(0, self.world_size // 2) + mesh_group_2 = torch.arange(self.world_size // 2, self.world_size) + ep_mesh_1 = DeviceMesh(self.device_type, mesh_group_1) + ep_mesh_2 = DeviceMesh(self.device_type, mesh_group_2) + ep_mesh = ep_mesh_1 if self.rank < self.world_size // 2 else ep_mesh_2 + # ep_mesh is considered different from mesh_2d["TP"] + # since mesh_2d["TP"] has a parent mesh while ep_mesh does not. 
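+        # The assertions below verify that the flattened mesh contents, shape,
+        # device type, and thread id all match; the differing mesh_dim_names
+        # and parent mesh are what make the hashes (and equality) diverge.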
+ self.assertEqual(mesh_2d["TP"]._flatten_mesh_list, ep_mesh._flatten_mesh_list) + self.assertEqual(mesh_2d["TP"].mesh.shape, ep_mesh.mesh.shape) + self.assertEqual(mesh_2d["TP"].device_type, ep_mesh.device_type) + self.assertNotEqual(mesh_2d["TP"].mesh_dim_names, ep_mesh.mesh_dim_names) + self.assertEqual(mesh_2d["TP"]._thread_id, ep_mesh._thread_id) + self.assertNotEqual(mesh_2d["TP"]._parent_mesh, ep_mesh._parent_mesh) + self.assertNotEqual(hash(mesh_2d["TP"]), hash(ep_mesh)) + self.assertNotEqual(mesh_2d["TP"], ep_mesh) + + another_mesh_1 = DeviceMesh(self.device_type, mesh_group_1) + another_mesh_2 = DeviceMesh(self.device_type, mesh_group_2) + another_mesh = ( + another_mesh_1 if self.rank < self.world_size // 2 else another_mesh_2 + ) + # another_mesh is considered the same as ep_mesh + # since they have the same mesh and no parent mesh. + self.assertEqual(ep_mesh._flatten_mesh_list, another_mesh._flatten_mesh_list) + self.assertEqual(ep_mesh.mesh.shape, another_mesh.mesh.shape) + self.assertEqual(ep_mesh.device_type, another_mesh.device_type) + self.assertEqual(ep_mesh.mesh_dim_names, another_mesh.mesh_dim_names) + self.assertEqual(ep_mesh._thread_id, another_mesh._thread_id) + self.assertEqual(ep_mesh._parent_mesh, another_mesh._parent_mesh) + self.assertEqual(hash(ep_mesh), hash(another_mesh)) + self.assertEqual(ep_mesh, another_mesh) + + @with_comms + def test_from_group_with_mesh_shape(self): + """Tests ``from_group`` when passing ``mesh_shape`` as 2D.""" + # Consider two different logical views of the same mesh: + # - (4, 2) ("dp", "tp") mesh + # - (2, 2, 2) ("dp_replicate", "dp_shard", "tp") mesh + mesh_shape = (2, 2, 2) + mesh_dim_names = ("dp_replicate", "dp_shard", "tp") + ref_mesh = init_device_mesh( + self.device_type, mesh_shape, mesh_dim_names=mesh_dim_names + ) + + dp_shard_group = ref_mesh["dp_shard"].get_group() + dp_replicate_group = ref_mesh["dp_replicate"].get_group() + + dp_mesh = DeviceMesh.from_group( + [dp_replicate_group, dp_shard_group], + self.device_type, + mesh=ref_mesh.mesh[:, :, ref_mesh.get_local_rank(2)], + mesh_dim_names=mesh_dim_names[:2], + ) + + ref_mesh_dp_dim_group_infos = ref_mesh._dim_group_infos[:2] + for (_, ref_ranks, _), (_, ranks, _) in zip( + ref_mesh_dp_dim_group_infos, dp_mesh._dim_group_infos + ): + self.assertEqual(ref_ranks, ranks) + # Cannot check directly for mesh equality since parent meshes are not + # the same since the ref's parent mesh is 3D + self.assertEqual(dp_mesh["dp_replicate"].mesh, ref_mesh["dp_replicate"].mesh) + for (_, ref_ranks, _), (_, ranks, _) in zip( + dp_mesh["dp_replicate"]._dim_group_infos, + ref_mesh["dp_replicate"]._dim_group_infos, + ): + self.assertEqual(ref_ranks, ranks) + self.assertEqual(dp_mesh["dp_shard"].mesh, ref_mesh["dp_shard"].mesh) + for (_, ref_ranks, _), (_, ranks, _) in zip( + dp_mesh["dp_shard"]._dim_group_infos, ref_mesh["dp_shard"]._dim_group_infos + ): + self.assertEqual(ref_ranks, ranks) + class InitDeviceMeshTest(DTensorTestBase): @property @@ -278,20 +374,20 @@ def world_size(self): @with_comms def test_init_device_mesh(self): mesh_shape = (2, 4) - ref_mesh = DeviceMesh(self.device_type, torch.arange(8).view(mesh_shape)) + mesh_dim_names = ("DP", "TP") + ref_mesh = DeviceMesh( + self.device_type, + torch.arange(8).view(mesh_shape), + mesh_dim_names=mesh_dim_names, + ) # test init_device_mesh with mesh_dim_names - mesh_dim_names = ("DP", "TP") mesh_2d = init_device_mesh( self.device_type, mesh_shape, mesh_dim_names=mesh_dim_names ) self.assertEqual(mesh_2d, ref_mesh) 
self.assertEqual(mesh_2d.mesh_dim_names, mesh_dim_names) - # test init_device_mesh without mesh_dim_names - mesh_2d = init_device_mesh(self.device_type, mesh_shape) - self.assertEqual(mesh_2d, ref_mesh) - @with_comms def test_raises_duplicate_mesh_dim_names(self): with self.assertRaisesRegex( diff --git a/test/dynamo/test_comptime.py b/test/dynamo/test_comptime.py index 1170010837a2..6d874a005047 100644 --- a/test/dynamo/test_comptime.py +++ b/test/dynamo/test_comptime.py @@ -1,5 +1,6 @@ # Owner(s): ["module: dynamo"] +import collections import re import sys from io import StringIO @@ -17,6 +18,57 @@ class ComptimeTests(torch._dynamo.test_case.TestCase): + def test_print_single(self): + global FILE + FILE = StringIO() + cnt = torch._dynamo.testing.CompileCounter() + + def comptime_print(e): + @comptime + def _(ctx): + ctx.print(ctx.get_local("e"), file=FILE) + + Employee = collections.namedtuple("Employee", ["name", "id"]) + + class mylist(list): + pass + + @torch._dynamo.optimize(cnt, dynamic=True) + def f(x): + y = x * 2 + comptime_print(y) + comptime_print(2) + comptime_print([y, 2]) + comptime_print((y, 2)) + comptime_print({"foo": y}) + comptime_print(range(1, 3)) + comptime_print(Employee("foo", 2)) + comptime_print(mylist([1, 2])) + comptime_print(collections.defaultdict(lambda: None)) + comptime_print(set()) + comptime_print({"a", "b"}) + comptime_print(x.size(0)) + return y + 3 + + f(torch.randn(2)) + self.assertEqual(cnt.frame_count, 1) + self.assertExpectedInline( + FILE.getvalue().strip(), + """\ +FakeTensor(..., size=(s0,)) +2 +[FakeTensor(..., size=(s0,)), 2] +(FakeTensor(..., size=(s0,)), 2) +{'foo': FakeTensor(..., size=(s0,))} +range(1, 3, 1) +Employee(name='foo', id=2) +[1, 2] +defaultdict(NestedUserFunctionVariable(), {}) +set() +{'a','b'} +s0""", + ) + def test_print_graph(self): global FILE FILE = StringIO() diff --git a/test/dynamo/test_cpp_guard_manager.py b/test/dynamo/test_cpp_guard_manager.py deleted file mode 100644 index 0597e158972d..000000000000 --- a/test/dynamo/test_cpp_guard_manager.py +++ /dev/null @@ -1,59 +0,0 @@ -# Owner(s): ["module: dynamo"] - -from torch._dynamo import config -from torch._dynamo.testing import make_test_cls_with_patches - -try: - from . 
import ( - test_functions, - test_higher_order_ops, - test_misc, - test_optimizers, - test_repros, - ) -except ImportError: - import test_functions - import test_higher_order_ops - import test_misc - import test_optimizers - import test_repros - - -test_classes = {} - - -def make_cpp_guard_manager_cls(cls): - suffix = "_cpp_guard_manager" - - cls_prefix = "CppGuardManager" - - test_class = make_test_cls_with_patches( - cls, - cls_prefix, - suffix, - (config, "enable_cpp_guard_manager", True), - ) - - test_classes[test_class.__name__] = test_class - # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING - globals()[test_class.__name__] = test_class - test_class.__module__ = __name__ - return test_class - - -tests = [ - test_functions.FunctionTests, - test_misc.MiscTests, - test_repros.ReproTests, - test_higher_order_ops.HigherOrderOpTests, - test_higher_order_ops.FuncTorchHigherOrderOpTests, - test_optimizers.End2EndTests, -] -for test in tests: - make_cpp_guard_manager_cls(test) -del test - -if __name__ == "__main__": - from torch._dynamo.test_case import run_tests - - run_tests() diff --git a/test/dynamo/test_deviceguard.py b/test/dynamo/test_deviceguard.py index 4ed54a4c1922..bd3c73a7b578 100644 --- a/test/dynamo/test_deviceguard.py +++ b/test/dynamo/test_deviceguard.py @@ -20,7 +20,7 @@ def setUp(self): self.device_interface = Mock() self.device_interface.exchange_device = Mock(return_value=0) - self.device_interface.maybe_exchange_device = Mock(return_value=0) + self.device_interface.maybe_exchange_device = Mock(return_value=1) def test_device_guard(self): device_guard = DeviceGuard(self.device_interface, 1) @@ -32,7 +32,7 @@ def test_device_guard(self): self.device_interface.maybe_exchange_device.assert_called_once_with(0) self.assertEqual(device_guard.prev_idx, 0) - self.assertEqual(device_guard.idx, 0) + self.assertEqual(device_guard.idx, 1) def test_device_guard_no_index(self): device_guard = DeviceGuard(self.device_interface, None) @@ -70,7 +70,7 @@ def test_device_guard(self): self.assertEqual(torch.cuda.current_device(), current_device) self.assertEqual(device_guard.prev_idx, 0) - self.assertEqual(device_guard.idx, 0) + self.assertEqual(device_guard.idx, 1) def test_device_guard_no_index(self): current_device = torch.cuda.current_device() diff --git a/test/dynamo/test_functions.py b/test/dynamo/test_functions.py index 77d7fea5d862..472e9c56bae6 100644 --- a/test/dynamo/test_functions.py +++ b/test/dynamo/test_functions.py @@ -642,6 +642,13 @@ def test_get_autocast_gpu_dtype(x): dtype = torch.get_autocast_gpu_dtype() return x.type(dtype) + @make_test + def test_is_any_autocast_enabled(x): + if torch._C._is_any_autocast_enabled(): + return x + 1 + else: + return x - 1 + @make_test def test_list_compare_polyfill(x): for a, b, c in [ @@ -1157,6 +1164,32 @@ def test_tuple_contains(a, b): return a + b return a - b + @unittest.skipIf( + sys.version_info < (3, 9), + "SET_UPDATE was added at Python 3.9", + ) + @make_test + def test_set_update_bytecode(x): + # This produces bytecode SET_UPDATE since python 3.9 + var = {"apple", "banana", "cherry"} + if isinstance(var, set): + return x + 1 + else: + return x - 1 + + @unittest.skipIf( + sys.version_info < (3, 9), + "SET_UPDATE was added at Python 3.9", + ) + @make_test + def test_set_update_list_with_duplicated_items(x): + list1 = ["apple", "banana", "apple"] + list2 = ["orange", "banana"] + if len({*list1, *list2}) == 3: + return x + 1 + else: + return x - 1 + @make_test def test_set_contains(a, b): vals = set(["a", "b", "c"]) @@ -1330,6 
+1363,13 @@ def isinstance_namedtuple(obj) -> bool:
         else:
             return a - b
 
+    @make_test
+    def test_torch_size_hasattr(x):
+        if hasattr(x.shape, "_fields"):
+            return x + 1
+        else:
+            return x - 1
+
     @make_test
     def test_is_quantized(a, b):
         if not a.is_quantized:
diff --git a/test/dynamo/test_higher_order_ops.py b/test/dynamo/test_higher_order_ops.py
index 880e761037cd..9b86a90b02f3 100644
--- a/test/dynamo/test_higher_order_ops.py
+++ b/test/dynamo/test_higher_order_ops.py
@@ -1921,7 +1921,8 @@ def f(x):
         self.assertTrue(len(wrap_node.args), 3)
 
         # Check that the linear bias and weight are getattr in the outer graph
-        self.assertTrue(len(dict(backend.graphs[0].named_parameters())) == 2)
+        if not torch._dynamo.config.inline_inbuilt_nn_modules:
+            self.assertTrue(len(dict(backend.graphs[0].named_parameters())) == 2)
 
         # Check that the inner function has one op and its a linear op
         body_function = getattr(backend.graphs[0], wrap_node.args[0].name)
@@ -2052,7 +2053,8 @@ def f(x):
         self.assertTrue(len(wrap_node.args), 3)
 
         # Check that the linear bias and weight are getattr in the outer graph
-        self.assertTrue(len(dict(backend.graphs[0].named_parameters())) == 2)
+        if not torch._dynamo.config.inline_inbuilt_nn_modules:
+            self.assertTrue(len(dict(backend.graphs[0].named_parameters())) == 2)
 
         # Check that the inner function has one op and its a linear op
         body_function = getattr(backend.graphs[0], wrap_node.args[0].name)
diff --git a/test/dynamo/test_misc.py b/test/dynamo/test_misc.py
index a70e5767f3d6..5d7f780457d0 100644
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@@ -10371,6 +10371,7 @@ def fn(x, const):
         c2 = _debug_get_cache_entry_list(fn.__code__)
         self.assertIs(c1[1], c2[0])
 
+    @torch._dynamo.config.patch(inline_inbuilt_nn_modules=False)
     def test_dynamo_cache_invalidate(self):
         class Mod(torch.nn.Module):
             def __init__(self):
diff --git a/test/dynamo/test_modules.py b/test/dynamo/test_modules.py
index 0b12767583bd..ceb1521ffe69 100644
--- a/test/dynamo/test_modules.py
+++ b/test/dynamo/test_modules.py
@@ -1885,9 +1885,52 @@ def forward(self, x):
             "torch._dynamo.config.cache_size_limit",
             cache_size_limit,
         ):
-            x = torch.randn(*size)
+            x = torch.randn(*size, requires_grad=True)
             mod(x)
-            self.assertEqual(cnts.frame_count, num_submodules)
+            if torch._dynamo.config.inline_inbuilt_nn_modules:
+                self.assertEqual(cnts.frame_count, 1)
+            else:
+                self.assertEqual(cnts.frame_count, num_submodules)
+
+    @patch.object(torch._dynamo.config, "inline_inbuilt_nn_modules", True)
+    def test_inline_inbuilt_nn_modules(self):
+        size = (10, 10)
+        cache_size_limit = 1
+        num_submodules = 4
+        cnts = torch._dynamo.testing.CompileCounterWithBackend("eager")
+
+        class SubModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(*size)
+
+            def forward(self, x):
+                a = torch.sin(torch.cos(x))
+                return self.linear(a)
+
+        class MockModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.mods = [SubModule() for _ in range(num_submodules)]
+                self.mods = [torch.compile(mod, backend=cnts) for mod in self.mods]
+
+            def forward(self, x):
+                for mod in self.mods:
+                    x = mod(x)
+                return x
+
+        mod = MockModule()
+        # Each submod is compiled separately and has a different nn module
+        # guard. Ensure that the recompilation logic is handled correctly.
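+        # With inlining enabled, all of the submodules fold into a single
+        # compiled frame, so the error_on_recompile patch below should never
+        # fire and frame_count is expected to stay at 1 (instead of
+        # num_submodules when each submodule is compiled separately).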
+ with unittest.mock.patch( + "torch._dynamo.config.error_on_recompile", True + ), unittest.mock.patch( + "torch._dynamo.config.cache_size_limit", + cache_size_limit, + ): + x = torch.randn(*size, requires_grad=True) + mod(x) + self.assertEqual(cnts.frame_count, 1) def test_cache_size_limit_on_guarded_nn_modules(self): cache_size_limit = 2 @@ -1929,7 +1972,10 @@ def forward(self, x): ]: x = torch.randn(size) mod(x) - self.assertEqual(cnts.frame_count, 2 * num_submodules) + if torch._dynamo.config.inline_inbuilt_nn_modules: + self.assertEqual(cnts.frame_count, 2) + else: + self.assertEqual(cnts.frame_count, 2 * num_submodules) def test_recursion(self): mod = MockModule() @@ -2129,6 +2175,7 @@ def new_forward_hook( @patch.object(torch._dynamo.config, "guard_nn_modules", False) @patch.object(torch._dynamo.config, "skip_nnmodule_hook_guards", True) + @patch.object(torch._dynamo.config, "inline_inbuilt_nn_modules", False) def test_hooks_skip_guards(self): class TestModule(torch.nn.Module): def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -2532,10 +2579,16 @@ def foo(mod, x): mod = Mod() foo(mod, torch.rand([4])) - self.assertEqual(compiles_without_buffers, 0) + if torch._dynamo.config.inline_inbuilt_nn_modules: + self.assertEqual(compiles_without_buffers, 1) + else: + self.assertEqual(compiles_without_buffers, 0) foo(mod, torch.rand([4], dtype=torch.half)) - self.assertEqual(compiles_without_buffers, 1) + if torch._dynamo.config.inline_inbuilt_nn_modules: + self.assertEqual(compiles_without_buffers, 2) + else: + self.assertEqual(compiles_without_buffers, 1) class Mod2(Mod): def __setattr__(self, name, value): @@ -2560,9 +2613,10 @@ class MockModule(torch.nn.Module): def __init__(self): super().__init__() self.linear = torch.nn.Linear(10, 10) + self.multiplier = 10 def forward(self, x): - return self.linear(x) + return self.linear(x) * self.multiplier mod = MockModule() @@ -2578,7 +2632,7 @@ def generate(x, c): self.assertEqual(cnt.frame_count, 2) # Ensure that modification in user module causes recompile - mod.eval() + mod.multiplier = 11 generate(torch.randn(10, 10), 0) self.assertEqual(cnt.frame_count, 3) @@ -2628,6 +2682,44 @@ def fn(x): self.assertEqual(test_functions._variable, 1) self.assertEqual(res, 3 * torch.ones(10)) + def test_monkeypatching_forward(self): + class FakeModule(torch.nn.Module): + def forward(self, x): + return torch.sin(x) + + class MyModule(torch.nn.Module): + def __init__(self, x): + super().__init__() + + def forward(self, x): + return torch.cos(x) + + def helper(): + torch._dynamo.reset() + mod = MyModule(3) + + def fn(x): + return mod(x) + + cnt = torch._dynamo.testing.CompileCounter() + opt_fn = torch._dynamo.optimize(cnt)(fn) + x = torch.randn(10) + + opt_fn(x) + opt_fn(x) + self.assertEqual(cnt.frame_count, 1) + + # Monkeypatch forward + mod.forward = types.MethodType(FakeModule.forward, mod) + ref = fn(x) + res = opt_fn(x) + self.assertEqual(ref, res) + self.assertEqual(cnt.frame_count, 2) + + helper() + with torch._dynamo.config.patch(inline_inbuilt_nn_modules=True): + helper() + if __name__ == "__main__": from torch._dynamo.test_case import run_tests diff --git a/test/dynamo/test_repros.py b/test/dynamo/test_repros.py index 33f8d10a7b71..96bf924e0999 100644 --- a/test/dynamo/test_repros.py +++ b/test/dynamo/test_repros.py @@ -33,6 +33,7 @@ from torch import nn from torch._dynamo.debug_utils import same_two_models from torch._dynamo.testing import CompileCounter, rand_strided, same +from torch._inductor.utils import fresh_inductor_cache from torch.nn 
import functional as F from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION @@ -400,7 +401,7 @@ def _iter_ex(self, resolve: bool) -> Iterator[Any]: try: return ListConfig.ListIterator(self, resolve) except Exception: - raise AssertionError + raise AssertionError from None def __init__(self): self._content = [ @@ -3346,8 +3347,7 @@ def fn(x): return y x = {"a": torch.tensor([1]), "b": torch.tensor([1])} - # FIXME It should be KeyError here - self.assertRaises(torch._dynamo.exc.InternalTorchDynamoError, lambda: fn(x)) + self.assertRaises(KeyError, lambda: fn(x)) def test_attached_attribute_in_dir(self): class MyModule(torch.nn.Module): @@ -4949,6 +4949,45 @@ def f(a, tmp): # grad state may not be properly reset after the error self.assertTrue(torch.is_grad_enabled()) + def test_const_dict_keyerror(self): + d = {} + + def fn(x): + try: + y = d[0] + except KeyError: + y = 1 + return x + y + + opt_fn = torch.compile(fn, backend="eager") + inp = torch.randn(3, 3) + self.assertEqual(fn(inp), opt_fn(inp)) + + def test_nonconst_issubclass(self): + def fn(x): + if issubclass(x.__class__, np.ndarray): + return 1 + return 0 + + opt_fn = torch.compile(fn, backend="eager") + opt_fn(np.ones([3, 3])) + + def test_issue126128(self): + def fn(): + x = torch.randn(1, 10) + y = torch.randn(10, 1) + return torch.mm(x, y).sum() + + def fn2(): + x = torch.randn(10, 100) + y = torch.randn(100, 10) + return torch.mm(x, y).sum() + + with fresh_inductor_cache(): + torch.compile(fn)() + + torch.compile(fn2)() + instantiate_parametrized_tests(ReproTests) diff --git a/test/dynamo/test_unspec.py b/test/dynamo/test_unspec.py index 9b49f5ff8bb6..cb47a0b728a3 100644 --- a/test/dynamo/test_unspec.py +++ b/test/dynamo/test_unspec.py @@ -394,6 +394,36 @@ def f4(v): self.assertEqual(f3(r), optimize(f3)(r)) self.assertEqual(f4(r), optimize(f4)(r)) + def test_to_tensor(self): + def f1(): + a = np.random.uniform(low=-1, high=1, size=(20, 1)) + return torch.tensor([a, a, a, a], dtype=torch.float64, device="cpu") + + def f2(): + a = torch.tensor([[[123]]]) + return torch.tensor([a, a]) + + def f3(): + a = torch.tensor(123) + return torch.tensor([a, a]) + + def f4(): + a = torch.tensor(123) + b = torch.tensor([[[456]]]) + return torch.tensor([a, b]) + + def f5(): + a = np.array([1, 2]) + return torch.tensor([a, a]) + + optimize = torch.compile(backend="aot_eager", fullgraph=True) + + self.assertEqual(f1().shape, optimize(f1)().shape) + self.assertEqual(f2(), optimize(f2)()) + self.assertEqual(f3(), optimize(f3)()) + self.assertEqual(f4(), optimize(f4)()) + self.assertEqual(f5(), optimize(f5)()) + def test_sym_int_conversion(self): def f(x): y = x.size(0) diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_maml_regression_mechanism_functional_call_cpu b/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_maml_regression_mechanism_functional_call_cpu deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_maml_regression_mechanism_make_functional_cpu b/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_maml_regression_mechanism_make_functional_cpu deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_resnet18_per_sample_grads_mechanism_functional_call_cpu b/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_resnet18_per_sample_grads_mechanism_functional_call_cpu deleted file mode 100644 index 
e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_resnet18_per_sample_grads_mechanism_make_functional_cpu b/test/dynamo_expected_failures/TestExamplesCorrectnessCPU.test_resnet18_per_sample_grads_mechanism_make_functional_cpu deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCUDA.test_maml_regression_mechanism_functional_call_cuda b/test/dynamo_expected_failures/TestExamplesCorrectnessCUDA.test_maml_regression_mechanism_functional_call_cuda deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestExamplesCorrectnessCUDA.test_maml_regression_mechanism_make_functional_cuda b/test/dynamo_expected_failures/TestExamplesCorrectnessCUDA.test_maml_regression_mechanism_make_functional_cuda deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestFXGraphMatcherModels.test_mobilenet_v2_qat b/test/dynamo_expected_failures/TestFXGraphMatcherModels.test_mobilenet_v2_qat deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/caffe2/python/docs/__init__.py b/test/dynamo_expected_failures/TestQuantizePT2E.test_multi_users_without_output_observer similarity index 100% rename from caffe2/python/docs/__init__.py rename to test/dynamo_expected_failures/TestQuantizePT2E.test_multi_users_without_output_observer diff --git a/caffe2/python/examples/__init__.py b/test/dynamo_expected_failures/TestSubclassSerialization.test_allowlist_for_weights_only similarity index 100% rename from caffe2/python/examples/__init__.py rename to test/dynamo_expected_failures/TestSubclassSerialization.test_allowlist_for_weights_only diff --git a/test/dynamo_expected_failures/TestTensorBoardEmbedding.test_embedding b/test/dynamo_expected_failures/TestTensorBoardEmbedding.test_embedding deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestTensorBoardEmbedding.test_embedding_64 b/test/dynamo_expected_failures/TestTensorBoardEmbedding.test_embedding_64 deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestTensorBoardSummary.test_image_with_one_channel b/test/dynamo_expected_failures/TestTensorBoardSummary.test_image_with_one_channel deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/dynamo_expected_failures/TestTensorBoardSummary.test_image_without_channel b/test/dynamo_expected_failures/TestTensorBoardSummary.test_image_without_channel deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/test/expect/HasDecompTest.test_has_decomposition.expect b/test/expect/HasDecompTest.test_has_decomposition.expect index bbd8475fd680..669c3d91e849 100644 --- a/test/expect/HasDecompTest.test_has_decomposition.expect +++ b/test/expect/HasDecompTest.test_has_decomposition.expect @@ -18,6 +18,10 @@ aten::_add_relu.Tensor aten::_add_relu.out aten::_add_relu_.Scalar aten::_add_relu_.Tensor +aten::_aminmax +aten::_aminmax.dim +aten::_aminmax.dim_out +aten::_aminmax.out aten::_amp_foreach_non_finite_check_and_unscale aten::_amp_foreach_non_finite_check_and_unscale.out aten::_amp_foreach_non_finite_check_and_unscale_ @@ -335,6 +339,9 @@ aten::_functional_assert_async.msg aten::_functional_assert_scalar aten::_functional_sym_constrain_range aten::_functional_sym_constrain_range_for_size +aten::_fused_adagrad +aten::_fused_adagrad.out +aten::_fused_adagrad_ aten::_fused_adam 
aten::_fused_adam.out
aten::_fused_adam.tensor_lr
diff --git a/test/export/test_experimental.py b/test/export/test_experimental.py
index 2d7e88bfc111..b343dbff27a7 100644
--- a/test/export/test_experimental.py
+++ b/test/export/test_experimental.py
@@ -8,6 +8,7 @@
 from torch._export.wrappers import _mark_strict_experimental
 from torch._functorch.aot_autograd import aot_export_module
+from torch.export._trace import _convert_ts_to_export_experimental
 from torch.testing import FileCheck
 
@@ -106,6 +107,36 @@ def forward(self, x):
         ):
             ep = torch.export.export(M(), inp, strict=False)
 
+    def test_torchscript_module_export(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                return x.cos() + x.sin()
+
+        model_to_trace = M()
+        inps = (torch.randn(4, 4),)
+        traced_module_by_torchscript = torch.jit.trace(M(), example_inputs=inps)
+
+        exported_module = _convert_ts_to_export_experimental(
+            traced_module_by_torchscript, inps
+        )
+
+        self.assertTrue(torch.allclose(exported_module(*inps), model_to_trace(*inps)))
+
+    def test_torchscript_module_export_single_input(self):
+        class M(torch.nn.Module):
+            def forward(self, x):
+                return x.cos() + x.sin()
+
+        model_to_trace = M()
+        inps = torch.randn(4, 4)
+        traced_module_by_torchscript = torch.jit.trace(M(), example_inputs=inps)
+
+        exported_module = _convert_ts_to_export_experimental(
+            traced_module_by_torchscript, inps
+        )
+
+        self.assertTrue(torch.allclose(exported_module(inps), model_to_trace(inps)))
+
 if __name__ == "__main__":
     run_tests()
diff --git a/test/export/test_export.py b/test/export/test_export.py
index fe5b7dc2ca65..406e1f55dd80 100644
--- a/test/export/test_export.py
+++ b/test/export/test_export.py
@@ -631,7 +631,13 @@ def forward(self, x, weight, bias):
         self.assertEqual(actual_result, expected_result)
 
     # TODO(yidi)
-    @unittest.expectedFailure
+    # Expected failure for test cases that call run_decompositions().
+    # The top-level cond node has pre-existing metadata, which overrides the
+    # metadata of the operators in the subgraph because interpreter.run()
+    # treats cond as a single node, and we preserve metadata by copying the
+    # current node's metadata onto all nodes created during interpreting.
+ @testing.expectedFailurePreDispatchRunDecomp + @testing.expectedFailureRetraceability def test_export_cond_preserve_torch_fn_for_subgraphs(self): class MySubModule(torch.nn.Module): def foo(self, x): @@ -1516,7 +1522,6 @@ def forward(self, arg1, arg2, *args, kw1, kw2, **kwargs): self._test_export_same_as_eager(kw_func, args, kwargs) @testing.expectedFailureSerDer # we don't save placeholder metadata - @testing.expectedFailureSerDerPreDispatch @testing.expectedFailureNonStrict def test_linear_conv(self): class MyLinear(torch.nn.Module): @@ -2091,6 +2096,32 @@ def forward(self, x): ): export(Module(), (torch.tensor(1, device="cpu"),)) + def test_float_conversion(self): + class Module(torch.nn.Module): + def forward(self, x): + return x.float() + + ep = export(Module(), (torch.tensor(1, dtype=torch.float),)) + ops = [] + for node in ep.graph.nodes: + if node.op == "call_function": + ops.append(node.target) + self.assertGreater(len(ops), 0) + for op in ops: + self.assertIn(op, (torch.ops.aten._to_copy.default,)) + + def test_device_to_mutation_float(self): + class Module(torch.nn.Module): + def forward(self, x): + y = x.float() + y.add_(1) + return y, x + + with self.assertRaisesRegex( + RuntimeError, "cannot mutate tensors with frozen storage" + ): + export(Module(), (torch.tensor(1, dtype=torch.float),)) + def test_module(self): class MyLinear(torch.nn.Module): def __init__(self): @@ -2870,7 +2901,6 @@ def forward(self, xs, y): ) @testing.expectedFailureSerDer # We don't preserve metadata on graph module - @testing.expectedFailureSerDerPreDispatch @testing.expectedFailureNonStrict def test_retrace_graph_level_meta_preservation(self): class Foo(torch.nn.Module): @@ -3660,7 +3690,6 @@ def forward(self, q, k, v): self.assertEqual(ep.module()(*inputs), m(*inputs)) @testing.expectedFailureSerDer # symfloat nyi - @testing.expectedFailureSerDerPreDispatch # symfloat nyi def test_sym_sqrt(self): import math @@ -4890,6 +4919,31 @@ def forward(self, x): unflattened = unflatten(ep) self.assertTrue(torch.allclose(m1(*inps), unflattened(*inps))) + @testing.expectedFailureRetraceability + def test_unused_aliases(self): + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # param + self.alpha = torch.nn.Parameter(torch.randn(4)) + self.beta = self.alpha + self.gamma = self.alpha + + def forward(self, x): + return x + self.gamma + + inps = (torch.randn(4),) + ep = export(Foo(), inps) + # placeholder nodes will be deduplicated in strict-mode, + # but check that all params still appear in state dict + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in ep.state_dict) + + # check that they also appear in unflattened state dict + unep = unflatten(ep) + for param in ["alpha", "beta", "gamma"]: + self.assertTrue(param in unep.state_dict()) + @unittest.skipIf(not torchdynamo.is_dynamo_supported(), "dynamo isn't support") class TestOneOffModelExportResult(TestCase): diff --git a/test/export/test_export_predispatch.py b/test/export/test_export_predispatch.py deleted file mode 100644 index 2075cba58ca6..000000000000 --- a/test/export/test_export_predispatch.py +++ /dev/null @@ -1,50 +0,0 @@ -# Owner(s): ["oncall: export"] - -try: - from . 
import test_export, testing -except ImportError: - import test_export - import testing -from torch.export._trace import _export - -test_classes = {} - - -def mocked_predispatch_export(*args, **kwargs): - # If user already specified strict, don't make it non-strict - ep = _export(*args, **kwargs, pre_dispatch=True) - return ep.run_decompositions() - - -def make_dynamic_cls(cls): - suffix = "_pre_dispatch" - - cls_prefix = "PreDispatchExport" - - test_class = testing.make_test_cls_with_mocked_export( - cls, - cls_prefix, - suffix, - mocked_predispatch_export, - xfail_prop="_expected_failure_pre_dispatch", - ) - - test_classes[test_class.__name__] = test_class - # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING - globals()[test_class.__name__] = test_class - test_class.__module__ = __name__ - return test_class - - -tests = [ - test_export.TestDynamismExpression, - test_export.TestExport, -] -for test in tests: - make_dynamic_cls(test) -del test - -if __name__ == "__main__": - from torch._dynamo.test_case import run_tests - - run_tests() diff --git a/test/export/test_serdes.py b/test/export/test_serdes.py index 253c6db81819..bd11cd7f8366 100644 --- a/test/export/test_serdes.py +++ b/test/export/test_serdes.py @@ -9,7 +9,6 @@ import testing from torch.export import export, load, save -from torch.export._trace import _export test_classes = {} @@ -23,21 +22,10 @@ def mocked_serder_export(*args, **kwargs): return loaded_ep -def mocked_serder_export_pre_dispatch(*args, **kwargs): - ep = _export(*args, **kwargs, pre_dispatch=True) - buffer = io.BytesIO() - save(ep, buffer) - buffer.seek(0) - loaded_ep = load(buffer) - return loaded_ep - - def make_dynamic_cls(cls): suffix = "_serdes" - suffix_pre_dispatch = "_serdes_pre_dispatch" cls_prefix = "SerDesExport" - cls_prefix_pre_dispatch = "SerDesExportPreDispatch" test_class = testing.make_test_cls_with_mocked_export( cls, @@ -47,21 +35,10 @@ def make_dynamic_cls(cls): xfail_prop="_expected_failure_serdes", ) - test_class_pre_dispatch = testing.make_test_cls_with_mocked_export( - cls, - cls_prefix_pre_dispatch, - suffix_pre_dispatch, - mocked_serder_export_pre_dispatch, - xfail_prop="_expected_failure_serdes_pre_dispatch", - ) - test_classes[test_class.__name__] = test_class - test_classes[test_class_pre_dispatch.__name__] = test_class_pre_dispatch # REMOVING THIS LINE WILL STOP TESTS FROM RUNNING globals()[test_class.__name__] = test_class - globals()[test_class_pre_dispatch.__name__] = test_class_pre_dispatch test_class.__module__ = __name__ - test_class_pre_dispatch.__module__ = __name__ tests = [ diff --git a/test/export/test_tools.py b/test/export/test_tools.py new file mode 100644 index 000000000000..b8ab7616fd67 --- /dev/null +++ b/test/export/test_tools.py @@ -0,0 +1,67 @@ +# Owner(s): ["oncall: export"] + +import torch +from torch._dynamo.test_case import TestCase +from torch._export.tools import report_exportability + +from torch.testing._internal.common_utils import run_tests + +torch.library.define( + "testlib::op_missing_meta", + "(Tensor(a!) x, Tensor(b!) 
z) -> Tensor", + tags=torch.Tag.pt2_compliant_tag, +) + + +@torch.library.impl("testlib::op_missing_meta", "cpu") +@torch._dynamo.disable +def op_missing_meta(x, z): + x.add_(5) + z.add_(5) + return x + z + + +class TestExportTools(TestCase): + def test_report_exportability_basic(self): + class Module(torch.nn.Module): + def forward(self, x, y): + return x[0] + y + + f = Module() + inp = ([torch.ones(1, 3)], torch.ones(1, 3)) + + report = report_exportability(f, inp) + self.assertTrue(len(report) == 1) + self.assertTrue(report[""] is None) + + def test_report_exportability_with_issues(self): + class Unsupported(torch.nn.Module): + def forward(self, x): + return torch.ops.testlib.op_missing_meta(x, x.cos()) + + class Supported(torch.nn.Module): + def forward(self, x): + return x.sin() + + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self.unsupported = Unsupported() + self.supported = Supported() + + def forward(self, x): + y = torch.nonzero(x) + return self.unsupported(y) + self.supported(y) + + f = Module() + inp = (torch.ones(4, 4),) + + report = report_exportability(f, inp, strict=False, pre_dispatch=True) + + self.assertTrue(report[""] is not None) + self.assertTrue(report["unsupported"] is not None) + self.assertTrue(report["supported"] is None) + + +if __name__ == "__main__": + run_tests() diff --git a/test/export/test_unflatten.py b/test/export/test_unflatten.py index b8ff48334f01..19c55982d590 100644 --- a/test/export/test_unflatten.py +++ b/test/export/test_unflatten.py @@ -708,6 +708,44 @@ def forward(self, input_): umod = unflatten(ep_non_strict) self.assertTrue(torch.allclose(umod(input_), mod(input_))) + def test_simple_alias(self): + # handle weight sharing, check tensor ids after unflattening + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + # alias param + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = self.bias + + def forward(self, x): + return self.m(x) + self.bias + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + self.assertTrue(id(unep.m.bias) == id(unep.bias)) + + # handle aliasing where one alias is unused + class Foo(torch.nn.Module): + def __init__(self): + super().__init__() + self.bias = torch.nn.Parameter(torch.randn(4)) + self.m = torch.nn.Linear(4, 4) + self.m.bias = ( + self.bias + ) # self.bias is unused, aliasing should be handled + + def forward(self, x): + return self.m(x) + + m = Foo() + inps = (torch.randn(4, 4),) + ep = export(m, inps) + unep = unflatten(ep) + self.assertTrue(torch.allclose(unep(*inps), m(*inps))) + if __name__ == "__main__": run_tests() diff --git a/test/forward_backward_compatibility/check_forward_backward_compatibility.py b/test/forward_backward_compatibility/check_forward_backward_compatibility.py index c458110c859c..81b85a4fe42f 100644 --- a/test/forward_backward_compatibility/check_forward_backward_compatibility.py +++ b/test/forward_backward_compatibility/check_forward_backward_compatibility.py @@ -9,9 +9,6 @@ from torch._C import parse_schema -# Run by backwards_compat CI job - - # How to run this test locally: # 1 Have two virtual environments (eg conda env), one without PyTorch installed (venv_nightly) # one with your local changes (venv_yours). 
@@ -143,10 +140,8 @@ ("onednn::qconv2d_pointwise", datetime.date(2024, 12, 31)), ("onednn::qconv3d_pointwise", datetime.date(2024, 12, 31)), ("onednn::qconv2d_pointwise.binary", datetime.date(2024, 12, 31)), - ("aten::_aminmax", datetime.date(2024, 12, 31)), - ("aten::_aminmax.dim", datetime.date(2024, 12, 31)), - ("aten::_aminmax.out", datetime.date(2024, 12, 31)), - ("aten::_aminmax.dim_out", datetime.date(2024, 12, 31)), + # BC-breaking change in can_cast signature: 'from' -> 'from_' + ("aten::can_cast", datetime.date(2024, 5, 31)), ] ALLOW_LIST_COMPILED = [ diff --git a/test/functorch/test_aotdispatch.py b/test/functorch/test_aotdispatch.py index ffa71a7e905b..cfbd96e7368d 100644 --- a/test/functorch/test_aotdispatch.py +++ b/test/functorch/test_aotdispatch.py @@ -4835,70 +4835,6 @@ def f(a, b, c, d): self.assertEqual(get_num_ins_outs(fw_graph), (4, 2)) self.assertEqual(get_num_ins_outs(bw_graph), (2, 4)) - @unittest.skipIf(not USE_NETWORKX, "networkx not available") - def test_min_cut_partitioner_recomputable_ops(self): - def f(x): - return x * x * x - - recomputable_ops = [] - partition_fn = partial( - min_cut_rematerialization_partition, recomputable_ops=recomputable_ops - ) - - fw_graph, bw_graph = get_fw_bw_graph( - f, [torch.randn(3, requires_grad=True)], partition_fn - ) - # Expected forward graph: - # opcode name target args kwargs - # ------------- --------- --------------- -------------------------- -------- - # placeholder primals_1 primals_1 () {} - # call_function mul aten.mul.Tensor (primals_1, primals_1) {} - # call_function mul_1 aten.mul.Tensor (mul, primals_1) {} - # output output output ([mul_1, primals_1, mul],) {} - self.assertEqual(get_num_ins_outs(fw_graph), (1, 3)) - # Expected backward graph: - # opcode name target args kwargs - # ------------- ---------- --------------- ----------------------- -------- - # placeholder primals_1 primals_1 () {} - # placeholder mul mul () {} - # placeholder tangents_1 tangents_1 () {} - # call_function mul_2 aten.mul.Tensor (tangents_1, mul) {} - # call_function mul_3 aten.mul.Tensor (tangents_1, primals_1) {} - # call_function mul_4 aten.mul.Tensor (mul_3, primals_1) {} - # call_function add aten.add.Tensor (mul_2, mul_4) {} - # call_function add_1 aten.add.Tensor (add, mul_4) {} - # output output output ([add_1],) {} - self.assertEqual(get_num_ins_outs(bw_graph), (3, 1)) - - recomputable_ops = [torch.ops.aten.mul] - partition_fn = partial( - min_cut_rematerialization_partition, recomputable_ops=recomputable_ops - ) - fw_graph, bw_graph = get_fw_bw_graph( - f, [torch.randn(3, requires_grad=True)], partition_fn - ) - # Expected forward graph: - # opcode name target args kwargs - # ------------- --------- --------------- ---------------------- -------- - # placeholder primals_1 primals_1 () {} - # call_function mul aten.mul.Tensor (primals_1, primals_1) {} - # call_function mul_1 aten.mul.Tensor (mul, primals_1) {} - # output output output ([mul_1, primals_1],) {} - self.assertEqual(get_num_ins_outs(fw_graph), (1, 2)) - # Expected backward graph: - # opcode name target args kwargs - # ------------- ---------- --------------- ----------------------- -------- - # placeholder primals_1 primals_1 () {} - # placeholder tangents_1 tangents_1 () {} - # call_function mul aten.mul.Tensor (primals_1, primals_1) {} # RECOMPUTED - # call_function mul_2 aten.mul.Tensor (tangents_1, mul) {} - # call_function mul_3 aten.mul.Tensor (tangents_1, primals_1) {} - # call_function mul_4 aten.mul.Tensor (mul_3, primals_1) {} - # call_function add 
aten.add.Tensor (mul_2, mul_4) {} - # call_function add_1 aten.add.Tensor (add, mul_4) {} - # output output output ([add_1],) {} - self.assertEqual(get_num_ins_outs(bw_graph), (2, 1)) - def test_contiguous(self): # The test simulates the condition where transpose followed by view # happens in the backward pass. diff --git a/test/functorch/test_control_flow.py b/test/functorch/test_control_flow.py index a1aeb8c1de7d..92a988d83db3 100644 --- a/test/functorch/test_control_flow.py +++ b/test/functorch/test_control_flow.py @@ -890,6 +890,42 @@ def f(x): )(inp) self.assertEqual(gm_functional(torch.zeros(1, 2)), f(torch.zeros(1, 2))) + def test_cond_subgraph_same_shape_env_as_parent(self): + def true_fn(x): + return x.sin() + 10 + + def false_fn(x): + return x.cos() - 20 + + def f(x, pred): + y = cond(pred, true_fn, false_fn, [x]) + z = torch.add(y, y) + return z + + symbolic_traced_graph = self._check_tracing(f, (torch.ones(4), True))[ + "symbolic" + ] + graph_shape_env = symbolic_traced_graph.shape_env + + def _node_shape_env_iter(gm): + for node in symbolic_traced_graph.graph.nodes: + if node.op == "call_function": + val = node.meta.get("val") + if isinstance(val, tuple): + for v in val: + yield v.fake_mode.shape_env + else: + yield val.fake_mode.shape_env + + for shape_env in _node_shape_env_iter(symbolic_traced_graph): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.true_graph_0): + self.assertTrue(shape_env is graph_shape_env) + + for shape_env in _node_shape_env_iter(symbolic_traced_graph.false_graph_0): + self.assertTrue(shape_env is graph_shape_env) + def test_cond_functionalized_nested(self): def true_true_fn(x): y = x.cos() diff --git a/test/functorch/test_eager_transforms.py b/test/functorch/test_eager_transforms.py index b54b9762fbc3..fd2f16701510 100644 --- a/test/functorch/test_eager_transforms.py +++ b/test/functorch/test_eager_transforms.py @@ -67,9 +67,7 @@ from torch.testing._internal.common_utils import ( freeze_rng_state, instantiate_parametrized_tests, - IS_ARM64, IS_FBCODE, - IS_MACOS, IS_WINDOWS, markDynamoStrictTest, parametrize, @@ -5082,9 +5080,7 @@ class TestCompileTransforms(TestCase): @skipIfRocm(msg="test leaks memory on ROCm") # torch.compile is not supported on Windows # Triton only supports GPU with SM70 or later. 
-    @expectedFailureIf(
-        (IS_ARM64 and not IS_MACOS) or IS_WINDOWS or (TEST_CUDA and not SM70OrLater)
-    )
+    @expectedFailureIf(IS_WINDOWS or (TEST_CUDA and not SM70OrLater))
     def test_compile_vmap_hessian(self, device):
         # The model and inputs are a smaller version
         # of code at benchmark repo:
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
index fe7b48fccbae..49afa50b78ac 100644
--- a/test/inductor/test_aot_inductor.py
+++ b/test/inductor/test_aot_inductor.py
@@ -9,6 +9,7 @@
 from unittest import skip
 
 import torch
+import torch._export
 import torch._inductor
 import torch.nn as nn
 from torch._dynamo.testing import rand_strided, same
@@ -1210,6 +1211,24 @@ def forward(self, x):
             torch._export.aot_compile(Model(self.device), example_inputs)
         self.check_model(Model(self.device), example_inputs)
 
+    def test_non_tensor_input(self):
+        def fn(a, b, alpha=1.0):
+            return torch.add(a, b, alpha=alpha)
+
+        a = torch.randn(10, device=self.device)
+        b = torch.randn(10, device=self.device)
+        with self.assertRaises(RuntimeError):
+            torch._export.aot_compile(fn, args=(a, b), kwargs={"alpha": 2.0})
+
+        so_path = torch._export.aot_compile(
+            torch.ops.aten.add, args=(a, b), kwargs={"alpha": 2.0}, same_signature=False
+        )
+        kernel_runner = AOTIRunnerUtil.load_runner(self.device, so_path)
+        res = kernel_runner.run([a, b])
+        self.assertTrue(isinstance(res, list))
+        self.assertTrue(len(res) == 1)
+        self.assertEqual(fn(a, b, alpha=2.0), res[0])
+
     def test_buffer_mutation_2(self):
         class Model(torch.nn.Module):
             def __init__(self, device):
diff --git a/test/inductor/test_benchmark_fusion.py b/test/inductor/test_benchmark_fusion.py
index c2f435e9bf94..3970148b2747 100644
--- a/test/inductor/test_benchmark_fusion.py
+++ b/test/inductor/test_benchmark_fusion.py
@@ -203,7 +203,7 @@ def setUpClass(cls):
                 {
                     "benchmark_kernel": True,
                     "benchmark_fusion": True,
-                    "benchmark_multi_templates": True,
+                    "benchmark_epilogue_fusion": True,
                 }
             )
         )
@@ -231,7 +231,7 @@ def foo(m, inp):
         torch._dynamo.reset()
         with unittest.mock.patch.object(
-            torch._inductor.config, "benchmark_multi_templates", False
+            torch._inductor.config, "benchmark_epilogue_fusion", False
         ):
             foo_c = torch.compile(mode="max-autotune-no-cudagraphs")(foo)
             with torch.no_grad():
diff --git a/test/inductor/test_compiled_autograd.py b/test/inductor/test_compiled_autograd.py
index 074d075fc848..201dd4a3c77d 100644
--- a/test/inductor/test_compiled_autograd.py
+++ b/test/inductor/test_compiled_autograd.py
@@ -1,5 +1,6 @@
 # Owner(s): ["module: inductor"]
 import functools
+import logging
 import re
 import sys
 import unittest
@@ -51,6 +52,14 @@ def hook3(gI, gO):
 
 class TestCompiledAutograd(TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        compiled_autograd.reset()
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        compiled_autograd.reset()
+
     def check_output_and_recompiles(
         self, fn, count=1, compiler_fn=compiler_fn, compile_fn=False
     ):
@@ -322,6 +331,7 @@ def bytecode_hook(code, out_code):
             handle.remove()
 
     def test_inputs_aliasing_bytecode_stack_restore(self):
+        logging.getLogger().setLevel(logging.WARNING)
         from torch.testing._internal.logging_tensor import LoggingTensor
 
         # Create a graph that allows inputs stealing
@@ -752,6 +762,52 @@ def backward(ctx, gO_1, gO_2, gO_3):
 
         self.check_output_and_recompiles(fn, count=2)
 
+    @unittest.skipIf(not HAS_CUDA, "requires cuda")
+    def test_logging_tensor_flaky(self) -> None:
+        # When a test that uses triton runs first and then
+        # test_inputs_aliasing_bytecode_stack_restore runs, it fails with:
+        # - 
pytest: `TypeError: unsupported operand type(s) for +: 'Tensor' and 'LoggingTensor'` + # - python: `TypeError: not all arguments converted during string formatting` + + # 1. some triton involving test + def fn(): + def _fn(x): + return x + + x = torch.arange( + 1, 10, requires_grad=True, dtype=torch.float16, device="cuda" + ) + out = _fn(x) + loss = out.sum() + loss.backward() + + with compiled_autograd.enable(compiler_fn): + fn() + + logging.getLogger().setLevel( + logging.WARNING + ) # triton setup overwrote it to INFO + # 2. test_inputs_aliasing_bytecode_stack_restore + from torch.testing._internal.logging_tensor import LoggingTensor + + def forward(inputs): + add = inputs[0] + 1 + add_1 = add + inputs[1] + out = add_1.cpu() + return (out,) + + gm = torch.fx.symbolic_trace(forward) + print(gm.print_readable()) + torch._dynamo.utils.set_locals_to_steal(gm, ["inputs"]) + compiled_fn = torch.compile(gm) + + inputs = [ + torch.ones(1000000, dtype=torch.float32), + LoggingTensor(torch.ones(1)), + ] + + compiled_fn(inputs) + @unittest.skipIf(not HAS_CUDA, "requires cuda") def test_custom_fn_output_metadata(self): def my_compiler_fn(gm): diff --git a/test/inductor/test_coordinate_descent_tuner.py b/test/inductor/test_coordinate_descent_tuner.py index 70618c06e9ec..fdd3abb14392 100644 --- a/test/inductor/test_coordinate_descent_tuner.py +++ b/test/inductor/test_coordinate_descent_tuner.py @@ -16,7 +16,7 @@ except ImportError: if __name__ == "__main__": sys.exit(0) - raise unittest.SkipTest("requires triton") # noqa: TRY200 + raise unittest.SkipTest("requires triton") # noqa: B904 from torch._inductor import config from torch._inductor.runtime.coordinate_descent_tuner import CoordescTuner diff --git a/test/inductor/test_cpu_cpp_wrapper.py b/test/inductor/test_cpu_cpp_wrapper.py index b8fdbc49bd38..0888f3ad47a1 100644 --- a/test/inductor/test_cpu_cpp_wrapper.py +++ b/test/inductor/test_cpu_cpp_wrapper.py @@ -71,7 +71,6 @@ class DynamicShapesCppWrapperCpuTests(InductorTestCase): if config.abi_compatible: xfail_list = [ - "test_bernoulli1_cpu", # cpp fallback op naming issue "test_conv2d_binary_inplace_fusion_failed_cpu", "test_conv2d_binary_inplace_fusion_pass_cpu", "test_dynamic_qlinear_cpu", @@ -297,6 +296,24 @@ class BaseTest(NamedTuple): test_mkldnn_pattern_matcher.TestPatternMatcher(), condition=torch.backends.mkldnn.is_available(), ), + BaseTest( + "test_qlinear_gelu", + "cpu", + test_mkldnn_pattern_matcher.TestPatternMatcher(), + condition=torch.backends.mkldnn.is_available(), + ), + BaseTest( + "test_qlinear_add", + "cpu", + test_mkldnn_pattern_matcher.TestPatternMatcher(), + condition=torch.backends.mkldnn.is_available(), + ), + BaseTest( + "test_qlinear_add_relu", + "cpu", + test_mkldnn_pattern_matcher.TestPatternMatcher(), + condition=torch.backends.mkldnn.is_available(), + ), BaseTest( "test_qlinear_dequant_promotion", "cpu", diff --git a/test/inductor/test_cpu_select_algorithm.py b/test/inductor/test_cpu_select_algorithm.py deleted file mode 100644 index 5377b1f8f7e5..000000000000 --- a/test/inductor/test_cpu_select_algorithm.py +++ /dev/null @@ -1,131 +0,0 @@ -# Owner(s): ["oncall: cpu inductor"] -import functools -import unittest -from unittest.mock import patch - -import torch -import torch._dynamo.config -import torch._dynamo.config as dynamo_config -import torch._inductor.config as inductor_config -import torch._inductor.select_algorithm as select_algorithm -from torch._dynamo.utils import counters -from torch._inductor.test_case import run_tests, TestCase -from 
torch.testing._internal.common_device_type import ( - dtypes, - instantiate_device_type_tests, -) - -from torch.testing._internal.common_utils import IS_MACOS, parametrize, TEST_MKL - -aten = torch.ops.aten - - -def patches(fn): - def skip_cache(self, choices, name, key, benchmark): - if benchmark is None: - return {} - return benchmark(choices) - - for patcher in [ - dynamo_config.patch(verbose=True), - inductor_config.patch( - debug=True, - max_autotune=True, - epilogue_fusion=True, - max_autotune_gemm_backends="CPP,ATEN", - ), - patch.object(select_algorithm, "VERIFY", dict(atol=1e-4, rtol=1e-4)), - patch.object(select_algorithm.AlgorithmSelectorCache, "lookup", skip_cache), - ]: - fn = patcher(fn) - - @functools.wraps(fn) - def wrapped(*args, **kwargs): - counters.clear() - torch.manual_seed(12345) - return fn(*args, **kwargs) - - return wrapped - - -class TestSelectAlgorithm(TestCase): - @inductor_config.patch({"freezing": True}) - @patches - @torch.no_grad - @unittest.skipIf(not TEST_MKL, "Test requires MKL") - @parametrize("batch_size", (1, 2, 1000)) - @parametrize("in_features", (1, 2, 1000)) - @parametrize("out_features", (1, 32, 1024)) - @parametrize("bias", (True, False)) - @parametrize("input_3d", (True, False)) - @dtypes(torch.float) - def test_linear_static_shapes( - self, batch_size, in_features, out_features, bias, input_3d, dtype - ): - class M(torch.nn.Module): - def __init__(self, bias): - super().__init__() - self.linear = torch.nn.Linear(in_features, out_features, bias) - - @torch.compile - def forward(self, x): - return self.linear(x) - - counters.clear() - mod = M(bias=bias).to(dtype=dtype).eval() - B = (2, batch_size) if input_3d else (batch_size,) - v = torch.randn(*B, in_features).to(dtype=dtype) - mod(v) - self.assertEqual( - counters["inductor"]["select_algorithm_autotune"], - 1 if out_features != 1 else 0, - ) - - @inductor_config.patch({"freezing": True}) - @patches - @torch.no_grad - @unittest.skipIf(not TEST_MKL, "Test requires MKL") - @parametrize("bias", (True, False)) - @dtypes(torch.float) - def test_linear_input_transpose(self, bias, dtype): - batch_size = 384 - in_features = 196 - out_features = 384 - - class M(torch.nn.Module): - def __init__(self, bias): - super().__init__() - self.linear = torch.nn.Linear(in_features, out_features, bias) - - @torch.compile - def forward(self, x): - return self.linear(x) - - counters.clear() - mod = M(bias=bias).to(dtype=dtype).eval() - v = torch.randn(in_features, batch_size).to(dtype=dtype) - mod(v.transpose(0, 1)) - # TODO(jgong5): support transposed input - self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 0) - - -@dynamo_config.patch({"dynamic_shapes": True, "assume_static_by_default": False}) -class _DynamicShapesTestBase(TestCase): - pass - - -class TestSelectAlgorithmDynamicShapes(_DynamicShapesTestBase): - test_linear_dynamic_shapes = TestSelectAlgorithm.test_linear_static_shapes - - -instantiate_device_type_tests(TestSelectAlgorithm, globals(), only_for="cpu") -instantiate_device_type_tests( - TestSelectAlgorithmDynamicShapes, globals(), only_for="cpu" -) - - -if __name__ == "__main__": - from torch.testing._internal.inductor_utils import HAS_CPU - - if HAS_CPU and not IS_MACOS: - run_tests() diff --git a/test/inductor/test_cuda_cpp_wrapper.py b/test/inductor/test_cuda_cpp_wrapper.py index 42df6813c63e..5cb8af9db165 100644 --- a/test/inductor/test_cuda_cpp_wrapper.py +++ b/test/inductor/test_cuda_cpp_wrapper.py @@ -97,9 +97,7 @@ class DynamicShapesCudaWrapperCudaTests(InductorTestCase): if 
config.abi_compatible: xfail_list = [ - "test_bernoulli1_cuda", # cpp fallback op naming issue "test_profiler_mark_wrapper_call_cuda", - "test_randint_cuda", "test_scaled_dot_product_attention_cuda_dynamic_shapes", ] for test_name in xfail_list: diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py index db02d1931009..f303330bc114 100644 --- a/test/inductor/test_cuda_repro.py +++ b/test/inductor/test_cuda_repro.py @@ -32,7 +32,7 @@ import triton from triton import language as tl except ImportError: - raise unittest.SkipTest("requires triton") # noqa: TRY200 + raise unittest.SkipTest("requires triton") # noqa: B904 try: from . import test_torchinductor diff --git a/test/inductor/test_debug_trace.py b/test/inductor/test_debug_trace.py index 41b885fde6a3..cca698761055 100644 --- a/test/inductor/test_debug_trace.py +++ b/test/inductor/test_debug_trace.py @@ -54,9 +54,9 @@ def fn(a, b): open(filename / "ir_pre_fusion.txt").read().rstrip(), """\ buf0: SchedulerNode(ComputedBuffer) -buf0.writes = [MemoryDep('buf0', c0, {c0: 256})] +buf0.writes = [MemoryDep('buf0', c0, {c0: 256}, None)] buf0.unmet_dependencies = [] -buf0.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256})] +buf0.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256}, None)] buf0.users = [NodeUser(node=SchedulerNode(name='buf1'), can_inplace=True, is_weak=False)] buf0.group.device = cpu buf0.group.iteration = ((256,), ()) @@ -77,8 +77,8 @@ def body(self, ops): buf1: SchedulerNode(ComputedBuffer) -buf1.writes = [MemoryDep('buf1', c0, {c0: 256})] -buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 256})] +buf1.writes = [MemoryDep('buf1', c0, {c0: 256}, None)] +buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 256}, None)] buf1.met_dependencies = [] buf1.users = [NodeUser(node=ExternKernelSchedulerNode(name='buf2'), can_inplace=False, is_weak=False)] buf1.group.device = cpu @@ -100,9 +100,9 @@ def body(self, ops): buf2: ExternKernelSchedulerNode(ExternKernelOut) -buf2.writes = [StarDep(name='buf2')] -buf2.unmet_dependencies = [StarDep(name='buf1')] -buf2.met_dependencies = [StarDep(name='arg1_1')] +buf2.writes = [StarDep(name='buf2', mode=None)] +buf2.unmet_dependencies = [StarDep(name='buf1', mode=None)] +buf2.met_dependencies = [StarDep(name='arg1_1', mode=None)] buf2.users = [NodeUser(node=OUTPUT, can_inplace=False, is_weak=False)] buf2.node.kernel = extern_kernels.mm""", ) @@ -110,15 +110,15 @@ def body(self, ops): open(filename / "ir_post_fusion.txt").read().rstrip(), """\ buf0_buf1: FusedSchedulerNode(SchedulerNode,SchedulerNode) -buf0_buf1.writes = [MemoryDep('buf0', c0, {c0: 256}), MemoryDep('buf1', c0, {c0: 256})] +buf0_buf1.writes = [MemoryDep('buf0', c0, {c0: 256}, None), MemoryDep('buf1', c0, {c0: 256}, None)] buf0_buf1.unmet_dependencies = [] -buf0_buf1.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256})] +buf0_buf1.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256}, None)] buf0_buf1.users = [] buf0_buf1.snodes[0] = buf0: SchedulerNode(ComputedBuffer) - buf0.writes = [MemoryDep('buf0', c0, {c0: 256})] + buf0.writes = [MemoryDep('buf0', c0, {c0: 256}, None)] buf0.unmet_dependencies = [] - buf0.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256})] + buf0.met_dependencies = [MemoryDep('arg0_1', c0, {c0: 256}, None)] buf0.users = [NodeUser(node=SchedulerNode(name='buf1'), can_inplace=True, is_weak=False)] buf0.group.device = cpu buf0.group.iteration = ((256,), ()) @@ -138,8 +138,8 @@ def body(self, ops): return store buf0_buf1.snodes[1] = buf1: SchedulerNode(ComputedBuffer) - 
buf1.writes = [MemoryDep('buf1', c0, {c0: 256})] - buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 256})] + buf1.writes = [MemoryDep('buf1', c0, {c0: 256}, None)] + buf1.unmet_dependencies = [MemoryDep('buf0', c0, {c0: 256}, None)] buf1.met_dependencies = [] buf1.users = [NodeUser(node=ExternKernelSchedulerNode(name='buf2'), can_inplace=False, is_weak=False)] buf1.group.device = cpu @@ -161,9 +161,9 @@ def body(self, ops): buf2: ExternKernelSchedulerNode(ExternKernelOut) -buf2.writes = [StarDep(name='buf2')] -buf2.unmet_dependencies = [StarDep(name='buf1')] -buf2.met_dependencies = [StarDep(name='arg1_1')] +buf2.writes = [StarDep(name='buf2', mode=None)] +buf2.unmet_dependencies = [StarDep(name='buf1', mode=None)] +buf2.met_dependencies = [StarDep(name='arg1_1', mode=None)] buf2.users = [NodeUser(node=OUTPUT, can_inplace=False, is_weak=False)] buf2.node.kernel = extern_kernels.mm""", ) diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py index 37461bc2c50a..f3a9026a3c80 100644 --- a/test/inductor/test_flex_attention.py +++ b/test/inductor/test_flex_attention.py @@ -1,8 +1,9 @@ # Owner(s): ["module: inductor"] import functools +import unittest from collections import namedtuple -from typing import Callable +from typing import Callable, Optional from unittest import expectedFailure, skip, skipUnless from unittest.mock import patch @@ -58,14 +59,8 @@ def create_attention(score_mod): # --------- Useful score mod functions for testing --------- - -test_score_mods = [ - _identity, - _causal, - _rel_bias, - _rel_causal, - _generate_alibi_bias(8), -] +def _inverse_causal(score, b, h, m, n): + return torch.where(m <= n, score, float("-inf")) def _times_two(score, b, h, m, n): @@ -79,13 +74,11 @@ def _squared(score, b, h, m, n): def _head_offset(dtype: torch.dtype): - """Captured Buffer - Note: this builds a score_mod with index of a type - """ + """Captured Buffer""" head_offset = torch.rand(H, device="cuda", dtype=dtype) def score_mod(score, b, h, m, n): - return score * index(head_offset, [h]) + return score * head_offset[h] return score_mod @@ -103,20 +96,19 @@ def _trig2(score, b, h, m, n): return z -def _buffer_reduced(dtype: torch.dtype): - """Reduction in captured buffer""" - batch_offsets = torch.rand(B, 8, device="cuda", dtype=dtype) - - def score_mod(score, b, h, m, n): - batch_vals = index(batch_offsets, [b]) - return score + batch_vals.sum() - - return score_mod - +test_score_mods = [ + _identity, + _times_two, + _squared, + _causal, + _inverse_causal, + _rel_bias, + _rel_causal, + _generate_alibi_bias(8), +] captured_buffers_map = { "_head_offset": _head_offset, - "_buffer_reduced": _buffer_reduced, } B = 4 @@ -125,7 +117,37 @@ def score_mod(score, b, h, m, n): D = 64 -class TestTemplatedSDPA(InductorTestCase): +def query_key_value_clones( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + dtype: torch.dtype = None, +): + """Clones the query, key, and value tensors and moves them to the specified dtype.""" + if dtype is None: + dtype = query.dtype + query_ref = query.clone().detach().to(dtype).requires_grad_(query.requires_grad) + key_ref = key.clone().detach().to(dtype).requires_grad_(key.requires_grad) + value_ref = value.clone().detach().to(dtype).requires_grad_(value.requires_grad) + return query_ref, key_ref, value_ref + + +class TestFlexAttention(InductorTestCase): + def _check_equal( + self, + golden_out: torch.Tensor, + ref_out: torch.Tensor, + compiled_out: torch.Tensor, + fudge_factor: float, + tensor_name: 
Optional[str] = None, + ): + compiled_error = (golden_out - compiled_out).abs().mean() + ref_error = (golden_out - ref_out).abs().mean() + if compiled_error > ref_error * fudge_factor: + name = tensor_name if tensor_name is not None else "" + msg = f"{name} Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." + self.assertTrue(False, msg) + def run_test( self, score_mod: Callable, @@ -137,26 +159,168 @@ def run_test( ): sdpa_partial = create_attention(score_mod) compiled_sdpa = torch.compile(sdpa_partial) - q = torch.randn((B, H, S, D), dtype=dtype, device="cuda") - k = torch.randn((B, H, S, D), dtype=dtype, device="cuda") - v = torch.randn((B, H, S, D), dtype=dtype, device="cuda") - golden_out = sdpa_partial( - q.to(torch.float64), k.to(torch.float64), v.to(torch.float64) - ) - ref_out = sdpa_partial(q, k, v) + q = torch.randn((B, H, S, D), dtype=dtype, device="cuda", requires_grad=True) + k = torch.randn((B, H, S, D), dtype=dtype, device="cuda", requires_grad=True) + v = torch.randn((B, H, S, D), dtype=dtype, device="cuda", requires_grad=True) + q_ref, k_ref, v_ref = query_key_value_clones(q, k, v) + q_gold, k_gold, v_gold = query_key_value_clones(q, k, v, torch.float64) + golden_out = sdpa_partial(q_gold, k_gold, v_gold) + ref_out = sdpa_partial(q_ref, k_ref, v_ref) compiled_out = compiled_sdpa(q, k, v) - compiled_error = (golden_out - compiled_out).abs().mean() - ref_error = (golden_out - ref_out).abs().mean() + backward_grad = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + + golden_out.backward(backward_grad.to(torch.float64)) + ref_out.backward(backward_grad) + compiled_out.backward(backward_grad) + + with torch.no_grad(): + # Note, it seems like we really are less accurate than the float32 + # computation, likely due to the online softmax + if dtype == torch.float32: + fudge_factor = 10.0 + else: + fudge_factor = 1.1 + + # Checkout output + self._check_equal(golden_out, ref_out, compiled_out, fudge_factor, "Out") + + # Check gradients + q_fudge_factor = 2.5 * fudge_factor + self._check_equal( + q_gold.grad, q_ref.grad, q.grad, q_fudge_factor, "Grad_Query" + ) + k_fudge_factor = 4 * fudge_factor + self._check_equal( + k_gold.grad, k_ref.grad, k.grad, k_fudge_factor, "Grad_Key" + ) + v_fudge_factor = 8 * fudge_factor + self._check_equal( + v_gold.grad, v_ref.grad, v.grad, v_fudge_factor, "Grad_Value" + ) + + def run_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. 
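# Aside on the counter check used just below: resetting dynamo and then
# asserting on torch._dynamo.utils.counters["frames"]["ok"] is a general way
# to prove that later calls did not trigger a recompile. A standalone sketch
# (illustrative only, not part of this patch; the lambda and shapes are made up):

import torch

torch._dynamo.reset()
compiled = torch.compile(lambda x: x * 2, dynamic=True)
compiled(torch.randn(4))  # first call compiles one frame: frames["ok"] -> 1
compiled(torch.randn(8))  # new shape, but dynamic=True should avoid a recompile
assert torch._dynamo.utils.counters["frames"]["ok"] == 1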
+ # We check dynamo counters["frames"]["ok"] to ensure there is no re-compilation. + torch._dynamo.reset() + # Compiling with dynamic shape in the first batch. + compiled_sdpa = torch.compile(sdpa_partial, dynamic=True) + compiled_out1 = compiled_sdpa(q1, k1, v1) + # Note, it seems like we really are less accurate than the float32 # computation, likely due to the online softmax if dtype == torch.float32: fudge_factor = 10.0 else: fudge_factor = 1.1 - if compiled_error > ref_error * fudge_factor: - msg = f"Compiled error {compiled_error} is greater than ref error {ref_error} by more than {fudge_factor}X." - self.assertTrue(False, msg) + + self._check_equal(golden_out1, ref_out1, compiled_out1, fudge_factor) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # No re-compilation, use the compiled dynamic shape version. + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, fudge_factor) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + def run_automatic_dynamic_test( + self, + score_mod: Callable, + dtype: torch.dtype = torch.float16, + B: int = B, + H: int = H, + S: int = S, + D: int = D, + ): + sdpa_partial = create_attention(score_mod) + # The first eager batch, shape (B, H, S, D) + q1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v1 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out1 = sdpa_partial( + q1.to(torch.float64), k1.to(torch.float64), v1.to(torch.float64) + ) + ref_out1 = sdpa_partial(q1, k1, v1) + + # The second eager batch, shape (B * 2, H, S / 2, D) + B = int(B * 2) + S = int(S / 2) + q2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v2 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out2 = sdpa_partial( + q2.to(torch.float64), k2.to(torch.float64), v2.to(torch.float64) + ) + ref_out2 = sdpa_partial(q2, k2, v2) + + # The third eager batch, shape (B * 4, H, S / 4, D) + B = int(B * 2) + S = int(S / 2) + q3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + k3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + v3 = torch.randn((B, H, S, D), dtype=dtype, device="cuda") + golden_out3 = sdpa_partial( + q3.to(torch.float64), k3.to(torch.float64), v3.to(torch.float64) + ) + ref_out3 = sdpa_partial(q3, k3, v3) + + # Need to clear dynamo counters, since flex attention eager mode also uses dynamo tracing. + # We check dynamo counters["frames"]["ok"] to ensure: + # 1, the first batch is compiled with static shape + # 2, the second batch is compiled with dynamic shape + # 3, no re-compilation in the third batch + torch._dynamo.reset() + + # Note, it seems like we really are less accurate than the float32 + # computation, likely due to the online softmax + if dtype == torch.float32: + fudge_factor = 10.0 + else: + fudge_factor = 1.1 + + # The first batch. + compiled_sdpa = torch.compile(sdpa_partial) + compiled_out1 = compiled_sdpa(q1, k1, v1) + self._check_equal(golden_out1, ref_out1, compiled_out1, fudge_factor) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 1) + + # The second batch (automatic dynamic). + compiled_out2 = compiled_sdpa(q2, k2, v2) + self._check_equal(golden_out2, ref_out2, compiled_out2, fudge_factor) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) + + # The third batch (no re-compilation). 
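# For context, the three-batch sequence above exercises "automatic dynamic
# shapes": the first shape compiles statically, the second distinct shape
# recompiles once with dynamic dimensions, and the third reuses that dynamic
# graph. A minimal standalone sketch (illustrative, not from this patch):

import torch

@torch.compile
def f(x):
    return x.sin()

torch._dynamo.reset()
f(torch.randn(8))   # static compile: frames["ok"] becomes 1
f(torch.randn(16))  # recompile with a dynamic dim: frames["ok"] becomes 2
f(torch.randn(32))  # served by the dynamic graph: frames["ok"] stays 2
assert torch._dynamo.utils.counters["frames"]["ok"] == 2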
+ compiled_out3 = compiled_sdpa(q3, k3, v3) + self._check_equal(golden_out3, ref_out3, compiled_out3, fudge_factor) + self.assertEqual(torch._dynamo.utils.counters["frames"]["ok"], 2) @supported_platform @common_utils.parametrize("dtype", test_dtypes) @@ -164,6 +328,20 @@ def run_test( def test_builtin_score_mods(self, dtype: torch.dtype, score_mod: Callable): self.run_test(score_mod, dtype) + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_dynamic(self, dtype: torch.dtype, score_mod: Callable): + self.run_dynamic_test(score_mod, dtype) + + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + @common_utils.parametrize("score_mod", test_score_mods) + def test_builtin_score_mods_automatic_dynamic( + self, dtype: torch.dtype, score_mod: Callable + ): + self.run_automatic_dynamic_test(score_mod, dtype) + @supported_platform @common_utils.parametrize("dtype", test_dtypes) def test_skip_odd_keys(self, dtype: torch.dtype): @@ -195,6 +373,21 @@ def score_mod(score, b, h, m, n): self.run_test(score_mod, dtype) + @supported_platform + @common_utils.parametrize("dtype", test_dtypes) + def test_captured_buffers_all_dims(self, dtype: torch.dtype): + head_scale = torch.randn(H, device="cuda") + batch_scale = torch.randn(B, device="cuda") + tok_scale = torch.randn(S, device="cuda") + + def all_bias(score, batch, head, token_q, token_kv): + score = score + tok_scale[token_q] + score = score + batch_scale[batch] + score = score + head_scale[head] + return score + + self.run_test(all_bias, dtype) + @supported_platform @common_utils.parametrize("dtype", test_dtypes_fast) def test_seq_masking(self, dtype): @@ -289,8 +482,53 @@ def natten_mask(score, b, h, q, kv): self.run_test(natten_mask, dtype) @supported_platform - @expectedFailure @common_utils.parametrize("dtype", test_dtypes_fast) + def test_subgraph_respect_decompostion(self, dtype): + from torch._decomp import core_aten_decompositions + from torch.fx.experimental.proxy_tensor import make_fx + + def score_mod_func(score, b, h, q, kv): + return score - q // (1 + kv) + + make_tensor = functools.partial( + torch.randn, + (2, 2, 128, 4), + device="cuda", + dtype=torch.float64, + requires_grad=True, + ) + query, key, value = make_tensor(), make_tensor(), make_tensor() + # floor_div is not decomposed in decompostion_table is empty + gm = make_fx(_flex_attention, decomposition_table={})( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + floor_divide = torch.ops.aten.floor_divide.default(arg3_1, add); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, floor_divide); arg0_1 = floor_divide = None + return sub""", + ) + + # floor_div is decomposed for core_aten_decompositions + gm = make_fx(_flex_attention, decomposition_table=core_aten_decompositions())( + query, key, value, score_mod_func + ) + self.assertExpectedInline( + gm.sdpa_score0.code.strip(), + """\ +def forward(self, arg0_1, arg1_1, arg2_1, arg3_1, arg4_1): + add = torch.ops.aten.add.Tensor(arg4_1, 1); arg4_1 = None + div = torch.ops.aten.div.Tensor_mode(arg3_1, add, rounding_mode = 'floor'); arg3_1 = add = None + sub = torch.ops.aten.sub.Tensor(arg0_1, div); arg0_1 = div = None + return sub""", + ) + + @supported_platform + @common_utils.parametrize("dtype", test_dtypes_fast) + 
@unittest.skip("Silu decomp failing for full in backwards") def test_silu_on_score(self, dtype): def silu_score(score, b, h, q, kv): return torch.nn.functional.silu(score) @@ -430,23 +668,6 @@ def njt_score_mod(qk, b, h, q, kv): self.run_test(causal_njt, dtype) - @supported_platform - def test_backwards_fails(self): - make_tensor = functools.partial( - torch.randn, - (B, H, S, D), - dtype=torch.float32, - device="cuda", - requires_grad=True, - ) - q, k, v = make_tensor(), make_tensor(), make_tensor() - func = torch.compile(_flex_attention, backend="inductor", fullgraph=True) - with self.assertRaisesRegex( - AssertionError, "flex_attention_backward is not an OpOverload" - ): - out = func(q, k, v, _identity) - out.backward(torch.ones_like(out)) - @supported_platform def test_mixed_dtypes_fails(self): query = torch.randn((1, 1, 1024, 64), dtype=torch.float32, device="cuda") @@ -474,6 +695,7 @@ def score_mod(score, b, h, m, n): self.run_test(score_mod) @supported_platform + @skip("TODO: Figure out why this is erroring") @patch.object(torch._inductor.config, "max_autotune", True) def test_max_autotune_with_captured(self): head_scale = torch.randn(H, device="cuda") @@ -609,7 +831,7 @@ def test_aot_eager_gradcheck(self, score_mod): ) @supported_platform - @common_utils.parametrize("score_mod_name", ["_head_offset", "_buffer_reduced"]) + @common_utils.parametrize("score_mod_name", ["_head_offset"]) @common_utils.parametrize("mode", ["eager", "aot_eager"]) def test_captured_score_mod_aot_eager_gradcheck( self, score_mod_name: str, mode: str @@ -697,13 +919,10 @@ def debug_compile_fx_inner(graph, example_inputs, *args, **kwargs): joint_graph, """\ class GraphModule(torch.nn.Module): - def forward(self, primals_1: "f64[2, 2, 8, 4]", primals_2: "f64[2, 2, 8, 4]", primals_3: "f64[2, 2, 8, 4]", """ - + """alias_5: "f64[2, 2, 8, 4]", alias_7: "f32[2, 2, 8]", tangents_1: "f64[2, 2, 8, 4]"): + def forward(self, primals_1: "f64[2, 2, 8, 4]", primals_2: "f64[2, 2, 8, 4]", primals_3: "f64[2, 2, 8, 4]", alias_3: "f64[2, 2, 8, 4]", alias_5: "f32[2, 2, 8]", tangents_1: "f64[2, 2, 8, 4]"): fw_graph = self.fw_graph joint_graph = self.joint_graph - flex_attention_backward = torch.ops.higher_order.flex_attention_backward(primals_1, primals_2, """ - + """primals_3, alias_5, alias_7, tangents_1, fw_graph, joint_graph); primals_1 = primals_2 = primals_3 = alias_5 """ - + """= alias_7 = tangents_1 = fw_graph = joint_graph = None + flex_attention_backward = torch.ops.higher_order.flex_attention_backward(primals_1, primals_2, primals_3, alias_3, alias_5, tangents_1, fw_graph, joint_graph); primals_1 = primals_2 = primals_3 = alias_3 = alias_5 = tangents_1 = fw_graph = joint_graph = None getitem_2: "f64[2, 2, 8, 4]" = flex_attention_backward[0] getitem_3: "f64[2, 2, 8, 4]" = flex_attention_backward[1] getitem_4: "f64[2, 2, 8, 4]" = flex_attention_backward[2]; flex_attention_backward = None @@ -721,11 +940,11 @@ def forward(self, arg0_1: "f64[]", arg1_1: "i32[]", arg2_1: "i32[]", arg3_1: "i3 mul_2: "f64[]" = torch.ops.aten.mul.Tensor(arg5_1, arg0_1); arg5_1 = arg0_1 = None add: "f64[]" = torch.ops.aten.add.Tensor(mul_2, mul_1); mul_2 = mul_1 = None return [add, None, None, None, None] -""", +""", # noqa: B950 ) -common_utils.instantiate_parametrized_tests(TestTemplatedSDPA) +common_utils.instantiate_parametrized_tests(TestFlexAttention) if __name__ == "__main__": from torch._inductor.test_case import run_tests diff --git a/test/inductor/test_memory_planning.py b/test/inductor/test_memory_planning.py index 
1bd546e5b4df..1ec1dd9f89e9 100644 --- a/test/inductor/test_memory_planning.py +++ b/test/inductor/test_memory_planning.py @@ -2,6 +2,8 @@ import sys +import unittest + from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, skipIfRocm from torch.testing._internal.inductor_utils import HAS_CUDA @@ -13,14 +15,12 @@ sys.exit(0) raise unittest.SkipTest("requires sympy/functorch/filelock") # noqa: F821 -import unittest - import torch -from test_torchinductor import run_and_get_cpp_code from torch._C import FileCheck from torch._dynamo.utils import same from torch._inductor import config from torch._inductor.test_case import run_tests, TestCase +from torch._inductor.utils import run_and_get_cpp_code from torch.export import Dim from torch.utils._triton import has_triton diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index cbf9dd89c506..756de35df84c 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -1589,6 +1589,144 @@ def test_qlinear_gelu_int8_mixed_bf16(self): (torch.randn((2, 4)),), gelu, int8_mixed_bf16=True ) + def _qlinear_add_cpu_test_helper(self, use_relu=False, int8_mixed_bf16=False): + r"""
+        This test case will quantize two consecutive Linear->Add(->relu) patterns as:
+                 X
+               /   \
+       linear(X)   linear(X)
+               \   /
+                Add
+                 |
+          Optional(relu)
+               /   \
+       linear(X)   linear(X)
+               \   /
+                Add
+                 |
+          Optional(relu)
+                 |
+                 Y
+        """
+ + def fake_quant(x): + # to produce a float32 result as extra input + qlib = torch.ops.quantized_decomposed + x = qlib.quantize_per_tensor.default(x, 0.0166785, 42, 0, 255, torch.uint8) + x = qlib.dequantize_per_tensor.default( + x, 0.0166785, 42, 0, 255, torch.uint8 + ) + return x + + class M(torch.nn.Module): + def __init__( + self, + add_fn, + use_relu, + fake_quant_before_extra_input, + ): + super().__init__() + self.linear1 = torch.nn.Linear(4, 4) + self.linear2 = torch.nn.Linear(4, 4) + self.add_fn = add_fn + self.relu = torch.nn.ReLU() + self.linear3 = torch.nn.Linear(4, 4) + self.linear4 = torch.nn.Linear(4, 4) + self.add_fn2 = add_fn + self.relu2 = torch.nn.ReLU() + self.use_relu = use_relu + self.fake_quant_before_extra_input = fake_quant_before_extra_input + + def forward(self, x): + x1 = self.linear1(x) + x2 = self.linear2(x) + if self.fake_quant_before_extra_input: + x2 = fake_quant(x2) + tmp = self.add_fn(x1, x2) + if self.use_relu: + tmp = self.relu(tmp) + tmp1 = self.linear3(tmp) + tmp2 = self.linear4(tmp) + if self.fake_quant_before_extra_input: + tmp2 = fake_quant(tmp2) + res = self.add_fn2(tmp1, tmp2) + if self.use_relu: + res = self.relu2(res) + return res + + add_fn_list = [ + lambda x, y: x + y, + lambda x, y: y + x, + lambda x, y: x.add_(y), + lambda x, y: y.add_(x), + ] + fake_quant_x2_list = [False, True] if int8_mixed_bf16 else [False] + cases = itertools.product(add_fn_list, fake_quant_x2_list) + for add_fn, fq_x2 in cases: + mod = M(add_fn, use_relu, fq_x2).eval() + v = torch.randn((4, 4), dtype=torch.float32, requires_grad=False).add(1) + + def matcher_check_fn(): + # 1. Dequant-linear pattern matched in quantization weight prepack * 4 + self.assertEqual( + counters["inductor"]["qlinear_weight_prepack_matcher_count"], 4 + ) + # pattern = [dequant_per_tensor, (convert_dtype), dequant_per_channel, (convert_dtype), permute, addmm] + nodes_per_match = 6 if int8_mixed_bf16 else 4 + self.assertEqual( + counters["inductor"]["qlinear_weight_prepack_matcher_nodes"], + 4 * nodes_per_match, + ) + # 2.
Qlinear Binary Unary fusion in post-grad fusion pass * 2 + self.assertEqual( + counters["inductor"]["qlinear_binary_matcher_count"], 2 + ) + # Two linear-binary patterns are matched + # matched pattern1 = [qlinear, add, (convert dtype), (relu), quantize_per_tensor] + # matched pattern2 = [qlinear, add, (convert dtype), (relu)] + # If add_fn is x.add_(y), x is bf16 and y is fp32, there is a to_bf16 node after binary + to_bf16_after_binary = 2 * (add_fn == add_fn_list[2] and fq_x2) + self.assertEqual( + counters["inductor"]["qlinear_binary_matcher_nodes"], + 5 + 2 * use_relu + to_bf16_after_binary, + ) + + for is_qat in [False, True]: + self._test_common( + mod, + (v,), + check_quantization=True, + check_autocast=torch.bfloat16 if int8_mixed_bf16 else torch.float, + matcher_check_fn=matcher_check_fn, + is_qat=is_qat, + ) + + @skipIfNoDynamoSupport + @skipIfNoONEDNN + @skipIfRocm + def test_qlinear_add_cpu(self): + self._qlinear_add_cpu_test_helper() + + @skipIfNoDynamoSupport + @skipIfNoONEDNNBF16 + @skipIfNoONEDNN + @skipIfRocm + def test_qlinear_add_int8_mixed_bf16(self): + self._qlinear_add_cpu_test_helper(int8_mixed_bf16=True) + + @skipIfNoDynamoSupport + @skipIfNoONEDNN + @skipIfRocm + def test_qlinear_add_relu_cpu(self): + self._qlinear_add_cpu_test_helper(use_relu=True) + + @skipIfNoDynamoSupport + @skipIfNoONEDNNBF16 + @skipIfNoONEDNN + @skipIfRocm + def test_qlinear_add_relu_int8_mixed_bf16(self): + self._qlinear_add_cpu_test_helper(use_relu=True, int8_mixed_bf16=True) + def _qlinear_dequant_promotion_cpu_test_helper( self, inputs, diff --git a/test/inductor/test_pad_mm.py b/test/inductor/test_pad_mm.py index 5aeb4d01edbd..bb37368f9567 100644 --- a/test/inductor/test_pad_mm.py +++ b/test/inductor/test_pad_mm.py @@ -5,10 +5,15 @@ import torch._inductor.config as inductor_config from torch._dynamo.testing import rand_strided -from torch._inductor.fx_passes.pad_mm import get_alignment_size, get_padded_length +from torch._inductor.fx_passes.pad_mm import ( + get_alignment_size, + get_pad_cache, + get_padded_length, + should_pad_common, +) from torch._inductor.test_case import run_tests, TestCase -from torch._inductor.utils import run_and_get_code +from torch._inductor.utils import fresh_inductor_cache, run_and_get_code from torch.testing import FileCheck from torch.testing._internal.inductor_utils import HAS_CUDA @@ -125,7 +130,7 @@ def forward(self, a, b): b = rand_strided((K, N), (1, K), device="cuda", dtype=torch.float32) # TODO: Getting the alignment right requires pattern matcher to # run on newly added nodes - aligned_m = get_padded_length(M, get_alignment_size(a)) + M - 3 + aligned_m = get_padded_length(M, get_alignment_size(a)) + M torch._dynamo.mark_dynamic(a, 1) torch._dynamo.mark_dynamic(b, 0) with unittest.mock.patch( @@ -164,6 +169,16 @@ def forward(self, a, b): res2, (code,) = run_and_get_code(compiled_fn, a, b) self.assertEqual(res1, res2) + @inductor_config.patch(force_shape_pad=True) + def test_zero_dim(self): + def addmm(x, a, b): + return torch.addmm(x, a, b) + + x = torch.randn(100).cuda() + a = torch.randn(0, 10).cuda() + b = torch.randn(10, 100).cuda() + self.assertEqual(torch.compile(addmm)(x, a, b), addmm(x, a, b)) + @inductor_config.patch(max_autotune=True, max_autotune_gemm_backends="TRITON") def test_pad_bmm_dyn_b(self): B = 10 @@ -312,6 +327,103 @@ def forward(self, a, b, c): FileCheck().check(f"K = {K}").run(code) self.assertEqual(res1, res2) + @inductor_config.patch(force_shape_pad=True) + def test_pad_single_cat(self): + @torch.compile() + def foo(x, y): +
return x @ y + + inps = [torch.rand([5, 5], device="cuda") for _ in range(2)] + out = foo(*inps) + self.assertEqual(out, inps[0] @ inps[1]) + + @inductor_config.patch(force_shape_pad=True) + @fresh_inductor_cache() + def test_pad_addmm_2d_bias(self): + @torch.compile() + def foo(input, x, y): + return torch.ops.aten.addmm(input, x, y) + + for a in [1, 4]: + for b in [1, 6]: + inps = ( + torch.rand([a, b], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + for a in [1, 6]: + inps = ( + torch.rand([a], device="cuda"), + torch.rand([4, 5], device="cuda"), + torch.rand([5, 6], device="cuda"), + ) + out = foo(*inps) + out_eager = torch.ops.aten.addmm(*inps) + self.assertEqual(out, out_eager) + + @inductor_config.patch(force_shape_pad=True) + def test_pad_batch(self): + m = 6 + n = 9 + k = 11 + batch_size = 3 + mat1 = torch.ones((batch_size, m, k), device="cuda", dtype=torch.float16) + mat2 = torch.ones((batch_size, k, n), device="cuda", dtype=torch.float16) + expected_alignment = get_alignment_size(mat1) + + assert expected_alignment == 8, "Alignment for float16 should be 8" + assert should_pad_common( + mat1, mat2 + ), "This should pass the common padding criteria" + + @torch.compile() + def bmm(mat1, mat2): + return torch.bmm(mat1, mat2) + + res2, (code,) = run_and_get_code(bmm, mat1, mat2) + bmm_expected_result = torch.bmm(mat1, mat2) + # in call code, expect to see a single pad per input, and then we should see padded allocation for output + FileCheck().check("del async_compile").check_count( + ".run(", 2, exactly=True + ).check("empty_strided_cuda((3, 8, 16)").run(code) + + assert torch.allclose( + res2, bmm_expected_result + ), "BMM results are not identical" + + @fresh_inductor_cache() + def test_exclude_padding(self): + @torch.compile() + def mm(a, b): + return a @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + self.assertTrue(len(local_cache) == 2) + FileCheck().check_count("exclude_pad:False", 2, exactly=True).run( + repr(local_cache) + ) + + @torch.compile() + def mm(a, b): + return (a + 1) @ b + + mm(torch.rand([25, 25], device="cuda"), torch.rand([25, 25], device="cuda")) + local_cache = get_pad_cache().get_local_cache() + # reuse original base timing + self.assertTrue(len(local_cache) == 3) + + FileCheck().check_count("exclude_pad:False", 3, exactly=True).run( + repr(local_cache) + ) + FileCheck().check_count("exclude_pad:True", 1, exactly=True).run( + repr(local_cache) + ) + if __name__ == "__main__": if HAS_CUDA: diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py index 201db26b6044..1201e68f277e 100644 --- a/test/inductor/test_torchinductor.py +++ b/test/inductor/test_torchinductor.py @@ -36,6 +36,8 @@ expectedFailureCodegenDynamic, rand_strided, same, + skipIfPy312, + xfailIfPy312, ) from torch._inductor.codegen.common import DataTypePropagation, OptimizationContext from torch._inductor.fx_passes import pad_mm @@ -46,6 +48,7 @@ aoti_eager_cache_dir, load_aoti_eager_cache, run_and_get_code, + run_and_get_cpp_code, run_and_get_triton_code, ) from torch._inductor.virtualized import V @@ -86,7 +89,6 @@ from torch.utils import _pytree as pytree from torch.utils._python_dispatch import TorchDispatchMode from torch.utils._pytree import tree_flatten, tree_unflatten -from torch.utils._triton import has_triton from 
torch.utils.weak import WeakTensorKeyDictionary DO_PERF_TEST = os.environ.get("DO_PERF_TEST") == "1" @@ -343,29 +345,6 @@ def clone_preserve_strides(x, device=None): return out -def run_and_get_cpp_code(fn, *args, **kwargs): - # We use the patch context manager instead of using it as a decorator. - # In this way, we can ensure that the attribute is patched and unpatched correctly - # even if this run_and_get_cpp_code function is called multiple times. - with patch.object(config, "debug", True): - torch._dynamo.reset() - import io - import logging - - log_capture_string = io.StringIO() - ch = logging.StreamHandler(log_capture_string) - from torch._inductor.graph import output_code_log - - output_code_log.addHandler(ch) - prev_level = output_code_log.level - output_code_log.setLevel(logging.DEBUG) - result = fn(*args, **kwargs) - s = log_capture_string.getvalue() - output_code_log.setLevel(prev_level) - output_code_log.removeHandler(ch) - return result, s - - def check_model( self: TestCase, model, @@ -862,6 +841,86 @@ def fn(a): self.assertTrue(kernel_lib_path in kernel_libs_abs_path) + @skipCUDAIf(not SM80OrLater, "Requires sm80") + def test_eager_aoti_with_scalar(self): + namespace_name = "aten" + op_name = "add" + op_overload_name = "Tensor" + op_name_with_overload = f"{op_name}.{op_overload_name}" + + dispatch_key = "CPU" + device = torch.device("cpu") + if self.device.lower() == "cuda": + dispatch_key = "CUDA" + device = torch.device("cuda") + + # Test the difference between scalar tensor and scalar + a = torch.scalar_tensor(1.0, device=device) + b = torch.scalar_tensor(2.0, device=device) + + kernel_lib_path = aoti_compile_with_persistent_cache( + namespace_name, + op_name_with_overload, + a.device.type, + False, + torch.ops.aten.add, + args=(a, b), + kwargs={"alpha": 3.0}, + ) + self.assertTrue(Path(kernel_lib_path).exists()) + device_kernel_cache = aoti_eager_cache_dir(namespace_name, device.type) + kernel_conf = device_kernel_cache / f"{op_name_with_overload}.json" + self.assertTrue(kernel_conf.exists()) + json_data = load_aoti_eager_cache( + namespace_name, op_name_with_overload, a.device.type + ) + op_info = json_data[0] + self.assertTrue(isinstance(op_info, dict)) + self.assertTrue("meta_info" in op_info) + self.assertTrue(len(op_info["meta_info"]) == 3) + self.assertTrue(op_info["meta_info"][0]["sizes"] == []) + self.assertTrue(op_info["meta_info"][0]["strides"] == []) + # Scalar Tensor + self.assertTrue("scalar_value" not in op_info["meta_info"][0]) + self.assertTrue(op_info["meta_info"][1]["sizes"] == []) + self.assertTrue(op_info["meta_info"][1]["strides"] == []) + # Scalar Tensor + self.assertTrue("scalar_value" not in op_info["meta_info"][1]) + self.assertTrue(op_info["meta_info"][2]["sizes"] == []) + self.assertTrue(op_info["meta_info"][2]["strides"] == []) + # Scalar + self.assertTrue("scalar_value" in op_info["meta_info"][2]) + + with _scoped_library("aten", "IMPL") as torch_compile_op_lib_impl: + a = torch.randn(128, device=device) + b = torch.randn(128, device=device) + + scalar_values = [1.0, 2.0, 3.0] + ref_values = [] + for scalar_value in scalar_values: + ref_values.append(torch.add(a, b, alpha=scalar_value)) + + qualified_op_name = f"{namespace_name}::{op_name}" + _, overload_names = torch._C._jit_get_operation(qualified_op_name) + for overload_name in overload_names: + try: + reg_op_name = qualified_op_name + schema = torch._C._get_schema(reg_op_name, overload_name) + if schema.overload_name: + reg_op_name = f"{reg_op_name}.{schema.overload_name}" + 
torch_compile_op_lib_impl._impl_with_aoti_compile( # noqa: F821 + reg_op_name, dispatch_key + ) + except Exception as e: + continue + + res_values = [] + for scalar_value in scalar_values: + res_values.append(torch.add(a, b, alpha=scalar_value)) + + self.assertEqual(len(ref_values), len(res_values)) + self.assertEqual(ref_values, res_values) + @skipCUDAIf(not SM80OrLater, "Requires sm80") def test_torch_compile_override_registration(self): dynamic = False @@ -2744,6 +2803,7 @@ def fn(a, b): check_lowp=False, ) + @skipIfPy312 # segfaults @config.patch(force_mixed_mm=True) def test_mixed_mm(self): def fn(a, b): @@ -2758,6 +2818,7 @@ def fn(a, b): check_lowp=True, ) + @skipIfPy312 # segfaults @config.patch(force_mixed_mm=True) def test_mixed_mm2(self): def fn(a, b, scale, bias): @@ -6126,6 +6187,7 @@ def fn(a, b): (a, b), ) + @skipIfXpu def test_nll_loss_backward(self): def fn(a, b, c): return aten.nll_loss_backward( @@ -9448,6 +9510,7 @@ def fn(inp, offsets): self.common(fn, (inp, offsets), check_lowp=False) + @xfailIfPy312 @requires_gpu() @config.patch(assume_aligned_inputs=False) def test_config_option_dont_assume_alignment(self): @@ -9963,6 +10026,7 @@ def fn(n): res = torch.compile(fn)(20) self.assertTrue(torch.all((0 <= res) & (res < 10)).item()) + @torch._inductor.config.patch(force_shape_pad=True) def test_should_pad_bench_for_bmm(self): B = 2 M = 1024 @@ -9972,25 +10036,9 @@ def test_should_pad_bench_for_bmm(self): mat1 = torch.rand(B, M, K, device=self.device) mat2 = torch.rand(B, K, N, device=self.device) - def return_true(*args, **kwargs): - return True - - # return value of is_mm_compute_bound depends on flops and membw of - # the GPU. Mock it so the test does not becomes flaky when running - # on different GPUs. - patch1 = patch.object(pad_mm, "is_mm_compute_bound", return_true) - # mock get_cached_should_pad so the test does not rely on benchmarking - # result. 
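# This hunk replaces the benchmark mocks with inductor's force_shape_pad
# option: when it is set, the padding decision in should_pad_bench() no longer
# depends on hardware benchmarking, which keeps the test deterministic across
# GPUs. A hedged sketch of opting in (standalone, illustrative only):

import torch
import torch._inductor.config as inductor_config

@inductor_config.patch(force_shape_pad=True)
def padded_mm(a, b):
    # shapes are padded to alignment unconditionally instead of by benchmark
    return torch.compile(lambda x, y: x @ y)(a, b)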
- patch2 = patch.object(pad_mm, "get_cached_should_pad", return_true) + should_pad = pad_mm.should_pad_bench(None, mat1, mat2, torch.ops.aten.bmm) - with patch1, patch2: - should_pad = pad_mm.should_pad_bench(mat1, mat2, torch.ops.aten.bmm) - - if has_triton(): - self.assertTrue(should_pad) - else: - # should_pad_bench always returns False if has_triton returns False - self.assertFalse(should_pad) + self.assertTrue(should_pad) @parametrize( "name, op", diff --git a/test/inductor/test_torchinductor_dynamic_shapes.py b/test/inductor/test_torchinductor_dynamic_shapes.py index 9ee63752f8e0..8513e928c412 100644 --- a/test/inductor/test_torchinductor_dynamic_shapes.py +++ b/test/inductor/test_torchinductor_dynamic_shapes.py @@ -116,7 +116,7 @@ def setUp(self): if not HAS_GPU: self.skipTest("Triton not available") torch._dynamo.reset() - super(TestCase, self).setUp() + TestCase.setUp(self) # this should be in setUpClass, but device-generic tests # don't work with setUpClass well (non-deterministically the wrong setUpClass is resolved), # so put it in test setUp, it's cheap @@ -134,7 +134,7 @@ def setUp(self): def tearDown(self): self._stack.close() - super(TestCase, self).tearDown() + TestCase.tearDown(self) torch._dynamo.reset() def test_arange_dynamic(self, device): diff --git a/test/inductor/test_torchinductor_opinfo.py b/test/inductor/test_torchinductor_opinfo.py index fdb9a8c37a47..9bd873ac747b 100644 --- a/test/inductor/test_torchinductor_opinfo.py +++ b/test/inductor/test_torchinductor_opinfo.py @@ -438,6 +438,8 @@ def wrapper_noop_set_seed(op, *args, **kwargs): "mH", "rsub", "triu", + "cummax", + "cummin", } diff --git a/test/inductor/test_triton_heuristics.py b/test/inductor/test_triton_heuristics.py index 6375512cc128..d8c74c0a3841 100644 --- a/test/inductor/test_triton_heuristics.py +++ b/test/inductor/test_triton_heuristics.py @@ -14,7 +14,7 @@ except ImportError: if __name__ == "__main__": sys.exit(0) - raise unittest.SkipTest("requires triton") # noqa: TRY200 + raise unittest.SkipTest("requires triton") # noqa: B904 from torch._inductor import config from torch._inductor.runtime.hints import TRITON_MAX_BLOCK diff --git a/test/inductor/test_unbacked_symints.py b/test/inductor/test_unbacked_symints.py index 43d1307fcfa5..60ce45317238 100644 --- a/test/inductor/test_unbacked_symints.py +++ b/test/inductor/test_unbacked_symints.py @@ -188,6 +188,32 @@ def fn(x, w, a, b): expected = fn(*example_inputs) torch.testing.assert_close(actual, expected) + @skipCUDAIf(not HAS_CUDA, "requires cuda") + @dynamo_config.patch({"capture_scalar_outputs": True}) + def test_vertical_pointwise_reduction_fusion(self, device): + # Tests fusing a pointwise & reduction op with unbacked numel/rnumel. + def fn(x, y, repeats): + u0 = repeats.item() + unbacked = y.expand(u0, *y.shape) # [u0, 1, 16] + + # Note: We add x to both pointwise and reduction. Otherwise, the + # scheduler will refuse to fuse ops whose only common buffer has + # unbacked symints. 
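# Background for the test body that follows (a sketch, not part of the test):
# an "unbacked" symint is a size that only exists at runtime, e.g. the result
# of .item(), so the compiler cannot reason about it at trace time. Capturing
# it requires the capture_scalar_outputs config, roughly:

import torch

torch._dynamo.config.capture_scalar_outputs = True

@torch.compile(fullgraph=True)
def expand_by(x, repeats):
    u0 = repeats.item()  # u0 becomes an unbacked symint
    return x.expand(u0, *x.shape)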
+ pointwise = unbacked + x + reduction = torch.sum(pointwise + x) + return pointwise, reduction + + example_inputs = ( + torch.randn(32, 16).cuda(), + torch.randn(1, 16).cuda(), + torch.tensor(32).cuda(), + ) + + actual = torch.compile(fn, fullgraph=True)(*example_inputs) + expected = fn(*example_inputs) + torch.testing.assert_close(actual, expected) + self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1) + instantiate_device_type_tests( TestUnbackedSymints, globals(), only_for=(GPU_TYPE, "cpu") diff --git a/test/mobile/model_test/coverage.yaml b/test/mobile/model_test/coverage.yaml index 679fd7f85c4c..5433fea4df10 100644 --- a/test/mobile/model_test/coverage.yaml +++ b/test/mobile/model_test/coverage.yaml @@ -1048,6 +1048,7 @@ uncovered_ops: aten::__is__: 83 aten::__isnot__: 81 aten::__not__: 32 + aten::_aminmax: 4 aten::_convolution: 12 aten::_convolution.deprecated: 3 aten::_make_per_tensor_quantized_tensor: 2 diff --git a/test/mobile/model_test/model_ops.yaml b/test/mobile/model_test/model_ops.yaml index e62f5fd35117..43e4876451e3 100644 --- a/test/mobile/model_test/model_ops.yaml +++ b/test/mobile/model_test/model_ops.yaml @@ -30,6 +30,7 @@ root_operators: aten::__range_length: 106 aten::__rshift__.int: 2 aten::__xor__.bool: 16 + aten::_aminmax: 18 aten::_convolution: 27 aten::_convolution.deprecated: 3 aten::_infer_size: 9 diff --git a/test/quantization/core/experimental/test_adaround_eager.py b/test/quantization/core/experimental/test_adaround_eager.py new file mode 100644 index 000000000000..33a16f21bd0f --- /dev/null +++ b/test/quantization/core/experimental/test_adaround_eager.py @@ -0,0 +1,118 @@ +# Owner(s): ["oncall: speech_infra"] + +import copy + +import torch +import torch.nn as nn +from torch.ao.quantization.experimental.adaround_optimization import ( + AdaptiveRoundingOptimizer, +) + +from torch.nn import functional as F +from torch.quantization.observer import MinMaxObserver +from torch.testing._internal.common_quantization import QuantizationTestCase + + +def forward_wrapper(fetcher): + def forward(module, input, output): + fetcher.append(input[0].detach()) + fetcher.append(output.detach()) + + return forward + + +class TestAdaround(QuantizationTestCase): + def feedforward_callback( + self, + model, + data, + ) -> None: + model(data) + + def run_adaround(self, model, img_data): + adaround_optimizer = AdaptiveRoundingOptimizer( + model, + self.feedforward_callback, + forward_wrapper, + img_data, + max_iter=100, + batch_size=10, + ) + adarounded_model = adaround_optimizer.run_adaround() + return adarounded_model + + def get_fake_quant(self, model): + hard_fake_quant_model = copy.deepcopy(model) + for _, module in hard_fake_quant_model.named_modules(): + if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)): + weight_observer = MinMaxObserver( + quant_min=-128, + quant_max=127, + dtype=torch.qint8, + qscheme=torch.per_tensor_symmetric, + ) + weight_observer(module.weight) + scale, zero_point = weight_observer.calculate_qparams() + fake_quant_module = torch.fake_quantize_per_tensor_affine( + module.weight, + scale=scale, + zero_point=zero_point, + quant_min=-128, + quant_max=127, + ) + module.weight.data.copy_(fake_quant_module) + return hard_fake_quant_model + + def test_linear_chain(self): + class LinearChain(nn.Module): + def __init__(self): + super().__init__() + self.linear1 = nn.Linear(3, 4) + self.linear2 = nn.Linear(4, 5) + self.linear3 = nn.Linear(5, 6) + + def forward(self, x): + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) +
return x + + float_model = LinearChain() + img_data = [torch.rand(10, 3, dtype=torch.float) for _ in range(50)] + adarounded_model = self.run_adaround(float_model, img_data) + fq_model = self.get_fake_quant(float_model) + rand_input = torch.rand(10, 3) + with torch.no_grad(): + ada_out = adarounded_model(rand_input) + fq_out = fq_model(rand_input) + float_out = float_model(rand_input) + ada_loss = F.mse_loss(ada_out, float_out) + fq_loss = F.mse_loss(fq_out, float_out) + self.assertTrue(ada_loss.item() < fq_loss.item()) + + def test_conv_chain(self): + class ConvChain(nn.Module): + def __init__(self): + super().__init__() + self.conv2d1 = nn.Conv2d(3, 4, 5, 5) + self.conv2d2 = nn.Conv2d(4, 5, 5, 5) + self.conv2d3 = nn.Conv2d(5, 6, 5, 5) + + def forward(self, x): + x = self.conv2d1(x) + x = self.conv2d2(x) + x = self.conv2d3(x) + return x + + float_model = ConvChain() + img_data = [torch.rand(10, 3, 125, 125, dtype=torch.float) for _ in range(50)] + adarounded_model = self.run_adaround(float_model, img_data) + fq_model = self.get_fake_quant(float_model) + rand_input = torch.rand(10, 3, 256, 256) + with torch.no_grad(): + ada_out = adarounded_model(rand_input) + fq_out = fq_model(rand_input) + float_out = float_model(rand_input) + ada_loss = F.mse_loss(ada_out, float_out) + fq_loss = F.mse_loss(fq_out, float_out) + self.assertTrue(ada_loss.item() < fq_loss.item()) diff --git a/test/quantization/pt2e/test_quantize_pt2e.py b/test/quantization/pt2e/test_quantize_pt2e.py index b96e1ff12ac3..75cf3c444571 100644 --- a/test/quantization/pt2e/test_quantize_pt2e.py +++ b/test/quantization/pt2e/test_quantize_pt2e.py @@ -2278,5 +2278,46 @@ def validate(self, model: torch.fx.GraphModule) -> None: node_list, ) + def test_multi_users_without_output_observer(self): + """ + Test the case in which a node is used by multiple users, + and had its output observer removed. 
+ """ + + class M(torch.nn.Module): + def __init__(self): + super().__init__() + self.conv = torch.nn.Conv2d(3, 3, 3) + + def forward(self, x): + x = self.conv(x) + return x, x + 1 + + example_inputs = (torch.randn(1, 3, 5, 5),) + m = M() + m = capture_pre_autograd_graph(m, example_inputs) + quantizer = XNNPACKQuantizer().set_global( + get_symmetric_quantization_config(), + ) + m = prepare_pt2e(m, quantizer) + m(*example_inputs) + + # Remove output observer + observer_to_remove = None + for n in m.graph.nodes: + if n.op == "output": + observer_to_remove = n.args[0][0] + assert observer_to_remove.op == "call_module" + assert observer_to_remove.target.startswith("activation_post_process_") + break + assert observer_to_remove is not None + observer_to_remove.replace_all_uses_with(observer_to_remove.args[0]) + m.graph.erase_node(observer_to_remove) + m.recompile() + + # Convert should succeed + m = convert_pt2e(m) + m(*example_inputs) + instantiate_parametrized_tests(TestQuantizePT2E) diff --git a/test/run_test.py b/test/run_test.py index 5b24a0078996..71ab08199f7a 100755 --- a/test/run_test.py +++ b/test/run_test.py @@ -181,6 +181,7 @@ def __contains__(self, item): "test_jit_legacy", "test_cuda_nvml_based_avail", "test_jit_cuda_fuser", + "distributed/_tensor/test_attention", ] XPU_BLOCKLIST = [ @@ -239,7 +240,8 @@ def __contains__(self, item): "test_native_mha", # OOM "test_module_hooks", # OOM "inductor/test_max_autotune", - "inductor/test_cutlass_backend", # slow due to many nvcc compilation steps + "inductor/test_cutlass_backend", # slow due to many nvcc compilation steps, + "inductor/test_flex_attention", # OOM ] # A subset of onnx tests that cannot run in parallel due to high memory usage. ONNX_SERIAL_LIST = [ @@ -406,7 +408,7 @@ def run_test( stepcurrent_key = f"{test_file}_{test_module.shard}_{os.urandom(8).hex()}" if options.verbose: - unittest_args.append(f'-{"v"*options.verbose}') # in case of pytest + unittest_args.append(f'-{"v" * options.verbose}') # in case of pytest if test_file in RUN_PARALLEL_BLOCKLIST: unittest_args = [ diff --git a/test/test_autocast.py b/test/test_autocast.py index 2baa774f0df1..ce3d94318ccd 100644 --- a/test/test_autocast.py +++ b/test/test_autocast.py @@ -253,6 +253,14 @@ def test_generic_autocast(self): cpu_autocast_output = getattr(torch, op)(*args, **maybe_kwargs) self.assertEqual(generic_autocast_output, cpu_autocast_output) + def test_cpu_autocast_deprecated_warning(self): + with self.assertWarnsRegex( + DeprecationWarning, + r"torch.cpu.amp.autocast\(args...\) is deprecated. 
Please use torch.amp.autocast\('cpu', args...\) instead.", + ): + with torch.cpu.amp.autocast(): + _ = torch.ones(10) + class CustomLinear(torch.autograd.Function): @staticmethod diff --git a/test/test_autograd.py b/test/test_autograd.py index e20e8b18ebae..3ae37e18e7a3 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -2612,7 +2612,7 @@ def coro_no_grad(n=10): except UnrecoverableException: self.assertFalse(torch.is_grad_enabled()) - raise SecondaryException + raise SecondaryException from None @torch.enable_grad() def coro_enable_grad(n=10): @@ -2624,7 +2624,7 @@ def coro_enable_grad(n=10): except UnrecoverableException: self.assertTrue(torch.is_grad_enabled()) - raise SecondaryException + raise SecondaryException from None with torch.enable_grad(): coro = coro_no_grad() diff --git a/test/test_cuda.py b/test/test_cuda.py index 1872faee6a28..93e08eff4df6 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -1978,6 +1978,14 @@ def test_autocast_checkpointing(self): self.assertTrue(output.dtype is torch.float16) output.sum().backward() + def test_cuda_autocast_deprecated_warning(self): + with self.assertWarnsRegex( + DeprecationWarning, + r"torch.cuda.amp.autocast\(args...\) is deprecated. Please use torch.amp.autocast\('cuda', args...\) instead.", + ): + with torch.cuda.amp.autocast(): + _ = torch.ones(10) + @slowTest @unittest.skipIf(not TEST_LARGE_TENSOR, "not enough memory") @serialTest() @@ -4703,7 +4711,7 @@ class TestCudaOptims(TestCase): [ optim for optim in optim_db - if "foreach" in optim.supported_impls and "fused" in optim.supported_impls + if "foreach" in optim.supported_impls and "cuda" in optim.supports_fused_on ], dtypes=[torch.float32], ) diff --git a/test/test_datapipe.py b/test/test_datapipe.py index 0881da2f9f64..b6be7eb76b97 100644 --- a/test/test_datapipe.py +++ b/test/test_datapipe.py @@ -24,16 +24,18 @@ Set, Tuple, Type, + TYPE_CHECKING, TypeVar, Union, - TYPE_CHECKING ) + if not TYPE_CHECKING: # pyre isn't treating this the same as a typing.NamedTuple from typing_extensions import NamedTuple else: from typing import NamedTuple +import operator from unittest import skipIf import numpy as np @@ -44,37 +46,41 @@ import torch.utils.data.graph import torch.utils.data.graph_settings from torch.testing._internal.common_utils import ( - TestCase, run_tests, suppress_warnings, skipIfTorchDynamo, TEST_DILL, skipIfNoDill, + run_tests, + skipIfNoDill, + skipIfTorchDynamo, + suppress_warnings, + TEST_DILL, + TestCase, ) from torch.utils._import_utils import import_dill from torch.utils.data import ( - DataLoader, + argument_validation, DataChunk, + DataLoader, IterDataPipe, MapDataPipe, RandomSampler, - argument_validation, runtime_validation, runtime_validation_disabled, ) -from torch.utils.data.graph import traverse_dps +from torch.utils.data.datapipes.dataframe import ( + CaptureDataFrame, + dataframe_wrapper as df_wrapper, +) +from torch.utils.data.datapipes.iter.sharding import SHARDING_PRIORITIES from torch.utils.data.datapipes.utils.common import StreamWrapper from torch.utils.data.datapipes.utils.decoder import ( basichandlers as decoder_basichandlers, ) -from torch.utils.data.datapipes.utils.snapshot import ( - _simple_graph_snapshot_restoration -) -from torch.utils.data.datapipes.dataframe import CaptureDataFrame -from torch.utils.data.datapipes.dataframe import dataframe_wrapper as df_wrapper -from torch.utils.data.datapipes.iter.sharding import SHARDING_PRIORITIES -import operator +from torch.utils.data.datapipes.utils.snapshot import 
_simple_graph_snapshot_restoration +from torch.utils.data.graph import traverse_dps dill = import_dill() HAS_DILL = TEST_DILL try: - import pandas # type: ignore[import] # noqa: F401 F403 + import pandas # type: ignore[import] # noqa: F401 F403 HAS_PANDAS = True except ImportError: @@ -90,36 +96,49 @@ def create_temp_dir_and_files(): # Adding `noqa: P201` to avoid mypy's warning on not releasing the dir handle within this function. temp_dir = tempfile.TemporaryDirectory() # noqa: P201 temp_dir_path = temp_dir.name - with tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False, suffix='.txt') as f: + with tempfile.NamedTemporaryFile( + dir=temp_dir_path, delete=False, suffix=".txt" + ) as f: temp_file1_name = f.name - with tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False, suffix='.byte') as f: + with tempfile.NamedTemporaryFile( + dir=temp_dir_path, delete=False, suffix=".byte" + ) as f: temp_file2_name = f.name - with tempfile.NamedTemporaryFile(dir=temp_dir_path, delete=False, suffix='.empty') as f: + with tempfile.NamedTemporaryFile( + dir=temp_dir_path, delete=False, suffix=".empty" + ) as f: temp_file3_name = f.name - with open(temp_file1_name, 'w') as f1: - f1.write('0123456789abcdef') - with open(temp_file2_name, 'wb') as f2: + with open(temp_file1_name, "w") as f1: + f1.write("0123456789abcdef") + with open(temp_file2_name, "wb") as f2: f2.write(b"0123456789abcdef") temp_sub_dir = tempfile.TemporaryDirectory(dir=temp_dir_path) # noqa: P201 temp_sub_dir_path = temp_sub_dir.name - with tempfile.NamedTemporaryFile(dir=temp_sub_dir_path, delete=False, suffix='.txt') as f: + with tempfile.NamedTemporaryFile( + dir=temp_sub_dir_path, delete=False, suffix=".txt" + ) as f: temp_sub_file1_name = f.name - with tempfile.NamedTemporaryFile(dir=temp_sub_dir_path, delete=False, suffix='.byte') as f: + with tempfile.NamedTemporaryFile( + dir=temp_sub_dir_path, delete=False, suffix=".byte" + ) as f: temp_sub_file2_name = f.name - with open(temp_sub_file1_name, 'w') as f1: - f1.write('0123456789abcdef') - with open(temp_sub_file2_name, 'wb') as f2: + with open(temp_sub_file1_name, "w") as f1: + f1.write("0123456789abcdef") + with open(temp_sub_file2_name, "wb") as f2: f2.write(b"0123456789abcdef") - return [(temp_dir, temp_file1_name, temp_file2_name, temp_file3_name), - (temp_sub_dir, temp_sub_file1_name, temp_sub_file2_name)] + return [ + (temp_dir, temp_file1_name, temp_file2_name, temp_file3_name), + (temp_sub_dir, temp_sub_file1_name, temp_sub_file2_name), + ] -def reset_after_n_next_calls(datapipe: Union[IterDataPipe[T_co], MapDataPipe[T_co]], - n: int) -> Tuple[List[T_co], List[T_co]]: +def reset_after_n_next_calls( + datapipe: Union[IterDataPipe[T_co], MapDataPipe[T_co]], n: int +) -> Tuple[List[T_co], List[T_co]]: """ Given a DataPipe and integer n, iterate the DataPipe for n elements and store the elements into a list Then, reset the DataPipe and return a tuple of two lists @@ -221,7 +240,7 @@ def test_dir(self): wrap_fd = StreamWrapper(fd) s = set(dir(wrap_fd)) - for api in ['open', 'read', 'close']: + for api in ["open", "read", "close"]: self.assertTrue(api in s) @skipIfTorchDynamo() @@ -281,11 +300,13 @@ def tearDown(self): self.temp_sub_dir.cleanup() self.temp_dir.cleanup() except Exception as e: - warnings.warn(f"TestIterableDatasetBasic was not able to cleanup temp dir due to {str(e)}") + warnings.warn( + f"TestIterableDatasetBasic was not able to cleanup temp dir due to {str(e)}" + ) def test_listdirfiles_iterable_datapipe(self): temp_dir = self.temp_dir.name - 
datapipe: IterDataPipe = dp.iter.FileLister(temp_dir, '') + datapipe: IterDataPipe = dp.iter.FileLister(temp_dir, "") count = 0 for pathname in datapipe: @@ -294,10 +315,12 @@ def test_listdirfiles_iterable_datapipe(self): self.assertEqual(count, len(self.temp_files)) count = 0 - datapipe = dp.iter.FileLister(temp_dir, '', recursive=True) + datapipe = dp.iter.FileLister(temp_dir, "", recursive=True) for pathname in datapipe: count = count + 1 - self.assertTrue((pathname in self.temp_files) or (pathname in self.temp_sub_files)) + self.assertTrue( + (pathname in self.temp_files) or (pathname in self.temp_sub_files) + ) self.assertEqual(count, len(self.temp_files) + len(self.temp_sub_files)) temp_files = self.temp_files @@ -319,42 +342,39 @@ def test_listdirfiles_iterable_datapipe(self): def test_listdirfilesdeterministic_iterable_datapipe(self): temp_dir = self.temp_dir.name - datapipe = dp.iter.FileLister(temp_dir, '') + datapipe = dp.iter.FileLister(temp_dir, "") # The output order should be always the same. self.assertEqual(list(datapipe), list(datapipe)) - datapipe = dp.iter.FileLister(temp_dir, '', recursive=True) + datapipe = dp.iter.FileLister(temp_dir, "", recursive=True) # The output order should be always the same. self.assertEqual(list(datapipe), list(datapipe)) def test_openfilesfromdisk_iterable_datapipe(self): # test import datapipe class directly - from torch.utils.data.datapipes.iter import ( - FileLister, - FileOpener, - ) + from torch.utils.data.datapipes.iter import FileLister, FileOpener temp_dir = self.temp_dir.name - datapipe1 = FileLister(temp_dir, '') - datapipe2 = FileOpener(datapipe1, mode='b') + datapipe1 = FileLister(temp_dir, "") + datapipe2 = FileOpener(datapipe1, mode="b") count = 0 for rec in datapipe2: count = count + 1 self.assertTrue(rec[0] in self.temp_files) - with open(rec[0], 'rb') as f: + with open(rec[0], "rb") as f: self.assertEqual(rec[1].read(), f.read()) rec[1].close() self.assertEqual(count, len(self.temp_files)) # functional API - datapipe3 = datapipe1.open_files(mode='b') + datapipe3 = datapipe1.open_files(mode="b") count = 0 for rec in datapipe3: count = count + 1 self.assertTrue(rec[0] in self.temp_files) - with open(rec[0], 'rb') as f: + with open(rec[0], "rb") as f: self.assertEqual(rec[1].read(), f.read()) rec[1].close() self.assertEqual(count, len(self.temp_files)) @@ -366,13 +386,16 @@ def test_openfilesfromdisk_iterable_datapipe(self): def test_routeddecoder_iterable_datapipe(self): temp_dir = self.temp_dir.name temp_pngfile_pathname = os.path.join(temp_dir, "test_png.png") - png_data = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single) + png_data = np.array( + [[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]], [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]]], + dtype=np.single, + ) np.save(temp_pngfile_pathname, png_data) - datapipe1 = dp.iter.FileLister(temp_dir, ['*.png', '*.txt']) - datapipe2 = dp.iter.FileOpener(datapipe1, mode='b') + datapipe1 = dp.iter.FileLister(temp_dir, ["*.png", "*.txt"]) + datapipe2 = dp.iter.FileOpener(datapipe1, mode="b") def _png_decoder(extension, data): - if extension != 'png': + if extension != "png": return None return np.load(data) @@ -382,14 +405,20 @@ def _helper(prior_dp, dp, channel_first=False): self.assertFalse(inp[1].closed) for inp, rec in zip(prior_dp, dp): ext = os.path.splitext(rec[0])[1] - if ext == '.png': - expected = np.array([[[1., 0., 0.], [1., 0., 0.]], [[1., 0., 0.], [1., 0., 0.]]], dtype=np.single) + if ext == ".png": + expected = np.array( + [ + [[1.0, 0.0, 0.0], [1.0, 
0.0, 0.0]], + [[1.0, 0.0, 0.0], [1.0, 0.0, 0.0]], + ], + dtype=np.single, + ) if channel_first: expected = expected.transpose(2, 0, 1) self.assertEqual(rec[1], expected) else: - with open(rec[0], 'rb') as f: - self.assertEqual(rec[1], f.read().decode('utf-8')) + with open(rec[0], "rb") as f: + self.assertEqual(rec[1], f.read().decode("utf-8")) # Corresponding byte stream is closed by Decoder self.assertTrue(inp[1].closed) @@ -406,13 +435,30 @@ def _helper(prior_dp, dp, channel_first=False): _helper(cached, datapipe4, channel_first=True) def test_groupby_iterable_datapipe(self): - file_list = ["a.png", "b.png", "c.json", "a.json", "c.png", "b.json", "d.png", - "d.json", "e.png", "f.json", "g.png", "f.png", "g.json", "e.json", - "h.txt", "h.json"] + file_list = [ + "a.png", + "b.png", + "c.json", + "a.json", + "c.png", + "b.json", + "d.png", + "d.json", + "e.png", + "f.json", + "g.png", + "f.png", + "g.json", + "e.json", + "h.txt", + "h.json", + ] import io - datapipe1 = dp.iter.IterableWrapper([(filename, io.BytesIO(b'12345abcde')) for filename in file_list]) + datapipe1 = dp.iter.IterableWrapper( + [(filename, io.BytesIO(b"12345abcde")) for filename in file_list] + ) def group_fn(data): filepath, _ = data @@ -427,8 +473,15 @@ def order_fn(data): datapipe3 = dp.iter.Mapper(datapipe2, fn=order_fn) # type: ignore[var-annotated] expected_result = [ - ("a.png", "a.json"), ("c.png", "c.json"), ("b.png", "b.json"), ("d.png", "d.json"), - ("f.png", "f.json"), ("g.png", "g.json"), ("e.png", "e.json"), ("h.txt", "h.json")] + ("a.png", "a.json"), + ("c.png", "c.json"), + ("b.png", "b.json"), + ("d.png", "d.json"), + ("f.png", "f.json"), + ("g.png", "g.json"), + ("e.png", "e.json"), + ("h.txt", "h.json"), + ] count = 0 for rec, expected in zip(datapipe3, expected_result): @@ -436,12 +489,14 @@ def order_fn(data): self.assertEqual(os.path.basename(rec[0][0]), expected[0]) self.assertEqual(os.path.basename(rec[1][0]), expected[1]) for i in [0, 1]: - self.assertEqual(rec[i][1].read(), b'12345abcde') + self.assertEqual(rec[i][1].read(), b"12345abcde") rec[i][1].close() self.assertEqual(count, 8) # testing the keep_key option - datapipe4 = dp.iter.Grouper(datapipe1, group_key_fn=group_fn, keep_key=True, group_size=2) + datapipe4 = dp.iter.Grouper( + datapipe1, group_key_fn=group_fn, keep_key=True, group_size=2 + ) def order_fn(data): data[1].sort(key=lambda f: f[0], reverse=True) @@ -450,9 +505,15 @@ def order_fn(data): datapipe5 = dp.iter.Mapper(datapipe4, fn=order_fn) # type: ignore[var-annotated] expected_result = [ - ("a", ("a.png", "a.json")), ("c", ("c.png", "c.json")), ("b", ("b.png", "b.json")), - ("d", ("d.png", "d.json")), ("f", ("f.png", "f.json")), ("g", ("g.png", "g.json")), - ("e", ("e.png", "e.json")), ("h", ("h.txt", "h.json"))] + ("a", ("a.png", "a.json")), + ("c", ("c.png", "c.json")), + ("b", ("b.png", "b.json")), + ("d", ("d.png", "d.json")), + ("f", ("f.png", "f.json")), + ("g", ("g.png", "g.json")), + ("e", ("e.png", "e.json")), + ("h", ("h.txt", "h.json")), + ] count = 0 for rec, expected in zip(datapipe5, expected_result): @@ -461,7 +522,7 @@ def order_fn(data): self.assertEqual(rec[1][0][0], expected[1][0]) self.assertEqual(rec[1][1][0], expected[1][1]) for i in [0, 1]: - self.assertEqual(rec[1][i][1].read(), b'12345abcde') + self.assertEqual(rec[1][i][1].read(), b"12345abcde") rec[1][i][1].close() self.assertEqual(count, 8) @@ -489,7 +550,7 @@ def test_demux_mux_datapipe(self): @suppress_warnings # Suppress warning for lambda fn def test_map_with_col_file_handle_datapipe(self): 
        temp_dir = self.temp_dir.name
-        datapipe1 = dp.iter.FileLister(temp_dir, '')
+        datapipe1 = dp.iter.FileLister(temp_dir, "")
         datapipe2 = dp.iter.FileOpener(datapipe1)

         def _helper(datapipe):
@@ -507,7 +568,7 @@ def _helper(datapipe):
 @skipIfNoDataFrames
 class TestCaptureDataFrame(TestCase):
     def get_new_df(self):
-        return df_wrapper.create_dataframe([[1, 2]], columns=['a', 'b'])
+        return df_wrapper.create_dataframe([[1, 2]], columns=["a", "b"])

     def compare_capture_and_eager(self, operations):
         cdf = CaptureDataFrame()
@@ -522,7 +583,7 @@ def compare_capture_and_eager(self, operations):

     def test_basic_capture(self):
         def operations(df):
-            df['c'] = df.b + df['a'] * 7
+            df["c"] = df.b + df["a"] * 7
             # somehow swallows pandas UserWarning when `df.c = df.b + df['a'] * 7`
             return df
@@ -531,27 +592,26 @@ def operations(df):

 class TestDataFramesPipes(TestCase):
     """
-        Most of test will fail if pandas instaled, but no dill available.
-        Need to rework them to avoid multiple skips.
+    Most of these tests will fail if pandas is installed but dill is not
+    available. Need to rework them to avoid multiple skips.
     """

     def _get_datapipe(self, range=10, dataframe_size=7):
-        return NumbersDataset(range) \
-            .map(lambda i: (i, i % 3))
+        return NumbersDataset(range).map(lambda i: (i, i % 3))

     def _get_dataframes_pipe(self, range=10, dataframe_size=7):
-        return NumbersDataset(range) \
-            .map(lambda i: (i, i % 3)) \
-            ._to_dataframes_pipe(
-                columns=['i', 'j'],
-                dataframe_size=dataframe_size)
+        return (
+            NumbersDataset(range)
+            .map(lambda i: (i, i % 3))
+            ._to_dataframes_pipe(columns=["i", "j"], dataframe_size=dataframe_size)
+        )

     @skipIfNoDataFrames
     @skipIfNoDill
     # TODO(VitalyFedyunin): Decouple tests from dill by avoiding lambdas in map
     def test_capture(self):
         dp_numbers = self._get_datapipe().map(lambda x: (x[0], x[1], x[1] + 3 * x[0]))
         df_numbers = self._get_dataframes_pipe()
-        df_numbers['k'] = df_numbers['j'] + df_numbers.i * 3
+        df_numbers["k"] = df_numbers["j"] + df_numbers.i * 3
         expected = list(dp_numbers)
         actual = list(df_numbers)
         self.assertEqual(expected, actual)
@@ -599,19 +659,22 @@ def collate_i(column):

         def collate_j(column):
             return column.prod()
+
         df_numbers = self._get_dataframes_pipe(range=30).batch(3)
-        df_numbers = df_numbers.collate({'j': collate_j, 'i': collate_i})
-
-        expected_i = [3,
-                      12,
-                      21,
-                      30,
-                      39,
-                      48,
-                      57,
-                      66,
-                      75,
-                      84, ]
+        df_numbers = df_numbers.collate({"j": collate_j, "i": collate_i})
+
+        expected_i = [
+            3,
+            12,
+            21,
+            30,
+            39,
+            48,
+            57,
+            66,
+            75,
+            84,
+        ]

         actual_i = []
         for i, j in df_numbers:
@@ -631,7 +694,11 @@ def __init__(self, input_dp):

     # Prevent in-place modification
     def __iter__(self):
-        input_dp = self.input_dp if isinstance(self.input_dp, IterDataPipe) else copy.deepcopy(self.input_dp)
+        input_dp = (
+            self.input_dp
+            if isinstance(self.input_dp, IterDataPipe)
+            else copy.deepcopy(self.input_dp)
+        )

         yield from input_dp
@@ -650,6 +717,7 @@ def _fake_filter_fn(data):
 def _simple_filter_fn(data):
     return data >= 5

+
 def _fake_filter_fn_constant(constant, data):
     return data >= constant
@@ -682,7 +750,6 @@ def __call__(self, x):

 class TestFunctionalIterDataPipe(TestCase):
-
     def _serialization_test_helper(self, datapipe, use_dill):
         if use_dill:
             serialized_dp = dill.dumps(datapipe)
@@ -738,7 +805,15 @@ def _serialization_test_for_dp_with_children(self, dp1, dp2, use_dill=False):

     def test_serializable(self):
         picklable_datapipes: List = [
-            (dp.iter.Batcher, None, (3, True,), {}),
+            (
+                dp.iter.Batcher,
+                None,
+                (
+                    3,
+                    True,
+                ),
+                {},
+            ),
             (dp.iter.Collator, None,
(_fake_fn,), {}), (dp.iter.Concater, None, (dp.iter.IterableWrapper(range(5)),), {}), (dp.iter.Demultiplexer, None, (2, _simple_filter_fn), {}), @@ -767,7 +842,9 @@ def test_serializable(self): for dpipe, custom_input, dp_args, dp_kwargs in picklable_datapipes: if custom_input is None: custom_input = dp.iter.IterableWrapper(range(10)) - if dpipe in dp_skip_comparison: # Merely make sure they are picklable and loadable (no value comparison) + if ( + dpipe in dp_skip_comparison + ): # Merely make sure they are picklable and loadable (no value comparison) datapipe = dpipe(custom_input, *dp_args, **dp_kwargs) # type: ignore[call-arg] serialized_dp = pickle.dumps(datapipe) _ = pickle.loads(serialized_dp) @@ -783,9 +860,18 @@ def test_serializable_with_dill(self): """Only for DataPipes that take in a function as argument""" input_dp = dp.iter.IterableWrapper(range(10)) - datapipes_with_lambda_fn: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ + datapipes_with_lambda_fn: List[ + Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]] + ] = [ (dp.iter.Collator, (lambda_fn1,), {}), - (dp.iter.Demultiplexer, (2, lambda_fn2,), {}), + ( + dp.iter.Demultiplexer, + ( + 2, + lambda_fn2, + ), + {}, + ), (dp.iter.Filter, (lambda_fn3,), {}), (dp.iter.Grouper, (lambda_fn3,), {}), (dp.iter.Mapper, (lambda_fn1,), {}), @@ -805,9 +891,18 @@ def _fn3(x): fn1, fn2, fn3 = _local_fns() - datapipes_with_local_fn: List[Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]]] = [ + datapipes_with_local_fn: List[ + Tuple[Type[IterDataPipe], Tuple, Dict[str, Any]] + ] = [ (dp.iter.Collator, (fn1,), {}), - (dp.iter.Demultiplexer, (2, fn2,), {}), + ( + dp.iter.Demultiplexer, + ( + 2, + fn2, + ), + {}, + ), (dp.iter.Filter, (fn3,), {}), (dp.iter.Grouper, (fn3,), {}), (dp.iter.Mapper, (fn1,), {}), @@ -816,19 +911,25 @@ def _fn3(x): dp_compare_children = {dp.iter.Demultiplexer} if HAS_DILL: - for dpipe, dp_args, dp_kwargs in datapipes_with_lambda_fn + datapipes_with_local_fn: + for dpipe, dp_args, dp_kwargs in ( + datapipes_with_lambda_fn + datapipes_with_local_fn + ): if dpipe in dp_compare_children: dp1, dp2 = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] - self._serialization_test_for_dp_with_children(dp1, dp2, use_dill=True) + self._serialization_test_for_dp_with_children( + dp1, dp2, use_dill=True + ) else: datapipe = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] self._serialization_test_for_single_dp(datapipe, use_dill=True) else: msgs = ( r"^Lambda function is not supported by pickle", - r"^Local function is not supported by pickle" + r"^Local function is not supported by pickle", ) - for dps, msg in zip((datapipes_with_lambda_fn, datapipes_with_local_fn), msgs): + for dps, msg in zip( + (datapipes_with_lambda_fn, datapipes_with_local_fn), msgs + ): for dpipe, dp_args, dp_kwargs in dps: with self.assertWarnsRegex(UserWarning, msg): datapipe = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] @@ -873,7 +974,6 @@ def test_docstring(self): assert "Example:" in docstring or "Examples:" in docstring def test_iterable_wrapper_datapipe(self): - input_ls = list(range(10)) input_dp = dp.iter.IterableWrapper(input_ls) @@ -882,7 +982,9 @@ def test_iterable_wrapper_datapipe(self): # Functional Test: deep copy by default when an iterator is initialized (first element is read) it = iter(input_dp) - self.assertEqual(0, next(it)) # The deep copy only happens when the first element is read + self.assertEqual( + 0, next(it) + ) # The deep copy only happens when the first element is read 
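+        # Editor's sketch (hypothetical, not part of this patch): the lazy
+        # deep-copy contract asserted just above, in isolation. `dp` is
+        # `torch.utils.data.datapipes`, as imported at the top of this file.
+        #
+        #   source = list(range(3))
+        #   pipe = dp.iter.IterableWrapper(source)  # deepcopy=True by default
+        #   it = iter(pipe)
+        #   next(it)            # the deep copy of `source` happens here, not at iter()
+        #   source.append(99)   # mutating the original no longer affects `it`
+        #   assert list(it) == [1, 2]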
input_ls.append(50) self.assertEqual(list(range(1, 10)), list(it)) @@ -896,7 +998,9 @@ def test_iterable_wrapper_datapipe(self): input_ls = list(range(10)) input_dp = dp.iter.IterableWrapper(input_ls) n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(input_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + input_dp, n_elements_before_reset + ) self.assertEqual(input_ls[:n_elements_before_reset], res_before_reset) self.assertEqual(input_ls, res_after_reset) @@ -912,7 +1016,9 @@ def test_concat_iterdatapipe(self): dp.iter.Concater() # Functional Test: Raises exception for non-IterDataPipe input - with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `IterDataPipe`"): + with self.assertRaisesRegex( + TypeError, r"Expected all inputs to be `IterDataPipe`" + ): dp.iter.Concater(input_dp1, ()) # type: ignore[arg-type] # Functional Test: Concatenate DataPipes as expected @@ -922,7 +1028,9 @@ def test_concat_iterdatapipe(self): # Reset Test: reset the DataPipe n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(concat_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + concat_dp, n_elements_before_reset + ) self.assertEqual(list(range(5)), res_before_reset) self.assertEqual(list(range(10)) + list(range(5)), res_after_reset) @@ -997,7 +1105,9 @@ def test_fork_iterdatapipe(self): self.assertEqual(n1, n2) # Functional Test: two child DataPipes yield deep copies with copy equals deep - dp1, dp2 = input_dp.map(_to_list).map(_to_list).fork(num_instances=2, copy="deep") + dp1, dp2 = ( + input_dp.map(_to_list).map(_to_list).fork(num_instances=2, copy="deep") + ) for n1, n2 in zip(dp1, dp2): self.assertIsNot(n1[0], n2[0]) self.assertEqual(n1, n2) @@ -1030,7 +1140,9 @@ def test_fork_iterdatapipe(self): with warnings.catch_warnings(record=True) as wa: _ = iter(dp1) # This will reset all child DataPipes self.assertEqual(len(wa), 1) - self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted") + self.assertRegex( + str(wa[0].message), r"child DataPipes are not exhausted" + ) self.assertEqual(list(range(5)), output2) # Reset Test: DataPipe resets when some of it has been read @@ -1043,7 +1155,9 @@ def test_fork_iterdatapipe(self): with warnings.catch_warnings(record=True) as wa: _ = iter(dp1) # Reset both all child DataPipe self.assertEqual(len(wa), 1) - self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertRegex( + str(wa[0].message), r"Some child DataPipes are not exhausted" + ) break with warnings.catch_warnings(record=True) as wa: for i, (n1, n2) in enumerate(zip(dp1, dp2)): @@ -1060,9 +1174,13 @@ def test_fork_iterdatapipe(self): self.assertEqual(list(range(10)), output1) self.assertEqual(list(range(10)), output2) with warnings.catch_warnings(record=True) as wa: - self.assertEqual(list(range(10)), list(dp1)) # Resets even though dp3 has not been read + self.assertEqual( + list(range(10)), list(dp1) + ) # Resets even though dp3 has not been read self.assertEqual(len(wa), 1) - self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertRegex( + str(wa[0].message), r"Some child DataPipes are not exhausted" + ) output3 = [] for i, n3 in enumerate(dp3): output3.append(n3) @@ -1070,11 +1188,15 @@ def test_fork_iterdatapipe(self): with warnings.catch_warnings(record=True) as wa: output1 = list(dp1) # Resets even though dp3 is only partially read 
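+        # Editor's sketch (hypothetical, not part of this patch): the fork()
+        # contract these assertions rely on. Child DataPipes replay the shared
+        # source and inherit its length; re-reading a child while its siblings
+        # are unfinished triggers the "not exhausted" warnings checked here.
+        #
+        #   child1, child2 = dp.iter.IterableWrapper(range(5)).fork(num_instances=2)
+        #   assert list(child1) == list(range(5))  # buffers elements for child2
+        #   assert list(child2) == list(range(5))  # served from the buffer
+        #   assert len(child1) == 5                # length inherited from the source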
            self.assertEqual(len(wa), 1)
-            self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+            self.assertRegex(
+                str(wa[0].message), r"Some child DataPipes are not exhausted"
+            )
         self.assertEqual(list(range(5)), output3)
         self.assertEqual(list(range(10)), output1)
                 break
-        self.assertEqual(list(range(10)), list(dp3))  # dp3 has to read from the start again
+        self.assertEqual(
+            list(range(10)), list(dp3)
+        )  # dp3 has to read from the start again

         # __len__ Test: Each DataPipe inherits the source datapipe's length
         dp1, dp2, dp3 = input_dp.fork(num_instances=3)
@@ -1090,7 +1212,6 @@ def test_fork_iterdatapipe(self):
         traverse_dps(dp2)  # This should not raise any error either

     def test_mux_iterdatapipe(self):
-
         # Functional Test: Elements are yielded one at a time from each DataPipe, until they are all exhausted
         input_dp1 = dp.iter.IterableWrapper(range(4))
         input_dp2 = dp.iter.IterableWrapper(range(4, 8))
@@ -1143,15 +1264,21 @@ def test_demux_iterdatapipe(self):
         self.assertEqual([(i, i + 1) for i in range(0, 10, 2)], output)

         # Functional Test: values of the same classification are lumped together, and buffer_size = 3 being too small
-        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=4)
+        dp1, dp2 = input_dp.demux(
+            num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=4
+        )
         it1 = iter(dp1)
         with self.assertRaises(BufferError):
-            next(it1)  # Buffer raises because first 5 elements all belong to the a different child
+            next(
+                it1
+            )  # Buffer raises because the first 5 elements all belong to a different child
         with self.assertRaises(BufferError):
             list(dp2)

         # Functional Test: values of the same classification are lumped together, and buffer_size = 5 is just enough
-        dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=5)
+        dp1, dp2 = input_dp.demux(
+            num_instances=2, classifier_fn=lambda x: 0 if x >= 5 else 1, buffer_size=5
+        )
         output1, output2 = list(dp1), list(dp2)
         self.assertEqual(list(range(5, 10)), output1)
         self.assertEqual(list(range(0, 5)), output2)
@@ -1161,7 +1288,7 @@ def test_demux_iterdatapipe(self):
             dp1, dp2 = input_dp.demux(
                 num_instances=2,
                 classifier_fn=lambda x: 0 if x >= 5 else 1,
-                buffer_size=-1
+                buffer_size=-1,
             )
         exp_l = 1 if HAS_DILL else 2
         self.assertEqual(len(wa), exp_l)
@@ -1188,7 +1315,9 @@ def test_demux_iterdatapipe(self):
                 with warnings.catch_warnings(record=True) as wa:
                     _ = iter(dp1)  # This will reset all child DataPipes
                     self.assertEqual(len(wa), 1)
-                    self.assertRegex(str(wa[0].message), r"child DataPipes are not exhausted")
+                    self.assertRegex(
+                        str(wa[0].message), r"child DataPipes are not exhausted"
+                    )
         self.assertEqual(list(range(1, 10, 2)), output2)

         # Reset Test: DataPipe resets when some of it has been read
@@ -1202,7 +1331,9 @@ def test_demux_iterdatapipe(self):
         with warnings.catch_warnings(record=True) as wa:
             i1 = iter(dp1)  # Reset all child DataPipes
             self.assertEqual(len(wa), 1)
-            self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted")
+            self.assertRegex(
+                str(wa[0].message), r"Some child DataPipes are not exhausted"
+            )
         for n1, n2 in zip(dp1, dp2):
             output1.append(n1)
             output2.append(n2)
@@ -1216,31 +1347,45 @@ def test_demux_iterdatapipe(self):
         output1 = list(dp1)
         self.assertEqual(list(range(0, 10, 2)), output1)
         with warnings.catch_warnings(record=True) as wa:
-            self.assertEqual(list(range(0, 10, 2)), list(dp1))  # Reset even when dp2 is not read
+            self.assertEqual(
+                list(range(0, 10, 2)), list(dp1)
+            )
# Reset even when dp2 is not read self.assertEqual(len(wa), 1) - self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertRegex( + str(wa[0].message), r"Some child DataPipes are not exhausted" + ) output2 = [] for i, n2 in enumerate(dp2): output2.append(n2) if i == 1: self.assertEqual(list(range(1, 5, 2)), output2) with warnings.catch_warnings(record=True) as wa: - self.assertEqual(list(range(0, 10, 2)), list(dp1)) # Can reset even when dp2 is partially read + self.assertEqual( + list(range(0, 10, 2)), list(dp1) + ) # Can reset even when dp2 is partially read self.assertEqual(len(wa), 1) - self.assertRegex(str(wa[0].message), r"Some child DataPipes are not exhausted") + self.assertRegex( + str(wa[0].message), r"Some child DataPipes are not exhausted" + ) break output2 = list(dp2) # output2 has to read from beginning again self.assertEqual(list(range(1, 10, 2)), output2) # Functional Test: drop_none = True - dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, - drop_none=True) + dp1, dp2 = input_dp.demux( + num_instances=2, + classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, + drop_none=True, + ) self.assertEqual([2, 4, 6, 8], list(dp1)) self.assertEqual([1, 3, 7, 9], list(dp2)) # Functional Test: drop_none = False - dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, - drop_none=False) + dp1, dp2 = input_dp.demux( + num_instances=2, + classifier_fn=lambda x: x % 2 if x % 5 != 0 else None, + drop_none=False, + ) it1 = iter(dp1) with self.assertRaises(ValueError): next(it1) @@ -1248,7 +1393,9 @@ def test_demux_iterdatapipe(self): # __len__ Test: __len__ not implemented dp1, dp2 = input_dp.demux(num_instances=2, classifier_fn=lambda x: x % 2) with self.assertRaises(TypeError): - len(dp1) # It is not implemented as we do not know length for each child in advance + len( + dp1 + ) # It is not implemented as we do not know length for each child in advance with self.assertRaises(TypeError): len(dp2) @@ -1292,7 +1439,9 @@ def fn(item, dtype=torch.float, *, sum=False): # Reset Test: DataPipe resets properly n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(map_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + map_dp, n_elements_before_reset + ) self.assertEqual(list(range(n_elements_before_reset)), res_before_reset) self.assertEqual(list(range(10)), res_after_reset) @@ -1327,11 +1476,15 @@ def fn_cmplx(d0, d1=1, *args, d2, **kwargs): p_fn_n1 = partial(fn_n1, d1=1) p_fn_cmplx = partial(fn_cmplx, d2=2) - p_fn_cmplx_large_arg = partial(fn_cmplx, d2={i: list(range(i)) for i in range(10_000)}) + p_fn_cmplx_large_arg = partial( + fn_cmplx, d2={i: list(range(i)) for i in range(10_000)} + ) def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): for constr in (list, tuple): - datapipe = dp.iter.IterableWrapper([constr((0, 1, 2)), constr((3, 4, 5)), constr((6, 7, 8))]) + datapipe = dp.iter.IterableWrapper( + [constr((0, 1, 2)), constr((3, 4, 5)), constr((6, 7, 8))] + ) if ref_fn is None: with self.assertRaises(error): res_dp = datapipe.map(fn, input_col, output_col) @@ -1342,13 +1495,18 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): self.assertEqual(list(res_dp), list(ref_dp)) # Reset self.assertEqual(list(res_dp), list(ref_dp)) + _helper(lambda data: data, fn_n1_def, 0, 1) - _helper(lambda data: (data[0], data[1], data[0] + data[1]), fn_n1_def, [0, 1], 
2) + _helper( + lambda data: (data[0], data[1], data[0] + data[1]), fn_n1_def, [0, 1], 2 + ) _helper(lambda data: data, p_fn_n1, 0, 1) _helper(lambda data: data, p_fn_cmplx, 0, 1) _helper(lambda data: data, p_fn_cmplx_large_arg, 0, 1) - _helper(lambda data: (data[0], data[1], data[0] + data[1]), p_fn_cmplx, [0, 1], 2) - _helper(lambda data: (data[0] + data[1], ), fn_n1_pos, [0, 1, 2]) + _helper( + lambda data: (data[0], data[1], data[0] + data[1]), p_fn_cmplx, [0, 1], 2 + ) + _helper(lambda data: (data[0] + data[1],), fn_n1_pos, [0, 1, 2]) # Replacing with one input column and default output column _helper(lambda data: (data[0], -data[1], data[2]), fn_11, 1) @@ -1373,7 +1531,11 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): # Replacing with multiple input columns and default output column (the left-most input column) _helper(lambda data: (data[1], data[2] + data[0]), fn_n1, [2, 0]) - _helper(lambda data: (data[0], (-data[2], -data[1], data[2] + data[1])), fn_nn, [2, 1]) + _helper( + lambda data: (data[0], (-data[2], -data[1], data[2] + data[1])), + fn_nn, + [2, 1], + ) # output_col can only be specified when input_col is not None _helper(None, fn_n1, None, 1, error=ValueError) @@ -1387,13 +1549,23 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): # The index of output column is out of range _helper(None, fn_1n, 1, 3, error=IndexError) _helper(lambda data: (data[0], data[0] + data[2], data[2]), fn_n1, [0, 2], 1) - _helper(lambda data: ((-data[1], -data[2], data[1] + data[2]), data[1], data[2]), fn_nn, [1, 2], 0) + _helper( + lambda data: ((-data[1], -data[2], data[1] + data[2]), data[1], data[2]), + fn_nn, + [1, 2], + 0, + ) # Appending the output at the end _helper(lambda data: (*data, -data[1]), fn_11, 1, -1) _helper(lambda data: (*data, (-data[1], data[1])), fn_1n, 1, -1) _helper(lambda data: (*data, data[0] + data[2]), fn_n1, [0, 2], -1) - _helper(lambda data: (*data, (-data[1], -data[2], data[1] + data[2])), fn_nn, [1, 2], -1) + _helper( + lambda data: (*data, (-data[1], -data[2], data[1] + data[2])), + fn_nn, + [1, 2], + -1, + ) # Handling built-in functions (e.g. 
`dict`, `iter`, `int`, `str`) whose signatures cannot be inspected _helper(lambda data: (str(data[0]), data[1], data[2]), str, 0) @@ -1439,7 +1611,9 @@ def fn_cmplx(d0, d1=1, *args, d2, **kwargs): return d0 + d1 p_fn_cmplx = partial(fn_cmplx, d2=2) - p_fn_cmplx_large_arg = partial(fn_cmplx, d2={i: list(range(i)) for i in range(10_000)}) + p_fn_cmplx_large_arg = partial( + fn_cmplx, d2={i: list(range(i)) for i in range(10_000)} + ) # Prevent modification in-place to support resetting def _dict_update(data, newdata, remove_idx=None): @@ -1452,9 +1626,11 @@ def _dict_update(data, newdata, remove_idx=None): def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): datapipe = dp.iter.IterableWrapper( - [{"x": 0, "y": 1, "z": 2}, - {"x": 3, "y": 4, "z": 5}, - {"x": 6, "y": 7, "z": 8}] + [ + {"x": 0, "y": 1, "z": 2}, + {"x": 3, "y": 4, "z": 5}, + {"x": 6, "y": 7, "z": 8}, + ] ) if ref_fn is None: with self.assertRaises(error): @@ -1467,26 +1643,36 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): # Reset self.assertEqual(list(res_dp), list(ref_dp)) - _helper(lambda data: data, fn_n1_def, 'x', 'y') - _helper(lambda data: data, p_fn_n1, 'x', 'y') - _helper(lambda data: data, p_fn_cmplx, 'x', 'y') - _helper(lambda data: data, p_fn_cmplx_large_arg, 'x', 'y') - _helper(lambda data: _dict_update(data, {"z": data["x"] + data["y"]}), - p_fn_cmplx, ["x", "y", "z"], "z") + _helper(lambda data: data, fn_n1_def, "x", "y") + _helper(lambda data: data, p_fn_n1, "x", "y") + _helper(lambda data: data, p_fn_cmplx, "x", "y") + _helper(lambda data: data, p_fn_cmplx_large_arg, "x", "y") + _helper( + lambda data: _dict_update(data, {"z": data["x"] + data["y"]}), + p_fn_cmplx, + ["x", "y", "z"], + "z", + ) - _helper(lambda data: _dict_update(data, {"z": data["x"] + data["y"]}), fn_n1_def, ['x', 'y'], 'z') + _helper( + lambda data: _dict_update(data, {"z": data["x"] + data["y"]}), + fn_n1_def, + ["x", "y"], + "z", + ) - _helper(None, fn_n1_pos, 'x', error=ValueError) - _helper(None, fn_n1_kwargs, 'x', error=ValueError) + _helper(None, fn_n1_pos, "x", error=ValueError) + _helper(None, fn_n1_kwargs, "x", error=ValueError) # non-default kw-only args - _helper(None, fn_kwonly, ['x', 'y'], error=ValueError) - _helper(None, fn_has_nondefault_kwonly, ['x', 'y'], error=ValueError) - _helper(None, fn_cmplx, ['x', 'y'], error=ValueError) - + _helper(None, fn_kwonly, ["x", "y"], error=ValueError) + _helper(None, fn_has_nondefault_kwonly, ["x", "y"], error=ValueError) + _helper(None, fn_cmplx, ["x", "y"], error=ValueError) # Replacing with one input column and default output column _helper(lambda data: _dict_update(data, {"y": -data["y"]}), fn_11, "y") - _helper(lambda data: _dict_update(data, {"y": (-data["y"], data["y"])}), fn_1n, "y") + _helper( + lambda data: _dict_update(data, {"y": (-data["y"], data["y"])}), fn_1n, "y" + ) # The key of input column is not in dict _helper(None, fn_1n, "a", error=KeyError) # Unmatched input columns with fn arguments @@ -1496,9 +1682,18 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): _helper(None, p_fn_n1, ["x", "y"], error=ValueError) _helper(None, fn_n1_kwargs, ["x", "y", "z"], error=ValueError) # Replacing with multiple input columns and default output column (the left-most input column) - _helper(lambda data: _dict_update(data, {"z": data["x"] + data["z"]}, ["x"]), fn_n1, ["z", "x"]) - _helper(lambda data: _dict_update( - data, {"z": (-data["z"], -data["y"], data["y"] + data["z"])}, ["y"]), fn_nn, ["z", "y"]) + _helper( + lambda 
data: _dict_update(data, {"z": data["x"] + data["z"]}, ["x"]), + fn_n1, + ["z", "x"], + ) + _helper( + lambda data: _dict_update( + data, {"z": (-data["z"], -data["y"], data["y"] + data["z"])}, ["y"] + ), + fn_nn, + ["z", "y"], + ) # output_col can only be specified when input_col is not None _helper(None, fn_n1, None, "x", error=ValueError) @@ -1508,17 +1703,49 @@ def _helper(ref_fn, fn, input_col=None, output_col=None, error=None): _helper(lambda data: _dict_update(data, {"x": -data["y"]}), fn_11, "y", ["x"]) # Replacing with one input column and single specified output column _helper(lambda data: _dict_update(data, {"x": -data["y"]}), fn_11, "y", "x") - _helper(lambda data: _dict_update(data, {"z": (-data["y"], data["y"])}), fn_1n, "y", "z") - _helper(lambda data: _dict_update(data, {"y": data["x"] + data["z"]}), fn_n1, ["x", "z"], "y") - _helper(lambda data: _dict_update( - data, {"x": (-data["y"], -data["z"], data["y"] + data["z"])}), fn_nn, ["y", "z"], "x") + _helper( + lambda data: _dict_update(data, {"z": (-data["y"], data["y"])}), + fn_1n, + "y", + "z", + ) + _helper( + lambda data: _dict_update(data, {"y": data["x"] + data["z"]}), + fn_n1, + ["x", "z"], + "y", + ) + _helper( + lambda data: _dict_update( + data, {"x": (-data["y"], -data["z"], data["y"] + data["z"])} + ), + fn_nn, + ["y", "z"], + "x", + ) # Adding new key to dict for the output _helper(lambda data: _dict_update(data, {"a": -data["y"]}), fn_11, "y", "a") - _helper(lambda data: _dict_update(data, {"a": (-data["y"], data["y"])}), fn_1n, "y", "a") - _helper(lambda data: _dict_update(data, {"a": data["x"] + data["z"]}), fn_n1, ["x", "z"], "a") - _helper(lambda data: _dict_update( - data, {"a": (-data["y"], -data["z"], data["y"] + data["z"])}), fn_nn, ["y", "z"], "a") + _helper( + lambda data: _dict_update(data, {"a": (-data["y"], data["y"])}), + fn_1n, + "y", + "a", + ) + _helper( + lambda data: _dict_update(data, {"a": data["x"] + data["z"]}), + fn_n1, + ["x", "z"], + "a", + ) + _helper( + lambda data: _dict_update( + data, {"a": (-data["y"], -data["z"], data["y"] + data["z"])} + ), + fn_nn, + ["y", "z"], + "a", + ) def test_collate_iterdatapipe(self): arrs = [[1, 2, 3], [4, 5, 6], [7, 8, 9]] @@ -1544,7 +1771,9 @@ def _collate_fn(batch, default_type=torch.float): # Reset Test: reset the DataPipe and results are still correct n_elements_before_reset = 1 - res_before_reset, res_after_reset = reset_after_n_next_calls(collate_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + collate_dp, n_elements_before_reset + ) self.assertEqual([torch.tensor(6, dtype=torch.int)], res_before_reset) for x, y in zip(arrs, res_after_reset): self.assertEqual(torch.tensor(sum(x), dtype=torch.int), y) @@ -1574,13 +1803,13 @@ def test_batch_iterdatapipe(self): self.assertEqual(len(batch_dp), 4) for i, batch in enumerate(batch_dp): self.assertEqual(len(batch), 1 if i == 3 else bs) - self.assertEqual(batch, arrs[i * bs: i * bs + len(batch)]) + self.assertEqual(batch, arrs[i * bs : i * bs + len(batch)]) # Functional Test: Drop the last batch when specified bs = 4 batch_dp = input_dp.batch(batch_size=bs, drop_last=True) for i, batch in enumerate(batch_dp): - self.assertEqual(batch, arrs[i * bs: i * bs + len(batch)]) + self.assertEqual(batch, arrs[i * bs : i * bs + len(batch)]) # __len__ test: verifying that the overall length and of each batch is correct for i, batch in enumerate(batch_dp): @@ -1595,7 +1824,9 @@ def test_batch_iterdatapipe(self): # Reset Test: Ensures that the DataPipe can properly reset 
n_elements_before_reset = 1 - res_before_reset, res_after_reset = reset_after_n_next_calls(batch_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + batch_dp, n_elements_before_reset + ) self.assertEqual([[0, 1, 2, 3]], res_before_reset) self.assertEqual([[0, 1, 2, 3], [4, 5, 6, 7]], res_after_reset) @@ -1656,7 +1887,9 @@ def test_unbatch_iterdatapipe(self): input_dp = dp.iter.IterableWrapper([[0, 1, 2], [3, 4, 5]]) unbatch_dp = input_dp.unbatch(unbatch_level=-1) n_elements_before_reset = 3 - res_before_reset, res_after_reset = reset_after_n_next_calls(unbatch_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + unbatch_dp, n_elements_before_reset + ) self.assertEqual([0, 1, 2], res_before_reset) self.assertEqual([0, 1, 2, 3, 4, 5], res_after_reset) @@ -1683,7 +1916,9 @@ def _non_bool_fn(data): # Single input_col input_col_1_dp = tuple_input_ds.filter(partial(_filter_fn, val=5), input_col=1) - self.assertEqual(list(input_col_1_dp), [(d - 1, d, d + 1) for d in range(5, 10)]) + self.assertEqual( + list(input_col_1_dp), [(d - 1, d, d + 1) for d in range(5, 10)] + ) # Multiple input_col def _mul_filter_fn(a, b): @@ -1725,7 +1960,9 @@ def _mul_filter_fn_with_kw_only_1_default(*, a, b=1): # Reset Test: DataPipe resets correctly filter_dp = input_ds.filter(partial(_filter_fn, val=5)) n_elements_before_reset = 3 - res_before_reset, res_after_reset = reset_after_n_next_calls(filter_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + filter_dp, n_elements_before_reset + ) self.assertEqual(list(range(5, 10))[:n_elements_before_reset], res_before_reset) self.assertEqual(list(range(5, 10)), res_after_reset) @@ -1738,8 +1975,9 @@ def test_sampler_iterdatapipe(self): self.assertEqual(x, i) # RandomSampler - random_sampled_dp = dp.iter.Sampler(input_dp, sampler=RandomSampler, sampler_kwargs={ - 'replacement': True}) # type: ignore[var-annotated] # noqa: B950 + random_sampled_dp = dp.iter.Sampler( + input_dp, sampler=RandomSampler, sampler_kwargs={"replacement": True} + ) # type: ignore[var-annotated] # noqa: B950 # Requires `__len__` to build SamplerDataPipe input_dp_nolen = IDP_NoLen(range(10)) @@ -1749,7 +1987,9 @@ def test_sampler_iterdatapipe(self): def test_stream_reader_iterdatapipe(self): from io import StringIO - input_dp = dp.iter.IterableWrapper([("f1", StringIO("abcde")), ("f2", StringIO("bcdef"))]) + input_dp = dp.iter.IterableWrapper( + [("f1", StringIO("abcde")), ("f2", StringIO("bcdef"))] + ) expected_res = ["abcde", "bcdef"] # Functional Test: Read full chunk @@ -1794,7 +2034,9 @@ def test_shuffler_iterdatapipe(self): # Reset Test: shuffler_dp = input_dp.shuffle() n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(shuffler_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + shuffler_dp, n_elements_before_reset + ) self.assertEqual(5, len(res_before_reset)) for x in res_before_reset: self.assertTrue(x in set(range(10))) @@ -1814,7 +2056,10 @@ def _serialization_helper(bs): for _ in range(2): next(it) shuffler_dp_copy = pickle.loads(pickle.dumps(shuffler_dp)) - _simple_graph_snapshot_restoration(shuffler_dp_copy.datapipe, shuffler_dp.datapipe._number_of_samples_yielded) + _simple_graph_snapshot_restoration( + shuffler_dp_copy.datapipe, + shuffler_dp.datapipe._number_of_samples_yielded, + ) exp = list(it) shuffler_dp_copy._snapshot_state = _SnapshotState.Restored @@ -1825,14 +2070,14 
@@ def _serialization_helper(bs): _serialization_helper(bs) def test_zip_iterdatapipe(self): - # Functional Test: raises TypeError when an input is not of type `IterDataPipe` with self.assertRaises(TypeError): dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), list(range(10))) # type: ignore[arg-type] # Functional Test: raises TypeError when an input does not have valid length - zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper( - range(10)), IDP_NoLen(range(5))) # type: ignore[var-annotated] + zipped_dp = dp.iter.Zipper( + dp.iter.IterableWrapper(range(10)), IDP_NoLen(range(5)) + ) # type: ignore[var-annotated] with self.assertRaisesRegex(TypeError, r"instance doesn't have valid length$"): len(zipped_dp) @@ -1841,21 +2086,24 @@ def test_zip_iterdatapipe(self): self.assertEqual(list(zipped_dp), exp) # Functional Test: zips the inputs properly even when lengths are different (zips to the shortest) - zipped_dp = dp.iter.Zipper(dp.iter.IterableWrapper(range(10)), dp.iter.IterableWrapper(range(5))) + zipped_dp = dp.iter.Zipper( + dp.iter.IterableWrapper(range(10)), dp.iter.IterableWrapper(range(5)) + ) # __len__ Test: length matches the length of the shortest input self.assertEqual(len(zipped_dp), 5) # Reset Test: n_elements_before_reset = 3 - res_before_reset, res_after_reset = reset_after_n_next_calls(zipped_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + zipped_dp, n_elements_before_reset + ) expected_res = [(i, i) for i in range(5)] self.assertEqual(expected_res[:n_elements_before_reset], res_before_reset) self.assertEqual(expected_res, res_after_reset) class TestFunctionalMapDataPipe(TestCase): - def _serialization_test_helper(self, datapipe, use_dill): if use_dill: serialized_dp = dill.dumps(datapipe) @@ -1922,14 +2170,18 @@ def _fn1(x): ] if HAS_DILL: - for dpipe, dp_args, dp_kwargs in datapipes_with_lambda_fn + datapipes_with_local_fn: + for dpipe, dp_args, dp_kwargs in ( + datapipes_with_lambda_fn + datapipes_with_local_fn + ): _ = dill.dumps(dpipe(input_dp, *dp_args, **dp_kwargs)) # type: ignore[call-arg] else: msgs = ( r"^Lambda function is not supported by pickle", - r"^Local function is not supported by pickle" + r"^Local function is not supported by pickle", ) - for dps, msg in zip((datapipes_with_lambda_fn, datapipes_with_local_fn), msgs): + for dps, msg in zip( + (datapipes_with_lambda_fn, datapipes_with_local_fn), msgs + ): for dpipe, dp_args, dp_kwargs in dps: with self.assertWarnsRegex(UserWarning, msg): datapipe = dpipe(input_dp, *dp_args, **dp_kwargs) # type: ignore[call-arg] @@ -1984,7 +2236,9 @@ def test_sequence_wrapper_datapipe(self): # Reset Test: reset the DataPipe seq = list(range(10)) n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(input_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + input_dp, n_elements_before_reset + ) self.assertEqual(list(range(5)), res_before_reset) self.assertEqual(seq, res_after_reset) @@ -1998,13 +2252,17 @@ def test_concat_mapdatapipe(self): with self.assertRaisesRegex(ValueError, r"Expected at least one DataPipe"): dp.map.Concater() - with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"): + with self.assertRaisesRegex( + TypeError, r"Expected all inputs to be `MapDataPipe`" + ): dp.map.Concater(input_dp1, ()) # type: ignore[arg-type] concat_dp = input_dp1.concat(input_dp2) self.assertEqual(len(concat_dp), 15) for index in range(15): - self.assertEqual(concat_dp[index], 
(list(range(10)) + list(range(5)))[index]) + self.assertEqual( + concat_dp[index], (list(range(10)) + list(range(5)))[index] + ) self.assertEqual(list(concat_dp), list(range(10)) + list(range(5))) def test_zip_mapdatapipe(self): @@ -2017,7 +2275,9 @@ def test_zip_mapdatapipe(self): dp.map.Zipper() # Functional Test: all inputs must be MapDataPipes - with self.assertRaisesRegex(TypeError, r"Expected all inputs to be `MapDataPipe`"): + with self.assertRaisesRegex( + TypeError, r"Expected all inputs to be `MapDataPipe`" + ): dp.map.Zipper(input_dp1, ()) # type: ignore[arg-type] # Functional Test: Zip the elements up as a tuples @@ -2044,14 +2304,14 @@ def test_zip_mapdatapipe(self): def test_shuffler_mapdatapipe(self): input_dp1 = dp.map.SequenceWrapper(range(10)) - input_dp2 = dp.map.SequenceWrapper({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}) + input_dp2 = dp.map.SequenceWrapper({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5}) # Functional Test: Assumes 0-index when indices is not given shuffler_dp = input_dp1.shuffle() self.assertEqual(set(range(10)), set(shuffler_dp)) # Functional Test: Custom indices are working - shuffler_dp = input_dp2.shuffle(indices=['a', 'b', 'c', 'd', 'e']) + shuffler_dp = input_dp2.shuffle(indices=["a", "b", "c", "d", "e"]) self.assertEqual(set(range(1, 6)), set(shuffler_dp)) # Functional Test: With global seed @@ -2074,7 +2334,9 @@ def test_shuffler_mapdatapipe(self): # Reset Test: shuffler_dp = input_dp1.shuffle() n_elements_before_reset = 5 - res_before_reset, res_after_reset = reset_after_n_next_calls(shuffler_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + shuffler_dp, n_elements_before_reset + ) self.assertEqual(5, len(res_before_reset)) for x in res_before_reset: self.assertTrue(x in set(range(10))) @@ -2125,21 +2387,31 @@ def test_batch_mapdatapipe(self): # Functional Test: batches top level by default batch_dp = dp.map.Batcher(input_dp, batch_size=2) - self.assertEqual([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12]], list(batch_dp)) + self.assertEqual( + [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12]], list(batch_dp) + ) # Functional Test: drop_last on command batch_dp = dp.map.Batcher(input_dp, batch_size=2, drop_last=True) - self.assertEqual([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]], list(batch_dp)) + self.assertEqual( + [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]], list(batch_dp) + ) # Functional Test: nested batching batch_dp_2 = batch_dp.batch(batch_size=3) - self.assertEqual([[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]]], list(batch_dp_2)) + self.assertEqual( + [[[0, 1], [2, 3], [4, 5]], [[6, 7], [8, 9], [10, 11]]], list(batch_dp_2) + ) # Reset Test: n_elements_before_reset = 3 - res_before_reset, res_after_reset = reset_after_n_next_calls(batch_dp, n_elements_before_reset) + res_before_reset, res_after_reset = reset_after_n_next_calls( + batch_dp, n_elements_before_reset + ) self.assertEqual([[0, 1], [2, 3], [4, 5]], res_before_reset) - self.assertEqual([[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]], res_after_reset) + self.assertEqual( + [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11]], res_after_reset + ) # __len__ Test: self.assertEqual(6, len(batch_dp)) @@ -2150,6 +2422,7 @@ def test_batch_mapdatapipe(self): # Multiple inheritance with NamedTuple is not supported for Python 3.9 _generic_namedtuple_allowed = sys.version_info >= (3, 7) and sys.version_info < (3, 9) if _generic_namedtuple_allowed: + class InvalidData(Generic[T_co], NamedTuple): name: str data: T_co 
@@ -2172,6 +2445,7 @@ def test_protocol(self): from typing import Protocol # type: ignore[attr-defined] except ImportError: from typing import _Protocol # type: ignore[attr-defined] + Protocol = _Protocol class P(Protocol): @@ -2184,8 +2458,7 @@ class A(IterDataPipe[P]): def test_subtype(self): from torch.utils.data.datapipes._typing import issubtype - basic_type = (int, str, bool, float, complex, - list, tuple, dict, set, T_co) + basic_type = (int, str, bool, float, complex, list, tuple, dict, set, T_co) for t in basic_type: self.assertTrue(issubtype(t, t)) self.assertTrue(issubtype(t, Any)) @@ -2199,15 +2472,17 @@ def test_subtype(self): else: self.assertFalse(issubtype(t1, t2)) - T = TypeVar('T', int, str) - S = TypeVar('S', bool, Union[str, int], Tuple[int, T]) # type: ignore[valid-type] - types = ((int, Optional[int]), - (List, Union[int, list]), - (Tuple[int, str], S), - (Tuple[int, str], tuple), - (T, S), - (S, T_co), - (T, Union[S, Set])) + T = TypeVar("T", int, str) + S = TypeVar("S", bool, Union[str, int], Tuple[int, T]) # type: ignore[valid-type] + types = ( + (int, Optional[int]), + (List, Union[int, list]), + (Tuple[int, str], S), + (Tuple[int, str], tuple), + (T, S), + (S, T_co), + (T, Union[S, Set]), + ) for sub, par in types: self.assertTrue(issubtype(sub, par)) self.assertFalse(issubtype(par, sub)) @@ -2232,9 +2507,9 @@ def test_subtype(self): def test_issubinstance(self): from torch.utils.data.datapipes._typing import issubinstance - basic_data = (1, '1', True, 1., complex(1., 0.)) + basic_data = (1, "1", True, 1.0, complex(1.0, 0.0)) basic_type = (int, str, bool, float, complex) - S = TypeVar('S', bool, Union[str, int]) + S = TypeVar("S", bool, Union[str, int]) for d in basic_data: self.assertTrue(issubinstance(d, Any)) self.assertTrue(issubinstance(d, T_co)) @@ -2248,20 +2523,20 @@ def test_issubinstance(self): else: self.assertFalse(issubinstance(d, t)) # list/set - dt = (([1, '1', 2], List), (set({1, '1', 2}), Set)) + dt = (([1, "1", 2], List), (set({1, "1", 2}), Set)) for d, t in dt: self.assertTrue(issubinstance(d, t)) self.assertTrue(issubinstance(d, t[T_co])) # type: ignore[index] self.assertFalse(issubinstance(d, t[int])) # type: ignore[index] # dict - d = {'1': 1, '2': 2.} + d = {"1": 1, "2": 2.0} self.assertTrue(issubinstance(d, Dict)) self.assertTrue(issubinstance(d, Dict[str, T_co])) self.assertFalse(issubinstance(d, Dict[str, int])) # tuple - d = (1, '1', 2) + d = (1, "1", 2) self.assertTrue(issubinstance(d, Tuple)) self.assertTrue(issubinstance(d, Tuple[int, str, T_co])) self.assertFalse(issubinstance(d, Tuple[int, Any])) @@ -2271,22 +2546,28 @@ def test_issubinstance(self): @skipTyping def test_compile_time(self): with self.assertRaisesRegex(TypeError, r"Expected 'Iterator' as the return"): + class InvalidDP1(IterDataPipe[int]): def __iter__(self) -> str: # type: ignore[misc, override] yield 0 with self.assertRaisesRegex(TypeError, r"Expected return type of '__iter__'"): + class InvalidDP2(IterDataPipe[Tuple]): def __iter__(self) -> Iterator[int]: # type: ignore[override] yield 0 with self.assertRaisesRegex(TypeError, r"Expected return type of '__iter__'"): + class InvalidDP3(IterDataPipe[Tuple[int, str]]): def __iter__(self) -> Iterator[tuple]: # type: ignore[override] yield (0,) if _generic_namedtuple_allowed: - with self.assertRaisesRegex(TypeError, r"is not supported by Python typing"): + with self.assertRaisesRegex( + TypeError, r"is not supported by Python typing" + ): + class InvalidDP4(IterDataPipe["InvalidData[int]"]): # type: ignore[type-arg, 
misc] pass @@ -2305,6 +2586,7 @@ def __iter__(self) -> Iterator[Tuple[int, str]]: self.assertEqual(dp1.type, dp1_.type) with self.assertRaisesRegex(TypeError, r"is not a generic class"): + class InvalidDP5(DP1[tuple]): # type: ignore[type-arg] def __iter__(self) -> Iterator[tuple]: # type: ignore[override] yield (0,) @@ -2320,7 +2602,7 @@ def __iter__(self) -> Iterator[T_co]: self.assertEqual(dp2.type, dp2_.type) class DP3(IterDataPipe[Tuple[T_co, str]]): - r""" DataPipe without fixed type with __init__ function""" + r"""DataPipe without fixed type with __init__ function""" def __init__(self, datasource): self.datasource = datasource @@ -2336,7 +2618,7 @@ def __iter__(self) -> Iterator[Tuple[T_co, str]]: self.assertEqual(dp3.type, dp3_.type) class DP4(IterDataPipe[tuple]): - r""" DataPipe without __iter__ annotation""" + r"""DataPipe without __iter__ annotation""" def __iter__(self): raise NotImplementedError @@ -2346,7 +2628,7 @@ def __iter__(self): self.assertTrue(dp4.type.param == tuple) class DP5(IterDataPipe): - r""" DataPipe without type annotation""" + r"""DataPipe without type annotation""" def __iter__(self) -> Iterator[str]: raise NotImplementedError @@ -2354,10 +2636,13 @@ def __iter__(self) -> Iterator[str]: self.assertTrue(issubclass(DP5, IterDataPipe)) dp5 = DP5() from torch.utils.data.datapipes._typing import issubtype - self.assertTrue(issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param)) + + self.assertTrue( + issubtype(dp5.type.param, Any) and issubtype(Any, dp5.type.param) + ) class DP6(IterDataPipe[int]): - r""" DataPipe with plain Iterator""" + r"""DataPipe with plain Iterator""" def __iter__(self) -> Iterator: raise NotImplementedError @@ -2367,13 +2652,13 @@ def __iter__(self) -> Iterator: self.assertTrue(dp6.type.param == int) class DP7(IterDataPipe[Awaitable[T_co]]): - r""" DataPipe with abstract base class""" + r"""DataPipe with abstract base class""" self.assertTrue(issubclass(DP7, IterDataPipe)) self.assertTrue(DP7.type.param == Awaitable[T_co]) # type: ignore[attr-defined] class DP8(DP7[str]): - r""" DataPipe subclass from a DataPipe with abc type""" + r"""DataPipe subclass from a DataPipe with abc type""" self.assertTrue(issubclass(DP8, IterDataPipe)) self.assertTrue(DP8.type.param == Awaitable[str]) # type: ignore[attr-defined] @@ -2399,12 +2684,16 @@ def __iter__(self) -> Iterator[int]: yield a # Non-DataPipe input with DataPipe hint - datasource = [(1, '1'), (2, '2'), (3, '3')] - with self.assertRaisesRegex(TypeError, r"Expected argument 'dp' as a IterDataPipe"): + datasource = [(1, "1"), (2, "2"), (3, "3")] + with self.assertRaisesRegex( + TypeError, r"Expected argument 'dp' as a IterDataPipe" + ): dp0 = DP0(datasource) dp0 = DP0(dp.iter.IterableWrapper(range(10))) - with self.assertRaisesRegex(TypeError, r"Expected type of argument 'dp' as a subtype"): + with self.assertRaisesRegex( + TypeError, r"Expected type of argument 'dp' as a subtype" + ): dp1 = DP1(dp0) @skipTyping @@ -2417,20 +2706,23 @@ def __init__(self, datasource): def __iter__(self) -> Iterator[Tuple[int, T_co]]: yield from self.ds - dss = ([(1, '1'), (2, '2')], - [(1, 1), (2, '2')]) + dss = ([(1, "1"), (2, "2")], [(1, 1), (2, "2")]) for ds in dss: dp0 = DP(ds) # type: ignore[var-annotated] self.assertEqual(list(dp0), ds) # Reset __iter__ self.assertEqual(list(dp0), ds) - dss = ([(1, 1), ('2', 2)], # type: ignore[assignment, list-item] - [[1, '1'], [2, '2']], # type: ignore[list-item] - [1, '1', 2, '2']) + dss = ( + [(1, 1), ("2", 2)], # type: ignore[assignment, list-item] + 
[[1, "1"], [2, "2"]], # type: ignore[list-item] + [1, "1", 2, "2"], + ) for ds in dss: dp0 = DP(ds) - with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): + with self.assertRaisesRegex( + RuntimeError, r"Expected an instance as subtype" + ): list(dp0) with runtime_validation_disabled(): @@ -2438,12 +2730,14 @@ def __iter__(self) -> Iterator[Tuple[int, T_co]]: with runtime_validation_disabled(): self.assertEqual(list(dp0), ds) - with self.assertRaisesRegex(RuntimeError, r"Expected an instance as subtype"): + with self.assertRaisesRegex( + RuntimeError, r"Expected an instance as subtype" + ): list(dp0) @skipTyping def test_reinforce(self): - T = TypeVar('T', int, str) + T = TypeVar("T", int, str) class DP(IterDataPipe[T]): def __init__(self, ds): @@ -2464,7 +2758,9 @@ def __iter__(self) -> Iterator[T]: dp1 = DP(ds).reinforce_type(1) # Type is not subtype - with self.assertRaisesRegex(TypeError, r"Expected 'expected_type' as subtype of"): + with self.assertRaisesRegex( + TypeError, r"Expected 'expected_type' as subtype of" + ): dp2 = DP(ds).reinforce_type(float) # Invalid data at runtime @@ -2503,7 +2799,6 @@ def __iter__(self): def __hash__(self): raise NotImplementedError - def test_simple_traverse(self): numbers_dp = NumbersDataset(size=50) shuffled_dp = numbers_dp.shuffle() @@ -2511,13 +2806,20 @@ def test_simple_traverse(self): mapped_dp = sharded_dp.map(lambda x: x * 10) graph = traverse_dps(mapped_dp) expected: Dict[Any, Any] = { - id(mapped_dp): (mapped_dp, { - id(sharded_dp): (sharded_dp, { - id(shuffled_dp): (shuffled_dp, { - id(numbers_dp): (numbers_dp, {}) - }) - }) - }) + id(mapped_dp): ( + mapped_dp, + { + id(sharded_dp): ( + sharded_dp, + { + id(shuffled_dp): ( + shuffled_dp, + {id(numbers_dp): (numbers_dp, {})}, + ) + }, + ) + }, + ) } self.assertEqual(expected, graph) @@ -2534,47 +2836,96 @@ def test_traverse_forked(self): combined_dp = dp0_upd.mux(dp1_upd, dp2) graph = traverse_dps(combined_dp) expected = { - id(combined_dp): (combined_dp, { - id(dp0_upd): (dp0_upd, { - id(dp0): (dp0, { - id(dp0.main_datapipe): (dp0.main_datapipe, { - id(dp0.main_datapipe.main_datapipe): (dp0.main_datapipe.main_datapipe, {}) - }) - }) - }), - id(dp1_upd): (dp1_upd, { - id(dp1): (dp1, { - id(dp1.main_datapipe): (dp1.main_datapipe, { - id(dp1.main_datapipe.main_datapipe): (dp1.main_datapipe.main_datapipe, {}) - }) - }) - }), - id(dp2): (dp2, { - id(dp2.main_datapipe): (dp2.main_datapipe, { - id(dp2.main_datapipe.main_datapipe): (dp2.main_datapipe.main_datapipe, {}) - }) - }) - }) + id(combined_dp): ( + combined_dp, + { + id(dp0_upd): ( + dp0_upd, + { + id(dp0): ( + dp0, + { + id(dp0.main_datapipe): ( + dp0.main_datapipe, + { + id(dp0.main_datapipe.main_datapipe): ( + dp0.main_datapipe.main_datapipe, + {}, + ) + }, + ) + }, + ) + }, + ), + id(dp1_upd): ( + dp1_upd, + { + id(dp1): ( + dp1, + { + id(dp1.main_datapipe): ( + dp1.main_datapipe, + { + id(dp1.main_datapipe.main_datapipe): ( + dp1.main_datapipe.main_datapipe, + {}, + ) + }, + ) + }, + ) + }, + ), + id(dp2): ( + dp2, + { + id(dp2.main_datapipe): ( + dp2.main_datapipe, + { + id(dp2.main_datapipe.main_datapipe): ( + dp2.main_datapipe.main_datapipe, + {}, + ) + }, + ) + }, + ), + }, + ) } self.assertEqual(expected, graph) dps = torch.utils.data.graph_settings.get_all_graph_pipes(graph) self.assertEqual(len(dps), 8) - for _dp in [numbers_dp, dp0.main_datapipe, dp0, dp1, dp2, dp0_upd, dp1_upd, combined_dp]: + for _dp in [ + numbers_dp, + dp0.main_datapipe, + dp0, + dp1, + dp2, + dp0_upd, + dp1_upd, + combined_dp, + 
]: self.assertTrue(_dp in dps) def test_traverse_mapdatapipe(self): source_dp = dp.map.SequenceWrapper(range(10)) map_dp = source_dp.map(partial(_fake_add, 1)) graph = traverse_dps(map_dp) - expected: Dict[Any, Any] = {id(map_dp): (map_dp, {id(source_dp): (source_dp, {})})} + expected: Dict[Any, Any] = { + id(map_dp): (map_dp, {id(source_dp): (source_dp, {})}) + } self.assertEqual(expected, graph) def test_traverse_mixdatapipe(self): source_map_dp = dp.map.SequenceWrapper(range(10)) iter_dp = dp.iter.IterableWrapper(source_map_dp) graph = traverse_dps(iter_dp) - expected: Dict[Any, Any] = {id(iter_dp): (iter_dp, {id(source_map_dp): (source_map_dp, {})})} + expected: Dict[Any, Any] = { + id(iter_dp): (iter_dp, {id(source_map_dp): (source_map_dp, {})}) + } self.assertEqual(expected, graph) def test_traverse_circular_datapipe(self): @@ -2583,11 +2934,15 @@ def test_traverse_circular_datapipe(self): graph = traverse_dps(circular_dp) # See issue: https://github.com/pytorch/data/issues/535 expected: Dict[Any, Any] = { - id(circular_dp): (circular_dp, { - id(circular_dp._dp): (circular_dp._dp, { - id(source_iter_dp): (source_iter_dp, {}) - }) - }) + id(circular_dp): ( + circular_dp, + { + id(circular_dp._dp): ( + circular_dp._dp, + {id(source_iter_dp): (source_iter_dp, {})}, + ) + }, + ) } self.assertEqual(expected, graph) @@ -2603,11 +2958,15 @@ def test_traverse_unhashable_datapipe(self): with self.assertRaises(NotImplementedError): hash(unhashable_dp) expected: Dict[Any, Any] = { - id(unhashable_dp): (unhashable_dp, { - id(unhashable_dp._dp): (unhashable_dp._dp, { - id(source_iter_dp): (source_iter_dp, {}) - }) - }) + id(unhashable_dp): ( + unhashable_dp, + { + id(unhashable_dp._dp): ( + unhashable_dp._dp, + {id(source_iter_dp): (source_iter_dp, {})}, + ) + }, + ) } self.assertEqual(expected, graph) @@ -2620,23 +2979,34 @@ class TestSerialization(TestCase): @skipIfNoDill def test_spawn_lambdas_iter(self): idp = dp.iter.IterableWrapper(range(3)).map(lambda x: x + 1).shuffle() - dl = DataLoader(idp, num_workers=2, shuffle=True, - multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + dl = DataLoader( + idp, + num_workers=2, + shuffle=True, + multiprocessing_context="spawn", + collate_fn=unbatch, + batch_size=1, + ) result = list(dl) self.assertEqual([1, 1, 2, 2, 3, 3], sorted(result)) @skipIfNoDill def test_spawn_lambdas_map(self): mdp = dp.map.SequenceWrapper(range(3)).map(lambda x: x + 1).shuffle() - dl = DataLoader(mdp, num_workers=2, shuffle=True, - multiprocessing_context='spawn', collate_fn=unbatch, batch_size=1) + dl = DataLoader( + mdp, + num_workers=2, + shuffle=True, + multiprocessing_context="spawn", + collate_fn=unbatch, + batch_size=1, + ) result = list(dl) self.assertEqual([1, 1, 2, 2, 3, 3], sorted(result)) class TestCircularSerialization(TestCase): class CustomIterDataPipe(IterDataPipe): - @staticmethod def add_one(x): return x + 1 @@ -2650,8 +3020,14 @@ def add_v(self, x): def __init__(self, fn, source_dp=None): self.fn = fn - self.source_dp = source_dp if source_dp else dp.iter.IterableWrapper([1, 2, 4]) - self._dp = self.source_dp.map(self.add_one).map(self.add_v).demux(2, self.classify)[0] + self.source_dp = ( + source_dp if source_dp else dp.iter.IterableWrapper([1, 2, 4]) + ) + self._dp = ( + self.source_dp.map(self.add_one) + .map(self.add_v) + .demux(2, self.classify)[0] + ) self.v = 1 def __iter__(self): @@ -2669,12 +3045,28 @@ def test_circular_serialization_with_pickle(self): src_1 = m1_1.datapipe res1 = traverse_dps(dp1) - exp_res_1 = {id(dp1): (dp1, { - 
id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - })} + exp_res_1 = { + id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}, + ) + }, + ) + }, + ), + }, + ) + } self.assertEqual(res1, exp_res_1) dp2 = TestCircularSerialization.CustomIterDataPipe(fn=_fake_fn, source_dp=dp1) self.assertTrue(list(dp2) == list(pickle.loads(pickle.dumps(dp2)))) @@ -2685,33 +3077,107 @@ def test_circular_serialization_with_pickle(self): m1_2 = m2_2.datapipe res2 = traverse_dps(dp2) - exp_res_2 = {id(dp2): (dp2, { - id(dp1): (dp1, { - id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - }), - id(child_2): (child_2, {id(dm_2): (dm_2, { - id(m2_2): (m2_2, {id(m1_2): (m1_2, { - id(dp1): (dp1, { - id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - }), - })}) - })}) - })} + exp_res_2 = { + id(dp2): ( + dp2, + { + id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + { + id(m1_1): ( + m1_1, + {id(src_1): (src_1, {})}, + ) + }, + ) + }, + ) + }, + ), + }, + ), + id(child_2): ( + child_2, + { + id(dm_2): ( + dm_2, + { + id(m2_2): ( + m2_2, + { + id(m1_2): ( + m1_2, + { + id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + { + id( + m1_1 + ): ( + m1_1, + { + id( + src_1 + ): ( + src_1, + {}, + ) + }, + ) + }, + ) + }, + ) + }, + ), + }, + ), + }, + ) + }, + ) + }, + ) + }, + ), + }, + ) + } self.assertEqual(res2, exp_res_2) class LambdaIterDataPipe(CustomIterDataPipe): - def __init__(self, fn, source_dp=None): super().__init__(fn, source_dp) - self.container = [lambda x: x + 1, ] + self.container = [ + lambda x: x + 1, + ] self.lambda_fn = lambda x: x + 1 - self._dp = self.source_dp.map(self.add_one).map(self.lambda_fn).map(self.add_v).demux(2, self.classify)[0] + self._dp = ( + self.source_dp.map(self.add_one) + .map(self.lambda_fn) + .map(self.add_v) + .demux(2, self.classify)[0] + ) @skipIfNoDill @skipIf(True, "Dill Tests") @@ -2728,12 +3194,28 @@ def test_circular_serialization_with_dill(self): res1 = traverse_dps(dp1) - exp_res_1 = {id(dp1): (dp1, { - id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - })} + exp_res_1 = { + id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}, + ) + }, + ) + }, + ), + }, + ) + } self.assertEqual(res1, exp_res_1) @@ -2746,24 +3228,92 @@ def test_circular_serialization_with_dill(self): m1_2 = m2_2.datapipe res2 = traverse_dps(dp2) - exp_res_2 = {id(dp2): (dp2, { - id(dp1): (dp1, { - id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - }), - id(child_2): (child_2, {id(dm_2): (dm_2, { - id(m2_2): (m2_2, {id(m1_2): (m1_2, { - id(dp1): (dp1, { - id(src_1): (src_1, {}), - id(child_1): (child_1, {id(dm_1): (dm_1, { - id(m2_1): (m2_1, {id(m1_1): (m1_1, {id(src_1): (src_1, {})})}) - })}) - }), - })}) - })}) - })} + exp_res_2 = { + id(dp2): ( + dp2, + { + 
id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + { + id(m1_1): ( + m1_1, + {id(src_1): (src_1, {})}, + ) + }, + ) + }, + ) + }, + ), + }, + ), + id(child_2): ( + child_2, + { + id(dm_2): ( + dm_2, + { + id(m2_2): ( + m2_2, + { + id(m1_2): ( + m1_2, + { + id(dp1): ( + dp1, + { + id(src_1): (src_1, {}), + id(child_1): ( + child_1, + { + id(dm_1): ( + dm_1, + { + id(m2_1): ( + m2_1, + { + id( + m1_1 + ): ( + m1_1, + { + id( + src_1 + ): ( + src_1, + {}, + ) + }, + ) + }, + ) + }, + ) + }, + ), + }, + ), + }, + ) + }, + ) + }, + ) + }, + ), + }, + ) + } self.assertEqual(res2, exp_res_2) @@ -2784,7 +3334,6 @@ def __iter__(self): class TestSharding(TestCase): - def _get_pipeline(self): numbers_dp = NumbersDataset(size=10) dp0, dp1 = numbers_dp.fork(num_instances=2) @@ -2819,9 +3368,13 @@ def test_sharding_groups(self): def construct_sharded_pipe(): sharding_pipes = [] dp = NumbersDataset(size=90) - dp = dp.sharding_filter(sharding_group_filter=SHARDING_PRIORITIES.DISTRIBUTED) + dp = dp.sharding_filter( + sharding_group_filter=SHARDING_PRIORITIES.DISTRIBUTED + ) sharding_pipes.append(dp) - dp = dp.sharding_filter(sharding_group_filter=SHARDING_PRIORITIES.MULTIPROCESSING) + dp = dp.sharding_filter( + sharding_group_filter=SHARDING_PRIORITIES.MULTIPROCESSING + ) sharding_pipes.append(dp) dp = dp.sharding_filter(sharding_group_filter=300) sharding_pipes.append(dp) @@ -2831,7 +3384,9 @@ def construct_sharded_pipe(): for pipe in sharding_pipes: pipe.apply_sharding(2, 1, sharding_group=SHARDING_PRIORITIES.DISTRIBUTED) - pipe.apply_sharding(5, 3, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING) + pipe.apply_sharding( + 5, 3, sharding_group=SHARDING_PRIORITIES.MULTIPROCESSING + ) pipe.apply_sharding(3, 1, sharding_group=300) actual = list(dp) @@ -2852,16 +3407,25 @@ def construct_sharded_pipe(): # Test tud.datapipes.iter.grouping.SHARDING_PRIORITIES for backward compatbility # TODO: Remove this test once tud.datapipes.iter.grouping.SHARDING_PRIORITIES is deprecated def test_sharding_groups_in_legacy_grouping_package(self): - with self.assertWarnsRegex(FutureWarning, r'Please use `SHARDING_PRIORITIES` ' - 'from the `torch.utils.data.datapipes.iter.sharding`'): - from torch.utils.data.datapipes.iter.grouping import SHARDING_PRIORITIES as LEGACY_SHARDING_PRIORITIES + with self.assertWarnsRegex( + FutureWarning, + r"Please use `SHARDING_PRIORITIES` " + "from the `torch.utils.data.datapipes.iter.sharding`", + ): + from torch.utils.data.datapipes.iter.grouping import ( + SHARDING_PRIORITIES as LEGACY_SHARDING_PRIORITIES, + ) def construct_sharded_pipe(): sharding_pipes = [] dp = NumbersDataset(size=90) - dp = dp.sharding_filter(sharding_group_filter=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED) + dp = dp.sharding_filter( + sharding_group_filter=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED + ) sharding_pipes.append(dp) - dp = dp.sharding_filter(sharding_group_filter=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING) + dp = dp.sharding_filter( + sharding_group_filter=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING + ) sharding_pipes.append(dp) dp = dp.sharding_filter(sharding_group_filter=300) sharding_pipes.append(dp) @@ -2870,8 +3434,12 @@ def construct_sharded_pipe(): dp, sharding_pipes = construct_sharded_pipe() for pipe in sharding_pipes: - pipe.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED) - pipe.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING) + pipe.apply_sharding( + 2, 1, 
sharding_group=LEGACY_SHARDING_PRIORITIES.DISTRIBUTED + ) + pipe.apply_sharding( + 5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING + ) pipe.apply_sharding(3, 1, sharding_group=300) actual = list(dp) @@ -2882,10 +3450,14 @@ def construct_sharded_pipe(): dp, _ = construct_sharded_pipe() dp.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DEFAULT) with self.assertRaises(Exception): - dp.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING) + dp.apply_sharding( + 5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING + ) dp, _ = construct_sharded_pipe() - dp.apply_sharding(5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING) + dp.apply_sharding( + 5, 3, sharding_group=LEGACY_SHARDING_PRIORITIES.MULTIPROCESSING + ) with self.assertRaises(Exception): dp.apply_sharding(2, 1, sharding_group=LEGACY_SHARDING_PRIORITIES.DEFAULT) @@ -2943,7 +3515,9 @@ def test_multi_sharding(self): numbers_dp = dp.iter.IterableWrapper(range(13)) sharded_dp = numbers_dp.sharding_filter() sharded_dp = sharded_dp.sharding_filter() - with self.assertRaisesRegex(RuntimeError, "Sharding twice on a single pipeline"): + with self.assertRaisesRegex( + RuntimeError, "Sharding twice on a single pipeline" + ): torch.utils.data.graph_settings.apply_sharding(sharded_dp, 3, 0) # Raises Error when sharding on both data source and branch @@ -2951,7 +3525,9 @@ def test_multi_sharding(self): dp1, dp2 = numbers_dp.fork(2) sharded_dp = dp1.sharding_filter() zip_dp = dp2.zip(sharded_dp) - with self.assertRaisesRegex(RuntimeError, "Sharding twice on a single pipeline"): + with self.assertRaisesRegex( + RuntimeError, "Sharding twice on a single pipeline" + ): torch.utils.data.graph_settings.apply_sharding(zip_dp, 3, 0) # Raises Error when multiple sharding on the branch and end @@ -2959,7 +3535,9 @@ def test_multi_sharding(self): dp1, dp2 = numbers_dp.fork(2) sharded_dp = dp1.sharding_filter() zip_dp = dp2.zip(sharded_dp).sharding_filter() - with self.assertRaisesRegex(RuntimeError, "Sharding twice on a single pipeline"): + with self.assertRaisesRegex( + RuntimeError, "Sharding twice on a single pipeline" + ): torch.utils.data.graph_settings.apply_sharding(zip_dp, 3, 0) # Single sharding_filter on data source @@ -2980,7 +3558,6 @@ def test_multi_sharding(self): class TestIterDataPipeSingletonConstraint(TestCase): - r""" Each `IterDataPipe` can only have one active iterator. Whenever a new iterator is created, older iterators are invalidated. These tests aim to ensure `IterDataPipe` follows this behavior. @@ -2994,7 +3571,9 @@ def _check_single_iterator_invalidation_logic(self, source_dp: IterDataPipe): it1 = iter(source_dp) self.assertEqual(list(range(10)), list(it1)) it1 = iter(source_dp) - self.assertEqual(list(range(10)), list(it1)) # A fresh iterator can be read in full again + self.assertEqual( + list(range(10)), list(it1) + ) # A fresh iterator can be read in full again it1 = iter(source_dp) self.assertEqual(0, next(it1)) it2 = iter(source_dp) # This should invalidate `it1` @@ -3002,7 +3581,6 @@ def _check_single_iterator_invalidation_logic(self, source_dp: IterDataPipe): with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): next(it1) - def test_iterdatapipe_singleton_generator(self): r""" Testing for the case where IterDataPipe's `__iter__` is a generator function. 
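# The singleton tests in the hunks around here all exercise one rule: an
# `IterDataPipe` has at most one active iterator, and creating a new one
# invalidates the old. A minimal standalone sketch of that rule (not part of
# the patch; it assumes only the public datapipes API already used in this
# file):
import torch.utils.data.datapipes as dp

source = dp.iter.IterableWrapper(range(10))
it1 = iter(source)
assert next(it1) == 0
it2 = iter(source)  # creating a second iterator invalidates `it1`
assert next(it2) == 0  # `it2` restarts from the beginning of the pipe
try:
    next(it1)  # the stale iterator now raises
except RuntimeError as e:
    assert "invalidated" in str(e)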
@@ -3030,6 +3608,7 @@ def test_iterdatapipe_singleton_self_next(self): Testing for the case where IterDataPipe's `__iter__` returns `self` and there is a `__next__` method. Note that the following DataPipe is singleton by default (because `__iter__` returns `self`). """ + class _CustomIterDP_Self(IterDataPipe): def __init__(self, iterable): self.source = iterable @@ -3054,12 +3633,18 @@ def reset(self): # Functional Test: Check if invalidation logic is correct source_dp = _CustomIterDP_Self(range(10)) self._check_single_iterator_invalidation_logic(source_dp) - self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + self.assertEqual( + 1, next(source_dp) + ) # `source_dp` is still valid and can be read # Functional Test: extend the test to a pipeline - source_dp = _CustomIterDP_Self(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + source_dp = _CustomIterDP_Self( + dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + ) self._check_single_iterator_invalidation_logic(source_dp) - self.assertEqual(1, next(source_dp)) # `source_dp` is still valid and can be read + self.assertEqual( + 1, next(source_dp) + ) # `source_dp` is still valid and can be read # Functional Test: multiple simultaneous references to the same DataPipe fails with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): @@ -3071,6 +3656,7 @@ def test_iterdatapipe_singleton_new_object(self): Testing for the case where IterDataPipe's `__iter__` isn't a generator nor returns `self`, and there isn't a `__next__` method. """ + class _CustomIterDP(IterDataPipe): def __init__(self, iterable): self.iterable = iter(iterable) @@ -3088,7 +3674,9 @@ def __iter__(self): # Note that this doesn't reset next(it1) # Functional Test: extend the test to a pipeline - source_dp = _CustomIterDP(dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn)) + source_dp = _CustomIterDP( + dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + ) it1 = iter(source_dp) self.assertEqual(0, next(it1)) it2 = iter(source_dp) @@ -3106,6 +3694,7 @@ def test_iterdatapipe_singleton_buggy(self): Buggy test case where IterDataPipe's `__iter__` returns a new object, but also has a `__next__` method.
""" + class _CustomIterDP(IterDataPipe): def __init__(self, iterable): self.source = iterable @@ -3132,7 +3721,9 @@ def __next__(self): with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): next(it1) self.assertEqual(2, next(source_dp)) # not impacted by the creation of `it2` - self.assertEqual(list(range(10)), list(it2)) # `it2` still works because it is a new object + self.assertEqual( + list(range(10)), list(it2) + ) # `it2` still works because it is a new object def test_iterdatapipe_singleton_constraint_multiple_outputs(self): r""" @@ -3171,13 +3762,17 @@ def test_iterdatapipe_singleton_constraint_multiple_outputs(self): self.assertEqual(0, next(it1)) self.assertEqual(0, next(it2)) it3 = iter(source_dp) # note that a new iterator is created from `source_dp` - self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + self.assertEqual( + 0, next(it3) + ) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): next(it1) self.assertEqual(1, next(it3)) # Function Test: Extending test to pipeline - source_dp = dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + source_dp = ( + dp.iter.IterableWrapper(range(10)).map(_fake_fn).filter(_fake_filter_fn) + ) cdp1, cdp2 = source_dp.fork(num_instances=2) it1, it2 = iter(cdp1), iter(cdp2) self.assertEqual(list(range(10)), list(it1)) @@ -3198,15 +3793,16 @@ def test_iterdatapipe_singleton_constraint_multiple_outputs(self): self.assertEqual(0, next(it1)) self.assertEqual(0, next(it2)) it3 = iter(source_dp) # note that a new iterator is created from `source_dp` - self.assertEqual(0, next(it3)) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` + self.assertEqual( + 0, next(it3) + ) # `it3` should invalidate `it1` and `it2` since they both use `source_dp` with self.assertRaisesRegex(RuntimeError, "This iterator has been invalidated"): next(it1) self.assertEqual(1, next(it3)) -class TestIterDataPipeCountSampleYielded(TestCase): +class TestIterDataPipeCountSampleYielded(TestCase): def _yield_count_test_helper(self, datapipe, n_expected_samples): - # Functional Test: Check if number of samples yielded is as expected res = list(datapipe) self.assertEqual(len(res), datapipe._number_of_samples_yielded) @@ -3243,13 +3839,17 @@ def __iter__(self): # Functional Test: Ensure the count is correct even when exception is raised datapipe: IterDataPipe = _CustomGeneratorFnDataPipe() - with self.assertRaisesRegex(RuntimeError, "Custom test error after yielding 3 elements"): + with self.assertRaisesRegex( + RuntimeError, "Custom test error after yielding 3 elements" + ): list(datapipe) self.assertEqual(3, datapipe._number_of_samples_yielded) # Functional Test: Check for reset behavior and if iterator also works it = iter(datapipe) # reset the DataPipe - with self.assertRaisesRegex(RuntimeError, "Custom test error after yielding 3 elements"): + with self.assertRaisesRegex( + RuntimeError, "Custom test error after yielding 3 elements" + ): list(it) self.assertEqual(3, datapipe._number_of_samples_yielded) @@ -3308,13 +3908,17 @@ def reset(self): # Functional Test: Ensure the count is correct even when exception is raised datapipe: IterDataPipe = _CustomNextDataPipe() - with self.assertRaisesRegex(RuntimeError, "Custom test error after yielding 3 elements"): + with self.assertRaisesRegex( + RuntimeError, "Custom test error after yielding 3 elements" + ): 
list(datapipe) self.assertEqual(3, datapipe._number_of_samples_yielded) # Functional Test: Check for reset behavior and if iterator also works it = iter(datapipe) # reset the DataPipe - with self.assertRaisesRegex(RuntimeError, "Custom test error after yielding 3 elements"): + with self.assertRaisesRegex( + RuntimeError, "Custom test error after yielding 3 elements" + ): list(it) self.assertEqual(3, datapipe._number_of_samples_yielded) @@ -3351,8 +3955,9 @@ def __len__(self): class TestIterDataPipeGraphFastForward(TestCase): - - def _fast_forward_graph_test_helper(self, datapipe, fast_forward_fn, expected_res, n_iterations=3, rng=None): + def _fast_forward_graph_test_helper( + self, datapipe, fast_forward_fn, expected_res, n_iterations=3, rng=None + ): if rng is None: rng = torch.Generator() rng = rng.manual_seed(0) @@ -3378,32 +3983,39 @@ def _fast_forward_graph_test_helper(self, datapipe, fast_forward_fn, expected_re def test_simple_snapshot_graph(self): graph1 = dp.iter.IterableWrapper(range(10)) res1 = list(range(10)) - self._fast_forward_graph_test_helper(graph1, _simple_graph_snapshot_restoration, - expected_res=res1) + self._fast_forward_graph_test_helper( + graph1, _simple_graph_snapshot_restoration, expected_res=res1 + ) graph2 = graph1.map(_mul_10) res2 = [10 * x for x in res1] - self._fast_forward_graph_test_helper(graph2, _simple_graph_snapshot_restoration, - expected_res=res2) + self._fast_forward_graph_test_helper( + graph2, _simple_graph_snapshot_restoration, expected_res=res2 + ) rng = torch.Generator() graph3 = graph2.shuffle() rng.manual_seed(0) torch.utils.data.graph_settings.apply_random_seed(graph3, rng) res3 = list(graph3) - self._fast_forward_graph_test_helper(graph3, _simple_graph_snapshot_restoration, - expected_res=res3) + self._fast_forward_graph_test_helper( + graph3, _simple_graph_snapshot_restoration, expected_res=res3 + ) graph4 = graph3.map(_mul_10) res4 = [10 * x for x in res3] - self._fast_forward_graph_test_helper(graph4, _simple_graph_snapshot_restoration, - expected_res=res4) + self._fast_forward_graph_test_helper( + graph4, _simple_graph_snapshot_restoration, expected_res=res4 + ) batch_size = 2 graph5 = graph4.batch(batch_size) - res5 = [res4[i:i + batch_size] for i in range(0, len(res4), batch_size)] # .batch(2) - self._fast_forward_graph_test_helper(graph5, _simple_graph_snapshot_restoration, - expected_res=res5) + res5 = [ + res4[i : i + batch_size] for i in range(0, len(res4), batch_size) + ] # .batch(2) + self._fast_forward_graph_test_helper( + graph5, _simple_graph_snapshot_restoration, expected_res=res5 + ) # With `fork` and `zip` cdp1, cdp2 = graph5.fork(2) @@ -3411,27 +4023,35 @@ def test_simple_snapshot_graph(self): rng = rng.manual_seed(100) torch.utils.data.graph_settings.apply_random_seed(graph6, rng) res6 = [(x, x) for x in res5] - self._fast_forward_graph_test_helper(graph6, _simple_graph_snapshot_restoration, - expected_res=res6) + self._fast_forward_graph_test_helper( + graph6, _simple_graph_snapshot_restoration, expected_res=res6 + ) # With `fork` and `concat` graph7 = cdp1.concat(cdp2) res7 = res5 * 2 - self._fast_forward_graph_test_helper(graph7, _simple_graph_snapshot_restoration, - expected_res=res7) + self._fast_forward_graph_test_helper( + graph7, _simple_graph_snapshot_restoration, expected_res=res7 + ) # Raises an exception if the graph has already been restored - with self.assertRaisesRegex(RuntimeError, "Snapshot restoration cannot be applied."): + with self.assertRaisesRegex( + RuntimeError, "Snapshot restoration cannot be 
applied." + ): _simple_graph_snapshot_restoration(graph7, 1) _simple_graph_snapshot_restoration(graph7, 1) def test_simple_snapshot_custom_non_generator(self): graph = _CustomNonGeneratorTestDataPipe() - self._fast_forward_graph_test_helper(graph, _simple_graph_snapshot_restoration, expected_res=range(10)) + self._fast_forward_graph_test_helper( + graph, _simple_graph_snapshot_restoration, expected_res=range(10) + ) def test_simple_snapshot_custom_self_next(self): graph = _CustomSelfNextTestDataPipe() - self._fast_forward_graph_test_helper(graph, _simple_graph_snapshot_restoration, expected_res=range(10)) + self._fast_forward_graph_test_helper( + graph, _simple_graph_snapshot_restoration, expected_res=range(10) + ) def _snapshot_test_helper(self, datapipe, expected_res, n_iter=3, rng=None): """ @@ -3451,7 +4071,9 @@ def _snapshot_test_helper(self, datapipe, expected_res, n_iter=3, rng=None): rng_for_deserialized = torch.Generator() rng_for_deserialized.manual_seed(0) - _simple_graph_snapshot_restoration(deserialized_graph, n_iter, rng=rng_for_deserialized) + _simple_graph_snapshot_restoration( + deserialized_graph, n_iter, rng=rng_for_deserialized + ) self.assertEqual(expected_res[n_iter:], list(it)) self.assertEqual(expected_res[n_iter:], list(deserialized_graph)) @@ -3477,7 +4099,9 @@ def test_simple_snapshot_graph_with_serialization(self): batch_size = 2 graph5 = graph4.batch(batch_size) - res5 = [res4[i:i + batch_size] for i in range(0, len(res4), batch_size)] # .batch(2) + res5 = [ + res4[i : i + batch_size] for i in range(0, len(res4), batch_size) + ] # .batch(2) self._snapshot_test_helper(graph5, expected_res=res5) # With `fork` and `zip` @@ -3492,7 +4116,14 @@ def test_simple_snapshot_graph_with_serialization(self): self._snapshot_test_helper(graph7, expected_res=res7) def test_simple_snapshot_graph_repeated(self): - cdp1, cdp2 = dp.iter.IterableWrapper(range(10)).map(_mul_10).shuffle().map(_mul_10).map(_mul_10).fork(2) + cdp1, cdp2 = ( + dp.iter.IterableWrapper(range(10)) + .map(_mul_10) + .shuffle() + .map(_mul_10) + .map(_mul_10) + .fork(2) + ) graph = cdp1.zip(cdp2) rng = torch.Generator() @@ -3515,8 +4146,11 @@ def test_simple_snapshot_graph_repeated(self): rng_for_deserialized = torch.Generator() rng_for_deserialized.manual_seed(0) - _simple_graph_snapshot_restoration(deserialized_graph, deserialized_graph._number_of_samples_yielded, - rng=rng_for_deserialized) + _simple_graph_snapshot_restoration( + deserialized_graph, + deserialized_graph._number_of_samples_yielded, + rng=rng_for_deserialized, + ) it = iter(deserialized_graph) # Get the next element and ensure it is as expected @@ -3528,12 +4162,15 @@ def test_simple_snapshot_graph_repeated(self): rng_for_deserialized = torch.Generator() rng_for_deserialized.manual_seed(0) - _simple_graph_snapshot_restoration(deserialized_graph2, deserialized_graph._number_of_samples_yielded, - rng=rng_for_deserialized) + _simple_graph_snapshot_restoration( + deserialized_graph2, + deserialized_graph._number_of_samples_yielded, + rng=rng_for_deserialized, + ) # Get the next element and ensure it is as expected self.assertEqual(expected_res[4:], list(deserialized_graph2)) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_fake_tensor.py b/test/test_fake_tensor.py index 079d2211a6e8..7456feb45d82 100644 --- a/test/test_fake_tensor.py +++ b/test/test_fake_tensor.py @@ -1,76 +1,93 @@ # Owner(s): ["module: meta tensors"] -from torch.testing._internal.common_utils import ( - TestCase, TEST_WITH_TORCHDYNAMO, 
run_tests, skipIfCrossRef, skipIfRocm, skipIfTorchDynamo, parametrize, - instantiate_parametrized_tests, TemporaryFileName) -import torch -import torch._dynamo -from torch._dynamo.testing import make_test_cls_with_patches +import contextlib +import copy +import dataclasses +import inspect import itertools +import pickle +import unittest +import weakref +from unittest.mock import patch + import numpy as np -from torch.testing._internal.jit_utils import RUN_CUDA +import torch +import torch._dynamo +import torch._functorch.config +import torch._prims as prims +import torch.testing._internal.optests as optests +import torch.utils._pytree as pytree + +from torch import distributed as dist +from torch._C._functorch import _add_batch_dim, get_unwrapped, is_batchedtensor +from torch._dynamo.testing import make_test_cls_with_patches, rand_strided from torch._guards import tracing, TracingContext from torch._subclasses.fake_tensor import ( + DynamicOutputShapeException, extract_tensor_metadata, FakeTensor, - FakeTensorMode, FakeTensorConverter, - DynamicOutputShapeException, - UnsupportedOperatorException, + FakeTensorMode, unset_fake_temporarily, + UnsupportedOperatorException, ) +from torch.fx.experimental.proxy_tensor import make_fx from torch.fx.experimental.symbolic_shapes import ( - ShapeEnv, DimDynamic, free_symbols, StatelessSymbolicContext, ShapeEnvSettings, statically_known_true + DimDynamic, + free_symbols, + ShapeEnv, + ShapeEnvSettings, + StatelessSymbolicContext, + statically_known_true, ) -from torch.testing._internal.custom_op_db import custom_op_db -from torch.testing._internal.common_device_type import ops -from torch.testing._internal.common_device_type import instantiate_device_type_tests, OpDTypes -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION from torch.fx.passes.fake_tensor_prop import FakeTensorProp -from torch._dynamo.testing import rand_strided -from torch._C._functorch import is_batchedtensor, _add_batch_dim, get_unwrapped from torch.testing import FileCheck -import dataclasses -import inspect -import unittest -import torch._prims as prims -import contextlib -import weakref -import copy -import pickle -import torch._functorch.config -import torch.testing._internal.optests as optests -from unittest.mock import patch - -from torch import distributed as dist +from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION +from torch.testing._internal.common_device_type import ( + instantiate_device_type_tests, + OpDTypes, + ops, +) +from torch.testing._internal.common_utils import ( + instantiate_parametrized_tests, + parametrize, + run_tests, + skipIfCrossRef, + skipIfRocm, + skipIfTorchDynamo, + TemporaryFileName, + TEST_WITH_TORCHDYNAMO, + TestCase, +) +from torch.testing._internal.custom_op_db import custom_op_db +from torch.testing._internal.jit_utils import RUN_CUDA from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import TorchDispatchMode -import torch.utils._pytree as pytree -from torch.fx.experimental.proxy_tensor import make_fx aten = torch.ops.aten torch._dynamo.config.fake_tensor_cache_enabled = True torch._dynamo.config.fake_tensor_cache_crosscheck_enabled = True + def expectedFailurePropagateRealTensors(fn): fn._expected_failure_propagate_real_tensors = True return fn + class FakeTensorTest(TestCase): def checkType(self, t, device_str, size): self.assertTrue(isinstance(t, FakeTensor)) self.assertEqual(t.device.type, device_str) self.assertEqual(list(t.size()), size) - 
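# For orientation, `checkType` above encodes the core FakeTensor contract:
# tensors created under the mode are `FakeTensor`s that carry device and shape
# metadata but allocate no real storage. A minimal sketch of that contract
# (not part of the patch; it assumes only APIs already imported in this file):
import torch
from torch._subclasses.fake_tensor import FakeTensor, FakeTensorMode

t = torch.randn(3)  # a real tensor, created outside the mode
with FakeTensorMode() as mode:
    x = torch.empty(2, 2, device="cpu")  # allocates no real storage
    assert isinstance(x, FakeTensor)
    assert x.device.type == "cpu" and list(x.size()) == [2, 2]
    y = mode.from_tensor(t)  # wrap an existing real tensor into the mode
    assert isinstance(y, FakeTensor) and y.shape == (3,)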
@unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_initialized(self): # doesnt error with FakeTensorMode(): - p = torch.randn(4, 2, requires_grad=True, device='cuda') - x = torch.randn(8, 4, device='cuda') + p = torch.randn(4, 2, requires_grad=True, device="cuda") + x = torch.randn(8, 4, device="cuda") y = torch.mm(x, p).square().sum() y.backward() @@ -86,18 +103,20 @@ def test_basic(self): self.assertTrue(isinstance(z, FakeTensor)) def test_custom_op_fallback(self): - from torch.library import Library, impl + from torch.library import impl, Library try: test_lib = Library("my_test_op", "DEF") # noqa: TOR901 - test_lib.define('foo(Tensor self) -> Tensor') + test_lib.define("foo(Tensor self) -> Tensor") - @impl(test_lib, 'foo', 'CPU') + @impl(test_lib, "foo", "CPU") def foo_impl(self): return self.cos() x = torch.empty(2, 2, device="cpu") - with self.assertRaisesRegex(UnsupportedOperatorException, "my_test_op.foo.default"): + with self.assertRaisesRegex( + UnsupportedOperatorException, "my_test_op.foo.default" + ): with FakeTensorMode(allow_fallback_kernels=True) as mode: x = mode.from_tensor(x) torch.ops.my_test_op.foo(x) @@ -114,6 +133,7 @@ def test_parameter_instantiation(self): @unittest.skipIf(not dist.is_available(), "requires distributed") def test_fsdp_flat_param(self): from torch.distributed.fsdp._flat_param import FlatParameter + with FakeTensorMode() as m: data = torch.randn(2, 2) param = FlatParameter(data, requires_grad=True) @@ -127,11 +147,13 @@ def test_non_parameter_grad(self): fake_t = mode.from_tensor(t) self.assertEqual(fake_t.requires_grad, t.requires_grad) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_index_cuda_with_cpu(self): with FakeTensorMode(): - x = torch.rand([2048], device='cuda') + x = torch.rand([2048], device="cuda") out = x[torch.zeros([36], dtype=torch.int64)] self.checkType(out, "cuda", [36]) @@ -148,14 +170,14 @@ def test_shape_take_not_device(self): def test_repr(self): with FakeTensorMode(): x = torch.empty(2, 2, device="cpu") - self.assertEqual(repr(x), 'FakeTensor(..., size=(2, 2))') + self.assertEqual(repr(x), "FakeTensor(..., size=(2, 2))") x = torch.empty(2, 2, device="meta") self.assertEqual(repr(x), "FakeTensor(..., device='meta', size=(2, 2))") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_zero_dim(self): with FakeTensorMode() as mode: - x = torch.tensor(0.) + x = torch.tensor(0.0) y = torch.rand([4, 4], device="cuda") out = x + y self.assertEqual(out.shape, (4, 4)) @@ -173,7 +195,7 @@ def test_nan_to_num(self): @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_throw(self): - x = torch.tensor(0.) 
# TODO: tensor() errors + x = torch.tensor(0.0) # TODO: tensor() errors with FakeTensorMode() as mode: x_conv = mode.from_tensor(x) y = torch.rand([4, 4], device="cuda") @@ -207,17 +229,25 @@ def test_device_inplace_copy(self): def test_fake_dispatch_keys(self): with FakeTensorMode(): x = torch.rand([4]) - f = FileCheck().check("CPU").check("ADInplaceOrView").check("AutogradCPU").check("AutocastCPU") + f = ( + FileCheck() + .check("CPU") + .check("ADInplaceOrView") + .check("AutogradCPU") + .check("AutocastCPU") + ) f.run(torch._C._dispatch_key_set(x)) with torch.inference_mode(): x = torch.rand([4]) y = x + x - FileCheck().check("CPU").check("AutocastCPU").run(torch._C._dispatch_key_set(y)) - FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run(torch._C._dispatch_key_set(y)) + FileCheck().check("CPU").check("AutocastCPU").run( + torch._C._dispatch_key_set(y) + ) + FileCheck().check_not("ADInplaceOrView").check_not("Autograd").run( + torch._C._dispatch_key_set(y) + ) - # TODO: functorch support for propagate real tensors - @expectedFailurePropagateRealTensors def test_batch_tensor(self): x = torch.rand((3, 4, 5)) b = _add_batch_dim(x, 0, 0) @@ -289,7 +319,9 @@ def test_fake_mode_error(self): with FakeTensorMode(): y = x[0] - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_fake_grad_copy(self): x = torch.rand([4, 4], requires_grad=True) x.grad = torch.rand([4, 4]) @@ -306,7 +338,7 @@ def test_index_put_error(self): for context in [contextlib.nullcontext, lambda: mode]: with context(): y = torch.randn(2, 2, 3) - x = torch.randn(2, 2, 3).to('cuda') + x = torch.randn(2, 2, 3).to("cuda") with self.assertRaises(RuntimeError): x[[1, 1]] = y @@ -314,10 +346,12 @@ def test_index_put_error(self): torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), y) # no error - torch.ops.aten.index_put(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - torch.ops.aten.index_put_(x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.)) - - + torch.ops.aten.index_put( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) + torch.ops.aten.index_put_( + x, torch.tensor([1, 1], device="cuda"), torch.tensor(5.0) + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_like_constructor(self): @@ -338,7 +372,9 @@ def test_binary_op_type_promotion(self): self.assertEqual(out.dtype, torch.float) self.assertEqual(out.device.type, "cpu") - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_from_numpy(self): with FakeTensorMode(): x = torch.tensor(np.zeros([4, 4])) @@ -366,9 +402,15 @@ def test_upsample_bilinear_small_channels(self): mode = FakeTensorMode() for i, context in enumerate([contextlib.nullcontext, lambda: mode]): with context(): - arg0_1 = torch.empty_strided((3, 427, 640), (1, 1920, 3), dtype=torch.float32, device='cuda') + arg0_1 = torch.empty_strided( + (3, 427, 640), (1, 1920, 3), dtype=torch.float32, device="cuda" + ) unsqueeze = torch.ops.aten.unsqueeze.default(arg0_1, 0) - out.append(torch.ops.aten.upsample_bilinear2d.default(unsqueeze, [800, 1199], False)) + out.append( + torch.ops.aten.upsample_bilinear2d.default( + unsqueeze, [800, 1199], False + ) + ) self.assertTrue(out[1].is_contiguous()) self.checkMetaProps(out[0], out[1]) @@ 
-409,8 +451,9 @@ def test_out_multi_device(self): with self.assertRaisesRegex(Exception, "found.+two.+devices"): x.add_(y) - - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_normalize_device(self): with FakeTensorMode(): @@ -427,10 +470,15 @@ def test_recursive_invocation(self): y = x + x self.assertTrue(mode.in_kernel_invocation) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @skipIfRocm - @parametrize("allow_fallback_kernels", [False, True], - lambda a: 'with_fallback' if a else 'without_fallback') + @parametrize( + "allow_fallback_kernels", + [False, True], + lambda a: "with_fallback" if a else "without_fallback", + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cudnn_rnn(self, allow_fallback_kernels): def fn( @@ -526,7 +574,7 @@ def fn( for ten in out: if i == 1: self.assertTrue(isinstance(ten, FakeTensor)) - self.assertEqual(ten.device.type, 'cuda') + self.assertEqual(ten.device.type, "cuda") @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_cuda_lstm(self): @@ -544,13 +592,20 @@ def test_cuda_lstm(self): D = 2 if bidir else 1 H_out = proj_size if proj_size > 0 else hidden_size - lstm = torch.nn.LSTM(input_size=H_in, hidden_size=hidden_size, - num_layers=num_layers, proj_size=proj_size, batch_first=False, - bias=True, bidirectional=bidir, device='cuda') + lstm = torch.nn.LSTM( + input_size=H_in, + hidden_size=hidden_size, + num_layers=num_layers, + proj_size=proj_size, + batch_first=False, + bias=True, + bidirectional=bidir, + device="cuda", + ) - h_0 = torch.randn((num_layers * D, N, H_out), device='cuda') - c_0 = torch.randn((num_layers * D, N, hidden_size), device='cuda') - inp = torch.randn((L, N, H_in), device='cuda') + h_0 = torch.randn((num_layers * D, N, H_out), device="cuda") + c_0 = torch.randn((num_layers * D, N, hidden_size), device="cuda") + inp = torch.randn((L, N, H_in), device="cuda") (output, (h_n, c_n)) = lstm(inp, (h_0, c_0)) output.sum().backward() @@ -578,9 +633,8 @@ def test_same_shape_env_preserved(self): t1 = mode1.from_tensor( torch.randn(10), symbolic_context=StatelessSymbolicContext( - dynamic_sizes=[DimDynamic.DYNAMIC], - constraint_sizes=[None] - ) + dynamic_sizes=[DimDynamic.DYNAMIC], constraint_sizes=[None] + ), ) mode2 = FakeTensorMode(shape_env=shape_env) t2 = mode2.from_tensor(t1) @@ -630,11 +684,16 @@ def test_deepcopy(self): mod_copied = copy.deepcopy(mod) def check_copy(mod, mod_copied): - for name, param in itertools.chain(mod.named_parameters(), mod.named_buffers()): + for name, param in itertools.chain( + mod.named_parameters(), mod.named_buffers() + ): param_copied = getattr(mod_copied, name) self.checkMetaProps(param, param_copied) self.assertTrue(isinstance(param_copied, FakeTensor)) - self.assertEqual(isinstance(param, torch.nn.Parameter), isinstance(param_copied, torch.nn.Parameter)) + self.assertEqual( + isinstance(param, torch.nn.Parameter), + isinstance(param_copied, torch.nn.Parameter), + ) self.assertEqual(param.requires_grad, param_copied.requires_grad) check_copy(mod, mod_copied) @@ -653,18 +712,22 @@ def __init__(self): self.assertIs(mod_copied.a, mod_copied.b) self.assertEqual(mod_copied.b.storage()._cdata, mod_copied.a.storage()._cdata) - 
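# The device tests in this stretch follow one rule: fake tensors compute the
# device an op would produce without touching the hardware, so "cuda" shapes
# can be checked even without initializing CUDA, while mixing two distinct
# devices raises. A small sketch mirroring `test_zero_dim` earlier in this
# file (not part of the patch):
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    scalar = torch.tensor(0.0)  # zero-dim CPU tensor
    y = torch.rand(4, 4, device="cuda")  # no CUDA initialization happens
    out = scalar + y  # zero-dim CPU operands do not pin the result device
    assert out.device.type == "cuda" and out.shape == (4, 4)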
@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_new(self): with FakeTensorMode(): a = torch.rand([16, 1]) self.checkType(a.new(10, 10), "cpu", [10, 10]) self.checkType(a.new([1, 2, 3, 4]), "cpu", [4]) - b = torch.rand([4, 4], device='cuda') - self.checkType(b.new(device='cuda'), "cuda", [0]) + b = torch.rand([4, 4], device="cuda") + self.checkType(b.new(device="cuda"), "cuda", [0]) self.checkType(a.new(torch.rand([1])), "cpu", [1]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_scalar_inputs(self): with FakeTensorMode(): self.checkType(torch.div(3, 2), "cpu", []) @@ -672,7 +735,9 @@ def test_scalar_inputs(self): self.assertEqual(ten.dtype, torch.float) self.checkType(ten, "cpu", [2]) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_allow_meta(self): def run_meta(): with FakeTensorMode(): @@ -688,7 +753,7 @@ def test_embedding_bag_meta(self): def f(): # This behavior was originally unintentional but we see people # relying on it - embedding = torch.nn.EmbeddingBag(10, 3, mode='sum', device='meta') + embedding = torch.nn.EmbeddingBag(10, 3, mode="sum", device="meta") input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], dtype=torch.long) offsets = torch.tensor([0, 4], dtype=torch.long) return embedding(input, offsets) @@ -701,7 +766,9 @@ def f(): self.assertEqual(r.size(), f.size()) self.assertEqual(r.device, f.device) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) def test_mixed_real_and_fake_inputs(self): class _TestPattern(torch.nn.Module): def __init__(self): @@ -730,7 +797,9 @@ def forward(self, input): out = mod(torch.randn(1, 1, 3, 3)) self.checkType(out, "cpu", (1, 1, 3, 3)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_copy_multi_device(self): with FakeTensorMode(): @@ -744,7 +813,9 @@ def test_aten_copy_multi_device(self): self.checkType(copy2, "cuda", (4,)) self.checkType(out, "cpu", (4,)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_index_multi_device(self): with FakeTensorMode(): @@ -768,7 +839,9 @@ def test_aten_index_multi_device(self): self.checkType(r3, "cpu", (4, 4)) self.checkType(r4, "cuda", (4, 4)) - @unittest.skipIf(TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile") + @unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "isinstance check for FakeTensor won't work with compile" + ) @unittest.skipIf(not RUN_CUDA, "requires cuda") def test_aten_slice_scatter_multi_device(self): with FakeTensorMode(): @@ -790,11 +863,11 
@@ def test__adaptive_avg_pool2d_backward(self): grad_out = torch.rand(2, 3, 4, 4) inp = torch.rand(2, 3, 4, 4).to(memory_format=torch.channels_last) grad_in = torch.ops.aten._adaptive_avg_pool2d_backward(grad_out, inp) - self.assertTrue(torch._prims_common.suggest_memory_format(grad_in) == torch.channels_last) + self.assertTrue( + torch._prims_common.suggest_memory_format(grad_in) + == torch.channels_last + ) - # Propagate real tensors doesn't work when original input arguments are - # fake - @expectedFailurePropagateRealTensors def test_export_numpy(self): class MyNumpyModel(torch.nn.Module): def forward(self, input): @@ -812,8 +885,12 @@ def test_unsqueeze_copy(self): t = fake_mode.from_tensor( t1, symbolic_context=StatelessSymbolicContext( - dynamic_sizes=[DimDynamic.DYNAMIC, DimDynamic.STATIC, DimDynamic.STATIC], - ) + dynamic_sizes=[ + DimDynamic.DYNAMIC, + DimDynamic.STATIC, + DimDynamic.STATIC, + ], + ), ) self.assertEqual(t.shape[0], torch.ops.aten.unsqueeze_copy(t, 1).shape[0]) @@ -866,20 +943,20 @@ def assertNotConst(self, *args): def test_simple(self): with FakeTensorMode(): - x = torch.tensor(4.) - self.assertEqual(x.item(), 4.) + x = torch.tensor(4.0) + self.assertEqual(x.item(), 4.0) def test_inplace_add(self): with FakeTensorMode(): - x = torch.tensor(4.) + x = torch.tensor(4.0) y = x.add_(1) - self.assertEqual(x.item(), 5.) - self.assertEqual(y.item(), 5.) + self.assertEqual(x.item(), 5.0) + self.assertEqual(y.item(), 5.0) self.assertConst(x, y) def test_shared_storages(self): with FakeTensorMode(): - x = torch.tensor([4.]) + x = torch.tensor([4.0]) y = x[:] self.assertEqual(x.storage()._cdata, y.storage()._cdata) @@ -887,7 +964,7 @@ def test_shared_storages(self): def test_constant_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) self.assertConst(x) y = torch.rand([1]) x.add_(y) @@ -902,13 +979,14 @@ def test_inplace_view_invalidation(self): self.assertNotConst(x) def test_fake_tensor_in_intlist_repro(self): - def fn(tensors): max_size = torch.tensor([800, 1216], dtype=torch.int64) batch_shape = [len(tensors)] + list(tensors[0].shape[:-2]) + list(max_size) return tensors[0].new_full(batch_shape, 0.0) - with self.assertRaises(torch._subclasses.fake_tensor.DataDependentOutputException): + with self.assertRaises( + torch._subclasses.fake_tensor.DataDependentOutputException + ): with torch._subclasses.fake_tensor.FakeTensorMode(): a = torch.randn(3, 800, 1199) b = torch.randn(3, 800, 800) @@ -926,7 +1004,7 @@ def test_fake_tensor_batch_norm_cpu(self): def test_shared_storage_invalidation(self): with FakeTensorMode(): - x = torch.tensor([1.]) + x = torch.tensor([1.0]) y = x[:] self.assertConst(x, y) y.add_(torch.rand([1])) @@ -942,7 +1020,7 @@ def test_aliased_const_write(self): def test_constant_propagate_through_functions(self): with FakeTensorMode(): - y = torch.div(4, 4, rounding_mode='trunc') + y = torch.div(4, 4, rounding_mode="trunc") self.assertConst(y) @@ -967,7 +1045,9 @@ def test_fake(self, device, dtype, op): make_propagate_real_tensors_cls(FakeTensorOpInfoTest) instantiate_device_type_tests(FakeTensorOpInfoTest, globals(), only_for=("cpu", "cuda")) -instantiate_device_type_tests(PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",)) # noqa: F821 +instantiate_device_type_tests( + PropagateRealTensorsFakeTensorOpInfoTest, globals(), only_for=("cpu",) # noqa: F821 +) class FakeTensorConverterTest(TestCase): @@ -980,7 +1060,10 @@ def test_memoized_conversion_from_meta(self): x = torch.rand(2, 
2).to(device="meta") mode = FakeTensorMode() converter = mode.fake_tensor_converter - self.assertTrue(converter.from_meta_and_device(mode, x, "cpu") is converter.from_meta_and_device(mode, x, "cpu")) + self.assertTrue( + converter.from_meta_and_device(mode, x, "cpu") + is converter.from_meta_and_device(mode, x, "cpu") + ) def test_separate_tensor_storages_view(self): x = torch.rand(2, 2, 2) @@ -1011,7 +1094,6 @@ def test_separate_tensor_storages_non_view(self): self.assertEqual(len(converter.tensor_memo), 0) self.assertEqual(len(converter.meta_converter.storage_memo), 0) - @skipIfTorchDynamo("https://github.com/pytorch/torchdynamo/issues/1991") def test_dead_weak_ref(self): x = torch.rand(2, 2, 2) @@ -1114,7 +1196,8 @@ def test_non_kwarg_only_device(self): ) if has_non_kwarg_device: self.assertTrue( - self.get_aten_op(schema) in torch._subclasses.fake_tensor._device_not_kwarg_ops + self.get_aten_op(schema) + in torch._subclasses.fake_tensor._device_not_kwarg_ops ) def test_tensor_constructors_all_have_kwarg_device(self): @@ -1153,24 +1236,35 @@ def test_like_ops(self): for schema in self.get_all_aten_schemas(): if "_like" == schema.name[-5:]: op = self.get_aten_op(schema) - self.assertIn(op, torch._subclasses.fake_tensor._like_tensor_constructors) + self.assertIn( + op, torch._subclasses.fake_tensor._like_tensor_constructors + ) def test_str_storage(self): x = torch.zeros(3) with FakeTensorMode() as m: y = m.from_tensor(x) - self.assertExpectedInline(str(x.storage()), '''\ + self.assertExpectedInline( + str(x.storage()), + """\ 0.0 0.0 0.0 -[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]''') - self.assertExpectedInline(str(y.storage()), '''\ +[torch.storage.TypedStorage(dtype=torch.float32, device=cpu) of size 3]""", + ) + self.assertExpectedInline( + str(y.storage()), + """\ ... -[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) - self.assertExpectedInline(str(y.storage()), '''\ + self.assertExpectedInline( + str(y.storage()), + """\ ... 
-[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]''') +[torch.storage.TypedStorage(dtype=torch.float32, device=meta) of size 3]""", + ) # at::_embedding_bag has no op info, # and returns extra tensors that at::embedding bag throws away @@ -1185,7 +1279,9 @@ def test_embedding_bag_private(self): ref_out = torch.ops.aten._embedding_bag(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] meta_out = torch.ops.aten._embedding_bag(*meta_args) self.assertEqual(len(ref_out), len(meta_out)) @@ -1201,20 +1297,29 @@ def test_cross_entropy_loss(self): args = (inp, target, w) ref = fn(*args) with FakeTensorMode() as m: - meta_args = [m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args] - meta_out = torch.nn.functional.cross_entropy(*meta_args, label_smoothing=0.5) + meta_args = [ + m.from_tensor(a) if isinstance(a, torch.Tensor) else a for a in args + ] + meta_out = torch.nn.functional.cross_entropy( + *meta_args, label_smoothing=0.5 + ) self.assertEqual(ref.size(), meta_out.size()) @skipIfRocm - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Does not support SDPA or pre-SM80 hardware") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION, + "Does not support SDPA or pre-SM80 hardware", + ) def test_flash_attention(self): class Repro(torch.nn.Module): def __init__(self): super().__init__() def forward(self, arg1, arg2, arg3): - torch.ops.aten._scaled_dot_product_flash_attention(arg1, arg2, arg3, scale=0.17677669529663687) + torch.ops.aten._scaled_dot_product_flash_attention( + arg1, arg2, arg3, scale=0.17677669529663687 + ) args_new = [ [ @@ -1226,11 +1331,13 @@ def forward(self, arg1, arg2, arg3): ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), ((4, 2, 16, 32), (1024, 512, 32, 1), torch.float16, "cuda"), - ] + ], ] for args_list in args_new: - args = [rand_strided(bsz, num_heads, seq_len, head_dim) for - (bsz, num_heads, seq_len, head_dim) in args_list] + args = [ + rand_strided(bsz, num_heads, seq_len, head_dim) + for (bsz, num_heads, seq_len, head_dim) in args_list + ] try: with torch._subclasses.CrossRefFakeMode(): Repro()(*args) @@ -1238,7 +1345,10 @@ def forward(self, arg1, arg2, arg3): # We expect the cross ref to succed for the first output to fail # for the rng state, see Note [Seed and Offset] self.assertTrue("output[0]" not in str(e)) - self.assertTrue("found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" in str(e)) + self.assertTrue( + "found mismatched tensor metadata for output[6]: Devices cpu and cuda:0 are not equal!" + in str(e) + ) # IMPORTANT!!! 
Always run even if CUDA is not available def test_fake_cuda_no_init(self): @@ -1247,12 +1357,12 @@ def test_fake_cuda_no_init(self): if torch._functorch.config.fake_tensor_propagate_real_tensors: return with FakeTensorMode(): - torch.empty(10, device='cuda') - torch.ones(10, device='cuda') - torch.zeros(10, device='cuda') - torch.rand(10, device='cuda') - torch.tensor(3.14, device='cuda') - torch.tensor([[3.14, 2], [1, 2]], device='cuda') + torch.empty(10, device="cuda") + torch.ones(10, device="cuda") + torch.zeros(10, device="cuda") + torch.rand(10, device="cuda") + torch.tensor(3.14, device="cuda") + torch.tensor([[3.14, 2], [1, 2]], device="cuda") @skipIfRocm @unittest.skipIf(not RUN_CUDA, "requires cuda") @@ -1364,8 +1474,6 @@ def to_fake_tensor(x): failed = True self.assertTrue(failed) - - @expectedFailurePropagateRealTensors # Propagate real tensors doesn't work with fake-on-fake def test_fake_tensor_prop_on_nn_module_with_optional_args(self): class OptionalArgumentInBetween(torch.nn.Module): def __init__(self): @@ -1384,16 +1492,20 @@ def forward(self, value, another_value=None, another_optional_value=None): value = value + another_value + another_optional_value return value * value - fake_mode = FakeTensorMode(allow_non_fake_inputs=True, allow_fallback_kernels=False) + fake_mode = FakeTensorMode( + allow_non_fake_inputs=True, allow_fallback_kernels=False + ) with fake_mode: model = OptionalArgumentInBetween() value = torch.randn(5, 4) another_optional_value = torch.randn(5, 4) - graph_model = torch.fx.symbolic_trace(model, (value, None, another_optional_value)) - FakeTensorProp(graph_model, fake_mode).propagate(value, None, another_optional_value) - + graph_model = torch.fx.symbolic_trace( + model, (value, None, another_optional_value) + ) + FakeTensorProp(graph_model, fake_mode).propagate( + value, None, another_optional_value + ) - @expectedFailurePropagateRealTensors # TODO: not sure about this one, kinda strange def test_unbacked_shape_realloc(self): def f(x): return x.nonzero() @@ -1403,12 +1515,14 @@ def f(x): with fake_mode: value = torch.randn(5) gm = make_fx(f)(value) - nonzero_nodes = [n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default] + nonzero_nodes = [ + n for n in gm.graph.nodes if n.target is torch.ops.aten.nonzero.default + ] self.assertEqual(len(nonzero_nodes), 1) - self.assertIsInstance(nonzero_nodes[0].meta['val'].shape[0], torch.SymInt) - u0 = nonzero_nodes[0].meta['val'].shape[0] + self.assertIsInstance(nonzero_nodes[0].meta["val"].shape[0], torch.SymInt) + u0 = nonzero_nodes[0].meta["val"].shape[0] FakeTensorProp(gm, fake_mode).propagate(value) - u1 = nonzero_nodes[0].meta['val'].shape[0] + u1 = nonzero_nodes[0].meta["val"].shape[0] # Test that this test is actually doing something in that the # FakeTensorProp actually triggered a reallocation. 
If this assert is # failing, it could be because we started memoizing the nnz count for @@ -1420,9 +1534,7 @@ def f(x): self.assertIsNot(u0, u1) self.assertTrue(statically_known_true(u0 == u1)) - def test_torch_load_with_fake_mode(self): - class TheModelClass(torch.nn.Module): def __init__(self): super().__init__() @@ -1475,7 +1587,8 @@ def test_shape_env_settings(self): """ init_sig = inspect.signature(ShapeEnv._init) args = [ - name for name, param in init_sig.parameters.items() + name + for name, param in init_sig.parameters.items() if type(param.default) is bool ] @@ -1783,5 +1896,6 @@ def test_inference_mode(self): extract_tensor_metadata(res4), ) + if __name__ == "__main__": run_tests() diff --git a/test/test_flop_counter.py b/test/test_flop_counter.py index 43f5cb9dadf4..4f9c7020c0e6 100644 --- a/test/test_flop_counter.py +++ b/test/test_flop_counter.py @@ -1,15 +1,24 @@ # Owner(s): ["module: unknown"] +import functools +import unittest + import torch -from torch.testing._internal.common_utils import TestCase, run_tests, TEST_WITH_TORCHDYNAMO -from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FLASH_ATTENTION, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION -import torch.utils.flop_counter import torch.nn.functional as F -import unittest -import functools +import torch.utils.flop_counter +from torch.testing._internal.common_cuda import ( + PLATFORM_SUPPORTS_FLASH_ATTENTION, + PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, +) +from torch.testing._internal.common_utils import ( + run_tests, + TEST_WITH_TORCHDYNAMO, + TestCase, +) try: from torchvision import models as torchvision_models + HAS_TORCHVISION = True except ImportError: HAS_TORCHVISION = False @@ -17,16 +26,22 @@ HAS_CUDA = torch.cuda.is_available() + def FlopCounterMode(*args, **kwargs): return torch.utils.flop_counter.FlopCounterMode(*args, **kwargs, display=False) + def get_total_flops(mode): return str(sum(v for _, v in mode.flop_counts["Global"].items())) + def T(*shape, requires_grad=False): return torch.randn(*shape, requires_grad=requires_grad) -@unittest.skipIf(TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now") + +@unittest.skipIf( + TEST_WITH_TORCHDYNAMO, "torchdynamo doesn't work with __torch_dispatch__ right now" +) class TestFlopCounter(TestCase): def test_flop_counter_variety(self): mod = torch.nn.Linear(9, 10) @@ -109,6 +124,7 @@ def test_backward_reset(self): def test_torchscript(self): def foo(x): return torch.mm(x, x) + with FlopCounterMode() as mode: foo(T(5, 5)) unscripted_flops = get_total_flops(mode) @@ -125,7 +141,9 @@ def forward(ctx, input: torch.Tensor) -> torch.Tensor: @staticmethod def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: - return torch.mm(grad_output, grad_output) + torch.mm(grad_output, grad_output) + return torch.mm(grad_output, grad_output) + torch.mm( + grad_output, grad_output + ) a = T(5, 5, requires_grad=True) with FlopCounterMode() as mode: @@ -160,11 +178,13 @@ def backward(ctx, grad_out): return grad_inp, grad_weight, None else: grad_inp = F.conv1d(grad_out, weight) - grad_weight = F.conv1d(grad_out.transpose(1, 0), inp.transpose(1, 0)) + grad_weight = F.conv1d( + grad_out.transpose(1, 0), inp.transpose(1, 0) + ) return grad_inp, grad_weight.transpose(1, 0), None - from torch.func import grad + x = torch.randn(2, 3, 16, dtype=torch.float64) weight = torch.randn(3, 4, 4, dtype=torch.float64) @@ -182,13 +202,16 @@ def only_convs(x, weight, transposed): self.assertEqual(boring_grads, fun_grads) - def test_convs(self): def 
assert_equivalence(f, expected_forward=None): with FlopCounterMode() as mode: f() - conv_forward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution] - conv_backward_flops = mode.get_flop_counts()['Global'][torch.ops.aten.convolution_backward] + conv_forward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution + ] + conv_backward_flops = mode.get_flop_counts()["Global"][ + torch.ops.aten.convolution_backward + ] self.assertEqual(conv_forward_flops * 2, conv_backward_flops) if expected_forward is not None: @@ -213,8 +236,12 @@ def assert_equivalence(f, expected_forward=None): x = torch.rand(1, in_channels, 4, 4, requires_grad=True) weight = torch.randn(out_channels, in_channels, 2, 2, requires_grad=True) assert_equivalence(lambda: F.conv2d(x, weight).sum().backward()) - transposed_weight = torch.randn(in_channels, out_channels, 2, 2, requires_grad=True) - assert_equivalence(lambda: F.conv_transpose2d(x, transposed_weight).sum().backward()) + transposed_weight = torch.randn( + in_channels, out_channels, 2, 2, requires_grad=True + ) + assert_equivalence( + lambda: F.conv_transpose2d(x, transposed_weight).sum().backward() + ) @skipIfNoTorchVision def test_module(self): @@ -224,12 +251,15 @@ def test_module(self): resnet18(a).sum().backward() self.assertExpectedInline(get_total_flops(mode), """10884440064""") - layer1_conv_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution] - layer1_conv_back_flops = mode.flop_counts['ResNet.layer1'][torch.ops.aten.convolution_backward] + layer1_conv_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution + ] + layer1_conv_back_flops = mode.flop_counts["ResNet.layer1"][ + torch.ops.aten.convolution_backward + ] self.assertExpectedInline(str(layer1_conv_flops), """924844032""") self.assertExpectedInline(str(layer1_conv_back_flops), """1849688064""") - def test_conv_transpose_loop(self): x = torch.rand(1, 4, 30, 2) model = torch.nn.ConvTranspose2d(4, 8, (2, 2), stride=2) @@ -241,7 +271,9 @@ def test_conv_transpose_loop(self): self.assertExpectedInline(str(mode.get_total_flops()), """1536000""") def test_custom(self): - mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5}) + mode = FlopCounterMode( + custom_mapping={torch.ops.aten.add: lambda *args, out_shape: 5} + ) with mode: a = T(4, 5) a + a @@ -250,6 +282,7 @@ def test_custom(self): def count(*args, out_val): return out_val.numel() + count._get_raw = True mode = FlopCounterMode(custom_mapping={torch.ops.aten.add: count}) @@ -264,8 +297,11 @@ def test_noop(self): T(4, 5).cos() @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa(self): batch_size = 4 n_heads = 8 @@ -277,73 +313,154 @@ def test_sdpa(self): torch.manual_seed(0) - def get_flops(batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype, backend, with_backward=False): - query = torch.randn(batch_size, n_heads, seq_len_q, head_dim, device='cuda', dtype=dtype, requires_grad=True) - key = torch.randn(batch_size, n_heads, seq_len_k, head_dim, device='cuda', dtype=dtype, requires_grad=True) - value = torch.randn(batch_size, n_heads, seq_len_k, head_dim_v, device='cuda', dtype=dtype, 
requires_grad=True) + def get_flops( + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + backend, + with_backward=False, + ): + query = torch.randn( + batch_size, + n_heads, + seq_len_q, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + key = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim, + device="cuda", + dtype=dtype, + requires_grad=True, + ) + value = torch.randn( + batch_size, + n_heads, + seq_len_k, + head_dim_v, + device="cuda", + dtype=dtype, + requires_grad=True, + ) if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) mode = FlopCounterMode() with backend, mode: - out = F.scaled_dot_product_attention(query, key, value, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + query, key, value, dropout_p=0, is_causal=True + ) if with_backward: out.sum().backward() return int(get_total_flops(mode)) # Sets seq_len_q == seq_len_k and dim_q == dim_v - run_uniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_q, head_dim, head_dim, dtype) + run_uniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_q, + head_dim, + head_dim, + dtype, + ) - flops = [run_uniform_flops(backend, with_backward=False) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=False) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_math, flops_fw_flash, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_flash) self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """134217728""") - flops = [run_uniform_flops(backend, with_backward=True) for backend in ["math", "flash", "mem_efficient"]] + flops = [ + run_uniform_flops(backend, with_backward=True) + for backend in ["math", "flash", "mem_efficient"] + ] flops_fw_bw_math, flops_fw_bw_flash, flops_fw_bw_efficient = flops self.assertEqual(flops_fw_math * 3, flops_fw_bw_math) self.assertEqual(flops_fw_math * 7 // 2, flops_fw_bw_flash) self.assertEqual(flops_fw_bw_flash, flops_fw_bw_efficient) - - run_nonuniform_flops = functools.partial(get_flops, batch_size, n_heads, seq_len_q, seq_len_k, head_dim, head_dim_v, dtype) + run_nonuniform_flops = functools.partial( + get_flops, + batch_size, + n_heads, + seq_len_q, + seq_len_k, + head_dim, + head_dim_v, + dtype, + ) # Flash does not support non-uniform attention, i.e. 
seq_len_q != seq_len_k or dim_q != dim_v" non_uniform_backends = ["math", "mem_efficient"] - flops = [run_nonuniform_flops(backend, with_backward=False) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=False) + for backend in non_uniform_backends + ] flops_fw_math, flops_fw_efficient = flops self.assertEqual(flops_fw_math, flops_fw_efficient) self.assertExpectedInline(str(flops_fw_math), """268435456""") - flops = [run_nonuniform_flops(backend, with_backward=True) for backend in non_uniform_backends] + flops = [ + run_nonuniform_flops(backend, with_backward=True) + for backend in non_uniform_backends + ] flops_fw_bw_math, flops_fw_bw_efficient = flops self.assertExpectedInline(str(flops_fw_bw_math), """805306368""") self.assertExpectedInline(str(flops_fw_bw_efficient), """939524096""") @unittest.skipIf(not HAS_CUDA, "CUDA not available") - @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, - "Does not support all SDPA backends (pre-SM80 hardware on CUDA)") + @unittest.skipIf( + not PLATFORM_SUPPORTS_FLASH_ATTENTION + or not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, + "Does not support all SDPA backends (pre-SM80 hardware on CUDA)", + ) def test_sdpa_nested_tensor(self): - def get_flops(q, k, v, backend, with_backward=False): mode = FlopCounterMode() if backend == "math": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=True, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=True, enable_mem_efficient=False + ) elif backend == "flash": - backend = torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=True, enable_math=False, enable_mem_efficient=False + ) elif backend == "mem_efficient": - backend = torch.backends.cuda.sdp_kernel(enable_flash=False, enable_math=False, enable_mem_efficient=True) + backend = torch.backends.cuda.sdp_kernel( + enable_flash=False, enable_math=False, enable_mem_efficient=True + ) with backend, mode: - out = F.scaled_dot_product_attention(q, k, v, dropout_p=0, is_causal=True) + out = F.scaled_dot_product_attention( + q, k, v, dropout_p=0, is_causal=True + ) if with_backward: if out.is_nested: out.values().sum().backward() @@ -361,25 +478,47 @@ def get_nested_inputs( head_dim_v, dtype, ): - q_lengths = torch.tensor([ - max_seq_len_q // 4, - max_seq_len_q // 4 * 2, - max_seq_len_q // 4 * 3, - max_seq_len_q // 4 * 4 - ]) - k_lengths = torch.tensor([ - max_seq_len_k // 4, - max_seq_len_k // 4 * 2, - max_seq_len_k // 4 * 3, - max_seq_len_k // 4 * 4 - ]) + q_lengths = torch.tensor( + [ + max_seq_len_q // 4, + max_seq_len_q // 4 * 2, + max_seq_len_q // 4 * 3, + max_seq_len_q // 4 * 4, + ] + ) + k_lengths = torch.tensor( + [ + max_seq_len_k // 4, + max_seq_len_k // 4 * 2, + max_seq_len_k // 4 * 3, + max_seq_len_k // 4 * 4, + ] + ) q_offsets, k_offsets = ( torch.cat((torch.tensor([0]), torch.cumsum(lengths, dim=0))).cuda() for lengths in (q_lengths, k_lengths) ) - q_values = torch.randn(q_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - k_values = torch.randn(k_offsets[-1], head_dim * n_heads, dtype=dtype, requires_grad=True, device="cuda") - v_values = torch.randn(k_offsets[-1], head_dim_v * n_heads, dtype=dtype, requires_grad=True, device="cuda") + q_values = torch.randn( + q_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + 
k_values = torch.randn( + k_offsets[-1], + head_dim * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) + v_values = torch.randn( + k_offsets[-1], + head_dim_v * n_heads, + dtype=dtype, + requires_grad=True, + device="cuda", + ) q = torch.nested.nested_tensor_from_jagged(q_values, q_offsets) k = torch.nested.nested_tensor_from_jagged(k_values, k_offsets) @@ -397,13 +536,16 @@ def split_tensor(x): y.unsqueeze(0).transpose(1, 2).detach().requires_grad_(True) for y in x.transpose(1, 2).unbind(0) ) + q_tensors = split_tensor(q) k_tensors = split_tensor(k) v_tensors = split_tensor(v) flops = 0 for q_i, k_i, v_i in zip(q_tensors, k_tensors, v_tensors): - flops += get_flops(q_i, k_i, v_i, backend=backend, with_backward=with_backward) + flops += get_flops( + q_i, k_i, v_i, backend=backend, with_backward=with_backward + ) return flops @@ -429,29 +571,77 @@ def split_tensor(x): } self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=False), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=False, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="flash", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="flash", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**uniform_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**uniform_config), + backend="mem_efficient", + with_backward=True, + ), ) self.assertEqual( - get_dense_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), - get_flops(*get_nested_inputs(**differing_config), backend="mem_efficient", with_backward=True), + get_dense_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), + get_flops( + *get_nested_inputs(**differing_config), + backend="mem_efficient", + with_backward=True, + ), ) def test_addmm_out(self): @@ -479,8 +669,8 @@ def 
test_hook_registration(self): def test_pytrees(self): class Foo(torch.nn.Module): def forward(self, x): - x = x['a'].relu_() - return {'a': torch.mm(x, x)} + x = x["a"].relu_() + return {"a": torch.mm(x, x)} class Mod(torch.nn.Module): def __init__(self): @@ -493,8 +683,12 @@ def forward(self, x): mod = Mod() with FlopCounterMode() as mode: - mod({'a': torch.randn(10, 10, requires_grad=True).clone()})['a'].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod'][torch.ops.aten.mm]), """12000""") + mod({"a": torch.randn(10, 10, requires_grad=True).clone()})[ + "a" + ].sum().backward() + self.assertExpectedInline( + (mode.flop_counts["Mod"][torch.ops.aten.mm]), """12000""" + ) class Mod2(torch.nn.Module): def forward(self, x): @@ -503,7 +697,9 @@ def forward(self, x): mod = Mod2() with FlopCounterMode() as mode: mod(torch.randn(10, 10, requires_grad=True))[0].sum().backward() - self.assertExpectedInline((mode.flop_counts['Mod2'][torch.ops.aten.mm]), """6000""") + self.assertExpectedInline( + (mode.flop_counts["Mod2"][torch.ops.aten.mm]), """6000""" + ) def test_warning(self): mod = torch.nn.Linear(2, 2) @@ -511,5 +707,5 @@ def test_warning(self): FlopCounterMode(mod) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_function_schema.py b/test/test_function_schema.py index 47586147dbbc..439a3c66d3f0 100644 --- a/test/test_function_schema.py +++ b/test/test_function_schema.py @@ -1,8 +1,8 @@ # Owner(s): ["module: unknown"] import torch -from torch.testing._internal.common_utils import TestCase, run_tests from torch._C import parse_schema +from torch.testing._internal.common_utils import run_tests, TestCase class TestFunctionSchema(TestCase): @@ -16,216 +16,306 @@ def test_serialize_and_deserialize(self): self.assertTrue(parsed_schema.is_backward_compatible_with(schema)) def test_out_schema(self): - schema_with_out = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema_with_out = parse_schema( + "any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertTrue(schema_with_out.arguments[-1].is_out) - schema_without_out = parse_schema('any.not_out(Tensor self, Tensor b) -> Tensor') + schema_without_out = parse_schema( + "any.not_out(Tensor self, Tensor b) -> Tensor" + ) self.assertFalse(schema_without_out.arguments[-1].is_out) def test_hash_schema(self): - schema1 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') - schema2 = parse_schema('any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema1 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + schema2 = parse_schema("any.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") self.assertEqual(hash(schema1), hash(schema2)) - schema3 = parse_schema('any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)') + schema3 = parse_schema( + "any.not_out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema3)) - schema4 = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + schema4 = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertNotEqual(hash(schema2), hash(schema4)) # schemas with different default value, or different kw-only arg, should have different hash - default_val_schema0 = parse_schema('foo(Tensor self, int a = 2) -> Tensor(a!)') - default_val_schema1 = parse_schema('foo(Tensor self, int a = 3) -> Tensor(a!)') - default_val_schema2 = parse_schema('foo(Tensor self, *, int a = 2) -> Tensor(a!)') + default_val_schema0 = parse_schema("foo(Tensor self, int a = 2) -> Tensor(a!)") + default_val_schema1 = parse_schema("foo(Tensor self, int a = 3) -> Tensor(a!)") + default_val_schema2 = parse_schema( + "foo(Tensor self, *, int a = 2) -> Tensor(a!)" + ) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema1)) self.assertNotEqual(hash(default_val_schema0), hash(default_val_schema2)) # schema with different alias annotation should have different hash - alias_schema = parse_schema('foo(Tensor(a!) self, int a = 2) -> Tensor(a!)') + alias_schema = parse_schema("foo(Tensor(a!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(default_val_schema0), hash(alias_schema)) - alias_schema2 = parse_schema('foo(Tensor(b!) self, int a = 2) -> Tensor(a!)') + alias_schema2 = parse_schema("foo(Tensor(b!) self, int a = 2) -> Tensor(a!)") self.assertNotEqual(hash(alias_schema), hash(alias_schema2)) # schema with different alias infos - alias_schema3 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - alias_schema4 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)') - alias_schema5 = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)') + alias_schema3 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + alias_schema4 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(b!)" + ) + alias_schema5 = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) out, Tensor(a!) b) -> Tensor(a!)" + ) self.assertNotEqual(hash(alias_schema3), hash(alias_schema4)) self.assertNotEqual(hash(alias_schema3), hash(alias_schema5)) def test_backward_compatible_structure(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # BC: A new schema without changes. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different name. - new_schema = parse_schema('any_.over(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any_.over(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different overload name. - new_schema = parse_schema('any.other(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any.other(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema that adds vararg. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b, ...) -> Tensor') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b, ...) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with different number of outputs. 
- new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)') + new_schema = parse_schema( + "any.over(Tensor self, *, Tensor b) -> (Tensor, Tensor)" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_outputs(self): - old_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor') + old_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor") # No-BC: A new schema with output becoming of optional type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor?') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor?") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) An schema where the output is not of optional type anymore. self.assertTrue(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> int') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> int") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with a different output type. - new_schema = parse_schema('any.over(Tensor self, *, Tensor b) -> Tensor out') + new_schema = parse_schema("any.over(Tensor self, *, Tensor b) -> Tensor out") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_arguments(self): - old_schema = parse_schema('any(Tensor self, *, Tensor b, int c) -> Tensor') + old_schema = parse_schema("any(Tensor self, *, Tensor b, int c) -> Tensor") # No-BC: A new schema with less arguments. - new_schema = parse_schema('any(Tensor self, *, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, appended, but no default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema with more arguments, appended, that have a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor" + ) self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema with more arguments, not-appended, that have a default value. - new_schema = parse_schema('any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int d=1, *, Tensor b, int c) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where old kwargs becomes positional. - new_schema = parse_schema('any(Tensor self, Tensor b, *, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, *, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where an old positional argument becomes kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # BC: A new schema where all old kwargs become positional. 
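For context on the API these tests exercise: parse_schema returns a FunctionSchema, and is_backward_compatible_with implements the rules spelled out in the comments above. A minimal sketch, reusing two of the schema pairs asserted in these hunks:

    from torch._C import parse_schema

    old = parse_schema("any(Tensor self, *, Tensor b, int c) -> Tensor")

    # BC: appending an argument that has a default value keeps old call sites valid.
    new = parse_schema("any(Tensor self, *, Tensor b, int c, int d=1) -> Tensor")
    assert new.is_backward_compatible_with(old)

    # No-BC: renaming an argument breaks existing keyword callers.
    renamed = parse_schema("any(Tensor self, *, Tensor b, int renamed) -> Tensor")
    assert not renamed.is_backward_compatible_with(old)
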
- new_schema = parse_schema('any(Tensor self, Tensor b, int c) -> Tensor') + new_schema = parse_schema("any(Tensor self, Tensor b, int c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: (the opposite case) A new schema where all old positional arguments become kwarg. self.assertFalse(old_schema.is_backward_compatible_with(new_schema)) # No-BC: A new schema where old kwargs appear in different order. - new_schema = parse_schema('any(Tensor self, *, int c, Tensor b) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, int c, Tensor b) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument becomes of type optional. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int? c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int? c) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # BC: A new schema where argument gains a default value. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int c=1) -> Tensor") self.assertTrue(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument is "renamed". - new_schema = parse_schema('any(Tensor self, *, Tensor b, int renamed) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, *, Tensor b, int renamed) -> Tensor" + ) self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) # No-BC: A new schema where argument type changes to an incompatible type. - new_schema = parse_schema('any(Tensor self, *, Tensor b, int[] c) -> Tensor') + new_schema = parse_schema("any(Tensor self, *, Tensor b, int[] c) -> Tensor") self.assertFalse(new_schema.is_backward_compatible_with(old_schema)) def test_backward_compatible_with_smart_serialization(self): # cases where out arg is provided - old_schema = parse_schema('foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_same_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema_wrong_default = parse_schema('foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)') - new_schema_more_out = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)') - new_schema_wrong_pos = parse_schema('foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "foo(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_same_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_wrong_default = parse_schema( + "foo(Tensor self, *, int b=1, int a, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema_more_out = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(a!) out, Tensor(b!) b) -> Tensor(a!)" + ) + new_schema_wrong_pos = parse_schema( + "foo(Tensor self, *, int a, int b=1, Tensor(b!) b, Tensor(a!) 
out) -> Tensor(a!)" + ) self.assertTrue(new_schema_same_out.is_backward_compatible_with(old_schema)) self.assertTrue(new_schema_more_out.is_backward_compatible_with(old_schema)) - self.assertFalse(new_schema_wrong_default.is_backward_compatible_with(old_schema)) + self.assertFalse( + new_schema_wrong_default.is_backward_compatible_with(old_schema) + ) self.assertFalse(new_schema_wrong_pos.is_backward_compatible_with(old_schema)) # cases where out arg is not provided - old_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1) -> int') - new_schema_without_arg = parse_schema('foo(Tensor self, int a, int b=1, int c=2) -> int') - new_schema_without_arg_multiple_default = parse_schema('foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int') - new_schema_without_arg_wrong_pos = parse_schema('foo(Tensor self, int a, int c=2, int b=1) -> int') - self.assertTrue(new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg)) - self.assertTrue(new_schema_without_arg_multiple_default.is_backward_compatible_with(old_schema_without_arg)) - self.assertFalse(new_schema_without_arg_wrong_pos.is_backward_compatible_with(old_schema_without_arg)) + old_schema_without_arg = parse_schema("foo(Tensor self, int a, int b=1) -> int") + new_schema_without_arg = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2) -> int" + ) + new_schema_without_arg_multiple_default = parse_schema( + "foo(Tensor self, int a, int b=1, int c=2, int d=3) -> int" + ) + new_schema_without_arg_wrong_pos = parse_schema( + "foo(Tensor self, int a, int c=2, int b=1) -> int" + ) + self.assertTrue( + new_schema_without_arg.is_backward_compatible_with(old_schema_without_arg) + ) + self.assertTrue( + new_schema_without_arg_multiple_default.is_backward_compatible_with( + old_schema_without_arg + ) + ) + self.assertFalse( + new_schema_without_arg_wrong_pos.is_backward_compatible_with( + old_schema_without_arg + ) + ) def test_string_optional_parameter_default_value(self): - schema_a = parse_schema("example::op(str? order=\"NCHW\") -> (Tensor)") + schema_a = parse_schema('example::op(str? 
order="NCHW") -> (Tensor)') schema_b = parse_schema(str(schema_a)) self.assertEqual(schema_a, schema_b) def test_forward_compatible_arguments_without_out(self): - old_schema = parse_schema('any(Tensor self, int a, int b=1) -> Tensor') + old_schema = parse_schema("any(Tensor self, int a, int b=1) -> Tensor") # deleting default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg is FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=1, int c=1) -> Tensor") is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) # adding default arg with container type is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor') + new_schema = parse_schema( + "any(Tensor self, int a, int b=1, int[2] c=1) -> Tensor" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema is not forward compatible since the new argument" - " \'c\' of type int[] has a container type as its default value.") + self.assertEqual( + reason, + "Function schema is not forward compatible since the new argument" + " 'c' of type int[] has a container type as its default value.", + ) # updating the default value of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b=4) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b=4) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + ) # updating the arg name of a default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # not adding default arg in the end is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int c=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int c=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'c\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'c' is not forward compatible with the older version of the schema" + ) # making default arg into positional arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a, int b) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a, int b) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'b\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'b' is not forward compatible with the older version of the schema" + 
) # making positional arg into default arg is NOT FC compatible - new_schema = parse_schema('any(Tensor self, int a=1, int b=1) -> Tensor') + new_schema = parse_schema("any(Tensor self, int a=1, int b=1) -> Tensor") is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'a\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, "'a' is not forward compatible with the older version of the schema" + ) def test_forward_compatible_arguments_real_use_case(self): # this change introduced forward incompatibility in the past - old_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)') - new_slice_schema = parse_schema('slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)') + old_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int start=0, int end=0, int step=1) -> Tensor(a)" + ) + new_slice_schema = parse_schema( + "slice(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)" + ) is_fc, reason = new_slice_schema.check_forward_compatible_with(old_slice_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "\'start\' is not forward compatible with the older version of the schema") + self.assertEqual( + reason, + "'start' is not forward compatible with the older version of the schema", + ) def test_forward_compatible_arguments_with_out(self): - old_schema = parse_schema('any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)') - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)') + old_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, int b=1, int c=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, _ = new_schema.check_forward_compatible_with(old_schema) self.assertTrue(is_fc) - new_schema = parse_schema('any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)') + new_schema = parse_schema( + "any(Tensor self, *, int a, Tensor(d!) d, int b=1, Tensor(a!) out) -> Tensor(a!)" + ) is_fc, reason = new_schema.check_forward_compatible_with(old_schema) self.assertFalse(is_fc) - self.assertEqual(reason, "Function schema should have the same number of out arguments") + self.assertEqual( + reason, "Function schema should have the same number of out arguments" + ) def test_schema_error(self): - with self.assertRaisesRegex(RuntimeError, r"schemas with vararg \(...\) can't have default value args"): + with self.assertRaisesRegex( + RuntimeError, r"schemas with vararg \(...\) can't have default value args" + ): schema = parse_schema("any.foo(int arg1, int arg2=0, ...)") def test_tensor_list_alias_annotation_properly_parsed(self): - schema_str = 'foo(Tensor self, *, Tensor(a!)[] out) -> ()' + schema_str = "foo(Tensor self, *, Tensor(a!)[] out) -> ()" schema = parse_schema(schema_str) self.assertTrue(schema.arguments[-1].alias_info.is_write) self.assertEqual(str(schema), schema_str) def test_tensor_option_arguments_properly_parsed(self): - schema_str = '_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? 
layout=None, Device? device=None, ' \ - 'bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor' + schema_str = ( + "_to_copy(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, " + "bool? pin_memory=None, bool non_blocking=False, MemoryFormat? memory_format=None) -> Tensor" + ) schema = parse_schema(schema_str) # fake type of MemoryFormat? is int? self.assertEqual(schema.arguments[-1].type.str(), "int?") @@ -237,7 +327,7 @@ def test_tensor_option_arguments_properly_parsed(self): self.assertEqual(str(schema), schema_str) def test_sym_int_argument_properly_parsed(self): - schema_str = 'sym_size.int(Tensor self, int dim) -> SymInt' + schema_str = "sym_size.int(Tensor self, int dim) -> SymInt" schema = parse_schema(schema_str) # fake type of SymInt is int self.assertEqual(schema.returns[-1].type.str(), "int") @@ -247,5 +337,5 @@ def test_sym_int_argument_properly_parsed(self): self.assertEqual(str(schema), schema_str) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_autograd_benchmark.py b/test/test_functional_autograd_benchmark.py index 57a67ccead89..b0141479dd38 100644 --- a/test/test_functional_autograd_benchmark.py +++ b/test/test_functional_autograd_benchmark.py @@ -1,14 +1,21 @@ # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests, slowTest, IS_WINDOWS +import os import subprocess import tempfile -import os import unittest +from torch.testing._internal.common_utils import ( + IS_WINDOWS, + run_tests, + slowTest, + TestCase, +) + PYTORCH_COLLECT_COVERAGE = bool(os.environ.get("PYTORCH_COLLECT_COVERAGE")) + # This is a very simple smoke test for the functional autograd benchmarking script. class TestFunctionalAutogradBenchmark(TestCase): def _test_runner(self, model, disable_gpu=False): @@ -17,18 +24,20 @@ def _test_runner(self, model, disable_gpu=False): # is not allowed to open it again. As this is a simple smoke test, we choose for now # not to run this on windows and keep the code here simple. 
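A note on the Windows caveat in the comment above: on Windows, a NamedTemporaryFile that is still open cannot be opened again by name, for example by a child process. A common workaround, sketched here for illustration only (the test simply skips Windows instead), is delete=False plus a manual unlink:

    import os
    import tempfile

    f = tempfile.NamedTemporaryFile(delete=False)
    try:
        f.close()  # release the handle so another process can open f.name
        # ... hand f.name to a subprocess here ...
    finally:
        os.unlink(f.name)  # delete=False means we must clean up ourselves
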
with tempfile.NamedTemporaryFile() as out_file: - cmd = ['python3', - '../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py'] + cmd = [ + "python3", + "../benchmarks/functional_autograd_benchmark/functional_autograd_benchmark.py", + ] # Only run the warmup - cmd += ['--num-iters', '0'] + cmd += ["--num-iters", "0"] # Only run the vjp task (fastest one) - cmd += ['--task-filter', 'vjp'] + cmd += ["--task-filter", "vjp"] # Only run the specified model - cmd += ['--model-filter', model] + cmd += ["--model-filter", model] # Output file - cmd += ['--output', out_file.name] + cmd += ["--output", out_file.name] if disable_gpu: - cmd += ['--gpu', '-1'] + cmd += ["--gpu", "-1"] res = subprocess.run(cmd) @@ -37,20 +46,34 @@ def _test_runner(self, model, disable_gpu=False): out_file.seek(0, os.SEEK_END) self.assertTrue(out_file.tell() > 0) - - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") - @unittest.skipIf(PYTORCH_COLLECT_COVERAGE, "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) + @unittest.skipIf( + PYTORCH_COLLECT_COVERAGE, + "Can deadlocks with gcov, see https://github.com/pytorch/pytorch/issues/49656", + ) def test_fast_tasks(self): - fast_tasks = ['resnet18', 'ppl_simple_reg', 'ppl_robust_reg', 'wav2letter', - 'transformer', 'multiheadattn'] + fast_tasks = [ + "resnet18", + "ppl_simple_reg", + "ppl_robust_reg", + "wav2letter", + "transformer", + "multiheadattn", + ] for task in fast_tasks: self._test_runner(task) @slowTest - @unittest.skipIf(IS_WINDOWS, "NamedTemporaryFile on windows does not have all the features we need.") + @unittest.skipIf( + IS_WINDOWS, + "NamedTemporaryFile on windows does not have all the features we need.", + ) def test_slow_tasks(self): - slow_tasks = ['fcn_resnet', 'detr'] + slow_tasks = ["fcn_resnet", "detr"] # deepspeech is voluntarily excluded as it takes too long to run without # proper tuning of the number of threads it should use. 
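The fast/slow split above relies on the @slowTest decorator from torch.testing._internal.common_utils, which skips a test unless slow tests are enabled via PYTORCH_TEST_WITH_SLOW=1. A minimal illustration; the class and test names are hypothetical:

    import torch
    from torch.testing._internal.common_utils import run_tests, slowTest, TestCase

    class ExampleSmokeTest(TestCase):
        @slowTest  # skipped unless PYTORCH_TEST_WITH_SLOW=1 is set
        def test_expensive_model(self):
            self.assertEqual(torch.ones(1).item(), 1.0)

    if __name__ == "__main__":
        run_tests()
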
@@ -59,5 +82,5 @@ def test_slow_tasks(self): self._test_runner(task, disable_gpu=True) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_functional_optim.py b/test/test_functional_optim.py index da3d40d305e3..5e2a1e67e015 100644 --- a/test/test_functional_optim.py +++ b/test/test_functional_optim.py @@ -1,15 +1,16 @@ # Owner(s): ["oncall: distributed"] -from typing import List, Optional, Tuple import unittest +from typing import List, Optional, Tuple import torch import torch.distributed import torch.nn as nn import torch.nn.functional as F from torch import Tensor -from torch.optim import SGD, Adam, AdamW -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.optim import Adam, AdamW, SGD +from torch.testing._internal.common_utils import run_tests, TestCase + class MyModule(torch.nn.Module): def __init__(self): @@ -21,6 +22,7 @@ def __init__(self): def forward(self, t1): return self.lin2(F.relu(self.lin1(t1))) + # dummy class to showcase custom optimizer registration with functional wrapper class MyDummyFnOptimizer: def __init__( @@ -32,7 +34,6 @@ def __init__( weight_decay: float = 0.0, _allow_empty_param_list: bool = False, ): - if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: @@ -58,17 +59,26 @@ def __init__( def step_param(self, param: Tensor, grad: Optional[Tensor]): # call the custom optimizer step_param implementation with torch.no_grad(): - raise RuntimeError("MyDummyFnOptimizer does not support step_param() as of now") + raise RuntimeError( + "MyDummyFnOptimizer does not support step_param() as of now" + ) def step(self, gradients: List[Optional[Tensor]]): # call the custom optimizer step implementation with torch.no_grad(): raise RuntimeError("MyDummyFnOptimizer does not support step() as of now") + if torch.distributed.is_available(): - from torch.distributed.optim.utils import functional_optim_map, register_functional_optim + from torch.distributed.optim.utils import ( + functional_optim_map, + register_functional_optim, + ) + -@unittest.skipIf(not torch.distributed.is_available(), "These are testing distributed functions") +@unittest.skipIf( + not torch.distributed.is_available(), "These are testing distributed functions" +) class TestFunctionalOptimParity(TestCase): def _validate_parameters(self, params_1, params_2): for p1, p2 in zip(params_1, params_2): diff --git a/test/test_functionalization_of_rng_ops.py b/test/test_functionalization_of_rng_ops.py index b2ac62e4f278..bba22ff34a0b 100644 --- a/test/test_functionalization_of_rng_ops.py +++ b/test/test_functionalization_of_rng_ops.py @@ -1,36 +1,34 @@ # Owner(s): ["oncall: pt2"] +import functools import sys import unittest -import torch -from torch.testing._internal.common_utils import ( - TestCase, - run_tests, -) - -from torch.testing._internal.common_device_type import instantiate_device_type_tests, dtypes -from functorch.compile import aot_function, nop, min_cut_rematerialization_partition from unittest.mock import patch -import functools -import torch.utils.checkpoint +import torch +import torch.utils.checkpoint +from functorch.compile import aot_function, min_cut_rematerialization_partition, nop -from torch.testing._internal.common_utils import ( - IS_CI, - IS_WINDOWS, +from torch.testing._internal.common_device_type import ( + dtypes, + instantiate_device_type_tests, ) +from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, run_tests, TestCase + if IS_WINDOWS and IS_CI: - sys.stderr.write( - 
"torch.compile not supported on windows" - ) + sys.stderr.write("torch.compile not supported on windows") if __name__ == "__main__": sys.exit(0) raise unittest.SkipTest("torch.compile not supported on windows") + def count_philox_rand(gm, args, freq): - assert [node.target for node in gm.graph.nodes].count(torch.ops.rngprims.philox_rand.default) == freq + assert [node.target for node in gm.graph.nodes].count( + torch.ops.rngprims.philox_rand.default + ) == freq return gm + class TestFunctionalizationRngOps(TestCase): @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) @@ -72,8 +70,6 @@ def fn(x): self.assertEqual(ref, res) - - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand_like_dynamic_bwd(self, dtype, device): @@ -96,7 +92,6 @@ def fn(x): self.assertEqual(ref, res) - @dtypes(torch.float32) @patch.object(torch._functorch.config, "functionalize_rng_ops", True) def test_rand(self, dtype, device): @@ -134,7 +129,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) custom = Custom.apply @@ -174,7 +169,7 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.cos(x) class CustomOp2(torch.autograd.Function): @@ -186,10 +181,9 @@ def forward(ctx, x): @staticmethod def backward(ctx, grad_out): - x, = ctx.saved_tensors + (x,) = ctx.saved_tensors return grad_out * torch.rand_like(grad_out) * torch.rand_like(x) - custom_op1 = CustomOp1.apply custom_op2 = CustomOp2.apply @@ -210,7 +204,6 @@ def aot_fn(x): b = a.sin() return aot_custom_op2(b) - for seed in range(10): torch.cuda.manual_seed(seed) x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) @@ -265,7 +258,6 @@ def fn(x): a = torch.sin(a) return a - x = torch.rand(*shape, device=device, dtype=dtype, requires_grad=True) x_clone = x.clone().detach().requires_grad_(True) @@ -277,7 +269,12 @@ def fn(x): torch.cuda.manual_seed(123) fwd_compiler = functools.partial(count_philox_rand, freq=2) bwd_compiler = functools.partial(count_philox_rand, freq=0) - aot_custom = aot_function(fn, fwd_compiler, bwd_compiler, partition_fn=min_cut_rematerialization_partition) + aot_custom = aot_function( + fn, + fwd_compiler, + bwd_compiler, + partition_fn=min_cut_rematerialization_partition, + ) # aot_custom = aot_function(fn, fwd_compiler, bwd_compiler) res = aot_custom(x_clone) res.sum().backward() diff --git a/test/test_fx.py b/test/test_fx.py index eadcd750aede..a58abb906d89 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -4078,7 +4078,7 @@ def test_function_back_compat(self): f"unintended, please revert it. If it was intended, check with the FX " \ f"team to ensure that the proper deprecation protocols have been followed " \ f"and subsequently --accept the change." 
- raise AssertionError(msg) # noqa: TRY200 + raise AssertionError(msg) # noqa: B904 def test_class_member_back_compat(self): """ diff --git a/test/test_linalg.py b/test/test_linalg.py index a0860adb2a19..eec45db75383 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -185,7 +185,7 @@ def test_linalg_lstsq(self, device, dtype): if self.device_type == 'cpu': drivers = ('gels', 'gelsy', 'gelsd', 'gelss', None) else: - drivers = ('gels', None) + drivers = ('gels', 'gelss') def check_solution_correctness(a, b, sol): sol2 = a.pinverse() @ b @@ -219,8 +219,8 @@ def select_if_not_empty(t, i): if a.numel() > 0: for i in range(batch_size): sol, residuals, rank, singular_values = ref( - a_3d.select(0, i).numpy(), - b_3d.select(0, i).numpy() + a_3d.select(0, i).cpu().numpy(), + b_3d.select(0, i).cpu().numpy() ) # Singular values are None when lapack_driver='gelsy' in SciPy if singular_values is None: @@ -337,6 +337,9 @@ def check_correctness(a, b): sol = torch.linalg.lstsq(a, b).solution sol2 = a.pinverse() @ b self.assertEqual(sol, sol2, rtol=1e-5, atol=1e-5) + sol_svd = torch.linalg.lstsq(a, b, driver='gelss').solution + sol2_svd = a.pinverse() @ b + self.assertEqual(sol_svd, sol2_svd, rtol=1e-5, atol=1e-5) ms = [2 ** i for i in range(5)] batches = [(), (0,), (2,), (2, 2), (2, 2, 2)] @@ -378,6 +381,10 @@ def test_linalg_lstsq_input_checks(self, device, dtype): torch.linalg.lstsq(a, b)[0], torch.zeros(0, 0, 3, 2, dtype=dtype, device=device) ) + self.assertEqual( + torch.linalg.lstsq(a, b, driver='gelss')[0], + torch.zeros(0, 0, 3, 2, dtype=dtype, device=device) + ) # empty a and b a = torch.rand(2, 2, 0, 0, dtype=dtype, device=device) b = torch.rand(2, 2, 0, 0, dtype=dtype, device=device) @@ -385,6 +392,10 @@ def test_linalg_lstsq_input_checks(self, device, dtype): torch.linalg.lstsq(a, b)[0], torch.zeros(2, 2, 0, 0, dtype=dtype, device=device) ) + self.assertEqual( + torch.linalg.lstsq(a, b, driver='gelss')[0], + torch.zeros(2, 2, 0, 0, dtype=dtype, device=device) + ) # empty a and b a = torch.rand(2, 2, 3, 0, dtype=dtype, device=device) b = torch.rand(2, 2, 3, 0, dtype=dtype, device=device) @@ -392,6 +403,10 @@ def test_linalg_lstsq_input_checks(self, device, dtype): torch.linalg.lstsq(a, b)[0], torch.zeros(2, 2, 0, 0, dtype=dtype, device=device) ) + self.assertEqual( + torch.linalg.lstsq(a, b, driver='gelss')[0], + torch.zeros(2, 2, 0, 0, dtype=dtype, device=device) + ) # empty a but not b a = torch.rand(2, 2, 3, 0, dtype=dtype, device=device) b = torch.rand(2, 2, 3, 2, dtype=dtype, device=device) @@ -399,6 +414,10 @@ def test_linalg_lstsq_input_checks(self, device, dtype): torch.linalg.lstsq(a, b)[0], torch.zeros(2, 2, 0, 2, dtype=dtype, device=device) ) + self.assertEqual( + torch.linalg.lstsq(a, b, driver='gelss')[0], + torch.zeros(2, 2, 0, 2, dtype=dtype, device=device) + ) # empty a and b if torch.device(device).type == 'cpu': @@ -445,7 +464,10 @@ def complement_device(device): b = torch.rand(2, 2, 2, dtype=dtype, device=device) if device != 'cpu': - with self.assertRaisesRegex(RuntimeError, '`driver` other than `gels` is not supported on CUDA'): + with self.assertRaisesRegex( + RuntimeError, + 'torch.linalg.lstsq: `driver` other than `gels` or `gelss` is not supported on CUDA' + ): torch.linalg.lstsq(a, b, driver='fictitious_driver') # if on cpu else: diff --git a/test/test_nn.py b/test/test_nn.py index 008354ad721e..76bc614f025d 100644 --- a/test/test_nn.py +++ b/test/test_nn.py @@ -8205,6 +8205,16 @@ def help(input, conv, memory_format): weight = torch.empty([1, 0, 1], 
dtype=dtype, device=device) torch._C._nn.slow_conv3d(inp, weight, 1) + with self.assertRaisesRegex(RuntimeError, re.escape("2D kernel_size expected")): + torch._C._nn.thnn_conv2d(torch.rand([1, 1, 1, 1]), kernel_size=[], padding=[1, 1], stride=[1, 1], + weight=torch.rand([1, 1])) + with self.assertRaisesRegex(RuntimeError, re.escape("2D stride expected")): + torch._C._nn.thnn_conv2d(torch.rand([1, 1, 1, 1]), kernel_size=[1, 1], padding=[1, 1], stride=[], + weight=torch.rand([1, 1])) + with self.assertRaisesRegex(RuntimeError, re.escape("2D padding expected")): + torch._C._nn.thnn_conv2d(torch.rand([1, 1, 1, 1]), kernel_size=[1, 1], padding=[], stride=[1, 1], + weight=torch.rand([1, 1])) + def test_InstanceNorm1d_general(self, device): b = random.randint(3, 5) c = random.randint(3, 5) diff --git a/test/test_optim.py b/test/test_optim.py index f875b4ed669e..7fa612e89da0 100644 --- a/test/test_optim.py +++ b/test/test_optim.py @@ -604,8 +604,16 @@ def _compare_between(self, inputs, models, optimizers, assert_eq_kwargs=None, as for input, model, optimizer in zip(inputs, models, optimizers): optimizer.zero_grad() + if i == 3: + # Freeze a layer to test if the step of this layer in 'fused' or 'foreach' + # is the same as the step in 'forloop'. + model[2].requires_grad_(False) + if i == 5: + # Unfreeze the layer after 2 iters. + model[2].requires_grad_(True) + # Test that step behaves as expected (a no-op) when grads are set to None - if i != 3: + if i != 2: output = model(input) loss = output.sum() loss.backward() @@ -699,8 +707,8 @@ def test_mixed_device_dtype(self, device, dtype, optim_info, impl): assert impl in ("foreach", "fused") if impl == "foreach" and "foreach" not in optim_info.supported_impls: return unittest.skip(f"foreach not supported for {optim_info.optim_cls.__name__}") - elif impl == "fused" and "fused" not in optim_info.supported_impls: - return unittest.skip(f"fused not supported for {optim_info.optim_cls.__name__}") + elif impl == "fused" and "cuda" not in optim_info.supports_fused_on: + return unittest.skip(f"fused not supported for {optim_info.optim_cls.__name__} on cuda") params = [ torch.rand(2, 3, dtype=torch.float64, device='cuda:0', requires_grad=True), @@ -911,6 +919,8 @@ def test_fused_large_tensor(self, device, dtype, optim_info): @onlyCUDA @optims([optim for optim in optim_db if "fused" in optim.supported_impls], dtypes=[torch.float32]) def test_fused_does_not_step_if_foundinf(self, device, dtype, optim_info): + if device not in optim_info.supports_fused_on: + self.skipTest(f"{device} is not supported for fused on {optim_info.optim_cls.__name__}") optim_cls = optim_info.optim_cls optim_inputs = optim_info.optim_inputs_func(device=device) num_params = 5 @@ -940,9 +950,12 @@ def test_cpu_load_state_dict(self, device, dtype, impl, optim_info): # Since this is a unit test, it is more expedient to simulate what the state_dict # would look like, which is basically CPU tensors with fused/capturable flag = True.
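The comment above describes the scenario being simulated: optimizer state produced with fused=True (or capturable=True) on an accelerator, then restored into a CPU optimizer. A rough, self-contained sketch of the real-world equivalent, assuming a CUDA build and picking Adam arbitrarily:

    import torch

    p_gpu = torch.nn.Parameter(torch.randn(2, 3, device="cuda"))
    opt_gpu = torch.optim.Adam([p_gpu], fused=True)
    p_gpu.grad = torch.randn_like(p_gpu)
    opt_gpu.step()  # materializes CUDA state tensors

    p_cpu = torch.nn.Parameter(p_gpu.detach().cpu().clone())
    opt_cpu = torch.optim.Adam([p_cpu])
    # fused=True rides along in the saved param_groups; floating-point state
    # is cast to the device/dtype of the new (CPU) params on load.
    opt_cpu.load_state_dict(opt_gpu.state_dict())
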
optim_cls = optim_info.optim_cls - if optim_cls.__name__ == "SGD" and impl == "capturable": - # Capturable SGD does not exist + opt_name = optim_cls.__name__ + if opt_name in ("SGD", "Adagrad", ) and impl == "capturable": + # Capturable SGD/Adagrad does not exist self.skipTest("SGD does not currently support capturable") + if impl == "fused" and device not in optim_info.supports_fused_on: + self.skipTest(f"{device} is not supported for fused on {opt_name}") cpu_optim_inputs = optim_info.optim_inputs_func(device="cpu") for optim_input in cpu_optim_inputs: @@ -1318,6 +1331,8 @@ def closure(): return closure_loss if optim_info.step_requires_closure else None for optim_input in cpu_optim_inputs: + if "fused" in optim_input.kwargs and "cuda" not in optim_info.supports_fused_on: + self.skipTest(f"cuda is not supported for fused on {optim_cls.__name__}") params = [Parameter(torch.randn(2, 3, device="cpu", dtype=dtype)) for _ in range(2)] for p in params: p.grad = torch.randn_like(p) diff --git a/test/test_public_bindings.py b/test/test_public_bindings.py index 59e535e1447d..1db0e5718ce6 100644 --- a/test/test_public_bindings.py +++ b/test/test_public_bindings.py @@ -1,6 +1,6 @@ # Owner(s): ["module: autograd"] -from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON, IS_WINDOWS +from torch.testing._internal.common_utils import TestCase, run_tests, IS_JETSON, IS_WINDOWS, IS_MACOS, skipIfTorchDynamo from torch._utils_internal import get_file_path_2 import pkgutil @@ -279,9 +279,11 @@ def _is_mod_public(modname): return True + @unittest.skipIf(IS_WINDOWS or IS_MACOS, "Inductor/Distributed modules hard fail on windows and macos") + @skipIfTorchDynamo("Broken and not relevant for now") def test_modules_can_be_imported(self): failures = [] - for _, modname, _ in _discover_path_importables(str(torch.__path__), "torch"): + for modname in _find_all_importables(torch): try: # TODO: fix "torch/utils/model_dump/__main__.py" # which calls sys.exit() when we try to import it @@ -298,6 +300,7 @@ def test_modules_can_be_imported(self): "torch._inductor.codegen.cuda.cuda_kernel", "torch.onnx._internal.fx._pass", "torch.onnx._internal.fx.analysis", + "torch.onnx._internal.fx.analysis.unsupported_nodes", "torch.onnx._internal.fx.decomposition_skip", "torch.onnx._internal.fx.diagnostics", "torch.onnx._internal.fx.fx_onnx_interpreter", @@ -305,6 +308,13 @@ def test_modules_can_be_imported(self): "torch.onnx._internal.fx.onnxfunction_dispatcher", "torch.onnx._internal.fx.op_validation", "torch.onnx._internal.fx.passes", + "torch.onnx._internal.fx.passes._utils", + "torch.onnx._internal.fx.passes.decomp", + "torch.onnx._internal.fx.passes.functionalization", + "torch.onnx._internal.fx.passes.modularization", + "torch.onnx._internal.fx.passes.readability", + "torch.onnx._internal.fx.passes.type_promotion", + "torch.onnx._internal.fx.passes.virtualization", "torch.onnx._internal.fx.type_utils", "torch.testing._internal.common_distributed", "torch.testing._internal.common_fsdp", @@ -371,6 +381,12 @@ def test_modules_can_be_imported(self): "torch.distributed.examples.memory_tracker_example", "torch.testing._internal.distributed.rpc.fb.thrift_rpc_agent_test_fixture", "torch.utils._cxx_pytree", + "torch.utils.tensorboard._convert_np", + "torch.utils.tensorboard._embedding", + "torch.utils.tensorboard._onnx_graph", + "torch.utils.tensorboard._proto_graph", + "torch.utils.tensorboard._pytorch_graph", + "torch.utils.tensorboard._utils", } # No new entries should be added to this list. 
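For readers unfamiliar with this test's machinery: _find_all_importables (which replaces the _discover_path_importables call here) yields the dotted name of every importable torch submodule, and the test imports each one, recording failures not excused by the allow-lists above. A rough stand-in built only from the standard library; iter_importables is an illustrative name, not the test's actual helper:

    import importlib
    import pkgutil

    import torch

    def iter_importables(pkg):
        # Walk the package tree, yielding dotted module names.
        for _, name, _ in pkgutil.walk_packages(pkg.__path__, prefix=pkg.__name__ + "."):
            yield name

    failures = []
    for modname in iter_importables(torch):
        try:
            importlib.import_module(modname)
        except Exception as err:  # the real test also consults an allow-list
            failures.append((modname, err))
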
@@ -408,6 +424,12 @@ def test_modules_can_be_imported(self): "torch.distributed.tensor.parallel", "torch.distributed.utils", "torch.utils.tensorboard", + "torch.utils.tensorboard.summary", + "torch.utils.tensorboard.writer", + "torch.ao.quantization.experimental.fake_quantize", + "torch.ao.quantization.experimental.linear", + "torch.ao.quantization.experimental.observer", + "torch.ao.quantization.experimental.qconfig", } errors = [] @@ -424,7 +446,8 @@ def test_modules_can_be_imported(self): self.assertEqual("", "\n".join(errors)) # AttributeError: module 'torch.distributed' has no attribute '_shard' - @unittest.skipIf(IS_WINDOWS or IS_JETSON, "Distributed Attribute Error") + @unittest.skipIf(IS_WINDOWS or IS_JETSON or IS_MACOS, "Distributed Attribute Error") + @skipIfTorchDynamo("Broken and not relevant for now") def test_correct_module_names(self): ''' An API is considered public, if its `__module__` starts with `torch.` @@ -536,7 +559,7 @@ def check_one_element(elem, modname, mod, *, is_public, is_all): if not elem.startswith('_'): check_one_element(elem, modname, mod, is_public=True, is_all=False) - for _, modname, _ in _discover_path_importables(str(torch.__path__), "torch"): + for modname in _find_all_importables(torch): test_module(modname) test_module('torch') @@ -546,6 +569,7 @@ def check_one_element(elem, modname, mod, *, is_public, is_all): msg += "Make sure that everything that is public is expected (in particular that the module " \ "has a properly populated `__all__` attribute) and that everything that is supposed to be public " \ "does look public (it does not start with `_` and has a `__module__` that is properly populated)." + msg += "\n\nFull list:\n" msg += "\n".join(map(str, failure_list)) diff --git a/test/test_reductions.py b/test/test_reductions.py index acce78958673..d1f72b49694f 100644 --- a/test/test_reductions.py +++ b/test/test_reductions.py @@ -19,6 +19,7 @@ ) from torch.testing._internal.common_utils import ( TestCase, run_tests, skipIfNoSciPy, slowTest, torch_to_numpy_dtype_dict, + parametrize, IS_WINDOWS) from torch.testing._internal.common_device_type import ( OpDTypes, expectedFailureMeta, instantiate_device_type_tests, onlyCPU, dtypes, dtypesIfCUDA, dtypesIfCPU, @@ -1219,10 +1220,18 @@ def test_amax(self, device, dtype): def test_aminmax(self, device, dtype): def _amin_wrapper(x, dim=None, keepdims=False): - return torch.aminmax(x, dim=dim, keepdim=keepdims)[0] + with self.assertWarnsOnceRegex(UserWarning, "_aminmax is deprecated"): + if dim is None: + return torch._aminmax(x)[0] + else: + return torch._aminmax(x, dim, keepdims)[0] def _amax_wrapper(x, dim=None, keepdims=False): - return torch.aminmax(x, dim=dim, keepdim=keepdims)[1] + with self.assertWarnsOnceRegex(UserWarning, "_aminmax is deprecated"): + if dim is None: + return torch._aminmax(x)[1] + else: + return torch._aminmax(x, dim, keepdims)[1] self._test_minmax_helper(_amin_wrapper, np.amin, device, dtype) self._test_minmax_helper(_amax_wrapper, np.amax, device, dtype) @@ -2219,65 +2228,67 @@ def test_dim_reduction(self, device, dtype): self.assertEqual(x[:, :2].amax().item(), 5) self.assertEqual(x[:, :2].argmax().item(), 2) - dim_red_fns = [ - "mean", "median", "nanmedian", "mode", "norm", "prod", - "std", "sum", "var", "max", "min", "amax", "amin"] + @precisionOverride({torch.float16: 1e-2, torch.bfloat16: 1e-2}) + @dtypes(*set(all_types_and(torch.half, torch.bfloat16)) - {torch.uint8}) + @parametrize("fn_name", [ + "mean", "median", "nanmedian", "mode", "norm", "prod", + "std", "sum", "var", 
"max", "min", "amax", "amin"]) + def test_dim_reduction_fns(self, device, dtype, fn_name): def normfn_attr(t, dim, keepdim=False, out=None): attr = torch.norm return attr(t, 2, dim, keepdim, out=out) - for fn_name in dim_red_fns: - fn_attr = getattr(torch, fn_name) if fn_name != "norm" else normfn_attr - - def fn(x, dim, keepdim=False, out=None): - ans = fn_attr(x, dim, keepdim=keepdim, out=out) - return ans if not isinstance(ans, tuple) else ans[0] - - def fn_tuple(x, dim, keepdim=False, out=None): - return fn_attr(x, dim, keepdim=keepdim, out=out) - - def test_multidim(x, dim): - self.assertEqual(fn(x, dim).unsqueeze(dim), fn(x, dim, keepdim=True)) - self.assertEqual(x.ndimension() - 1, fn(x, dim).ndimension()) - self.assertEqual(x.ndimension(), fn(x, dim, keepdim=True).ndimension()) - - # general case - x = torch.randn(3, 4, 5, device=device) - dim = random.randint(0, 2) - test_multidim(x, dim) - - # check 1-d behavior - x = torch.randn(1, device=device) - dim = 0 - self.assertEqual(fn(x, dim).shape, ()) - self.assertEqual(fn(x, dim, keepdim=True).shape, (1,)) - - # check reducing of a singleton dimension - dims = [3, 4, 5] - singleton_dim = random.randint(0, 2) - dims[singleton_dim] = 1 - x = torch.randn(dims, device=device) - test_multidim(x, singleton_dim) - - # check reducing with output kwargs - if fn_name in ['median', 'nanmedian', 'mode', 'max', 'min']: - y = torch.randn(5, 3, device=device) - values = torch.randn(5, 3, device=device) - indices = torch.zeros(5, 3, device=device).long() - 1 - fn_tuple(y, 1, keepdim=False, out=(values[:, 1], indices[:, 1])) - values_expected, indices_expected = fn_tuple(y, 1, keepdim=False) - self.assertEqual(values[:, 1], values_expected, - msg=f'{fn_name} values with out= kwarg') - self.assertEqual(indices[:, 1], indices_expected, - msg=f'{fn_name} indices with out= kwarg') - continue - - x = torch.randn(5, 3, device=device) + fn_attr = getattr(torch, fn_name) if fn_name != "norm" else normfn_attr + + def fn(x, dim, keepdim=False, out=None): + ans = fn_attr(x, dim, keepdim=keepdim, out=out) + return ans if not isinstance(ans, tuple) else ans[0] + + def fn_tuple(x, dim, keepdim=False, out=None): + return fn_attr(x, dim, keepdim=keepdim, out=out) + + def test_multidim(x, dim): + self.assertEqual(fn(x, dim).unsqueeze(dim), fn(x, dim, keepdim=True)) + self.assertEqual(x.ndimension() - 1, fn(x, dim).ndimension()) + self.assertEqual(x.ndimension(), fn(x, dim, keepdim=True).ndimension()) + + # general case + x = torch.randn(3, 4, 5, device=device) + dim = random.randint(0, 2) + test_multidim(x, dim) + + # check 1-d behavior + x = torch.randn(1, device=device) + dim = 0 + self.assertEqual(fn(x, dim).shape, ()) + self.assertEqual(fn(x, dim, keepdim=True).shape, (1,)) + + # check reducing of a singleton dimension + dims = [3, 4, 5] + singleton_dim = random.randint(0, 2) + dims[singleton_dim] = 1 + x = torch.randn(dims, device=device) + test_multidim(x, singleton_dim) + + # check reducing with output kwargs + if fn_name in ['median', 'nanmedian', 'mode', 'max', 'min']: y = torch.randn(5, 3, device=device) - fn(y, 1, keepdim=False, out=x[:, 1]) - expected = fn(y, 1, keepdim=False) - self.assertEqual(x[:, 1], expected, msg=f'{fn_name} with out= kwarg') + values = torch.randn(5, 3, device=device) + indices = torch.zeros(5, 3, device=device).long() - 1 + fn_tuple(y, 1, keepdim=False, out=(values[:, 1], indices[:, 1])) + values_expected, indices_expected = fn_tuple(y, 1, keepdim=False) + self.assertEqual(values[:, 1], values_expected, + msg=f'{fn_name} values 
with out= kwarg') + self.assertEqual(indices[:, 1], indices_expected, + msg=f'{fn_name} indices with out= kwarg') + return + + x = torch.randn(5, 3, device=device) + y = torch.randn(5, 3, device=device) + fn(y, 1, keepdim=False, out=x[:, 1]) + expected = fn(y, 1, keepdim=False) + self.assertEqual(x[:, 1], expected, msg=f'{fn_name} with out= kwarg') @onlyCUDA @largeTensorTest('10GB') diff --git a/test/test_serialization.py b/test/test_serialization.py index 49f8880885ec..1be1b06ab786 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -15,8 +15,10 @@ import shutil import pathlib import platform +from collections import OrderedDict from copy import deepcopy from itertools import product +from types import ModuleType from torch._utils_internal import get_file_path_2 from torch._utils import _rebuild_tensor @@ -27,9 +29,10 @@ from torch.testing._internal.common_utils import ( IS_FILESYSTEM_UTF8_ENCODING, TemporaryDirectoryName, TestCase, IS_FBCODE, IS_WINDOWS, TEST_DILL, run_tests, download_file, BytesIOContext, TemporaryFileName, - parametrize, instantiate_parametrized_tests, AlwaysWarnTypedStorageRemoval, serialTest) + parametrize, instantiate_parametrized_tests, AlwaysWarnTypedStorageRemoval, serialTest, skipIfTorchDynamo) from torch.testing._internal.common_device_type import instantiate_device_type_tests from torch.testing._internal.common_dtype import all_types_and_complex_and +from torch.testing._internal.two_tensor import TwoTensor # noqa: F401 if not IS_WINDOWS: from mmap import MAP_SHARED, MAP_PRIVATE @@ -493,6 +496,15 @@ def test_serialization_map_location(self): def map_location(storage, loc): return storage + def generate_map_locations(device_type): + return [ + {'cuda:0': device_type + ':0'}, + device_type, + device_type + ':0', + torch.device(device_type), + torch.device(device_type, 0) + ] + def load_bytes(): with open(test_file_path, 'rb') as f: return io.BytesIO(f.read()) @@ -504,34 +516,39 @@ def load_bytes(): 'cpu', torch.device('cpu'), ] - gpu_0_map_locations = [ - {'cuda:0': 'cuda:0'}, - 'cuda', - 'cuda:0', - torch.device('cuda'), - torch.device('cuda', 0) - ] + gpu_0_map_locations = generate_map_locations('cuda') gpu_last_map_locations = [ f'cuda:{torch.cuda.device_count() - 1}', ] + xpu_0_map_locations = generate_map_locations('xpu') + xpu_last_map_locations = [ + f'xpu:{torch.xpu.device_count() - 1}', + ] - def check_map_locations(map_locations, tensor_class, intended_device): + def check_map_locations(map_locations, dtype, intended_device): for fileobject_lambda in fileobject_lambdas: for map_location in map_locations: tensor = torch.load(fileobject_lambda(), map_location=map_location) self.assertEqual(tensor.device, intended_device) - self.assertIsInstance(tensor, tensor_class) - self.assertEqual(tensor, tensor_class([[1.0, 2.0], [3.0, 4.0]])) + self.assertEqual(tensor.dtype, dtype) + self.assertEqual(tensor, torch.tensor([[1.0, 2.0], [3.0, 4.0]], dtype=dtype, device=intended_device)) - check_map_locations(cpu_map_locations, torch.FloatTensor, torch.device('cpu')) + check_map_locations(cpu_map_locations, torch.float, torch.device('cpu')) if torch.cuda.is_available(): - check_map_locations(gpu_0_map_locations, torch.cuda.FloatTensor, torch.device('cuda', 0)) + check_map_locations(gpu_0_map_locations, torch.float, torch.device('cuda', 0)) check_map_locations( gpu_last_map_locations, - torch.cuda.FloatTensor, + torch.float, torch.device('cuda', torch.cuda.device_count() - 1) ) + if torch.xpu.is_available(): + 
check_map_locations(xpu_0_map_locations, torch.float, torch.device('xpu', 0)) + check_map_locations( + xpu_last_map_locations, + torch.float, + torch.device('xpu', torch.xpu.device_count() - 1) + ) @unittest.skipIf(torch.cuda.is_available(), "Testing torch.load on CPU-only machine") def test_load_nonexistent_device(self): @@ -1024,7 +1041,7 @@ def __reduce__(self): self.assertIsNone(torch.load(f, weights_only=False)) f.seek(0) # Safe load should assert - with self.assertRaisesRegex(pickle.UnpicklingError, "Unsupported class"): + with self.assertRaisesRegex(pickle.UnpicklingError, "Unsupported global: GLOBAL __builtin__.print"): torch.load(f, weights_only=True) @parametrize('weights_only', (False, True)) @@ -4094,6 +4111,23 @@ def __setstate__(self, state): class TestEmptySubclass(torch.Tensor): ... +# ONLY use SubclassSpoof subclasses for the subclass spoof tests since we modify them +# They cannot be defined locally in the test, or pickling will fail. +class TestEmptySubclassSpoof(TestEmptySubclass): + ... + +class TestWrapperSubclassSpoof(TestWrapperSubclass): + ... + +class RebuildFromTypeV2Spoof(torch.Tensor): + def __new__(cls, elem, naughty, **kwargs): + if naughty: + raise RuntimeError("naughty") + return super().__new__(cls, elem) + + def __reduce_ex__(self, protocol): + return (torch._tensor._rebuild_from_type_v2, (RebuildFromTypeV2Spoof, torch.Tensor, (True,), {})) + class TestSubclassSerialization(TestCase): def test_tensor_subclass_wrapper_serialization(self): @@ -4173,6 +4207,203 @@ def test_empty_class_serialization(self): f.seek(0) tensor2 = torch.load(f) + def _create_bad_func(self, name): + def bad_func(self, *args, **kwargs): + raise RuntimeError(f"running {name}") + return bad_func + + @parametrize("wrapper", (True, False)) + def test_tensor_subclass_method_spoofing(self, wrapper): + ''' + This test seeks to do the following: + - determine which methods of a tensor subclass might be called during unpickling (weights_only=False) + we consider these methods "risky" for weights_only + - ensure that we ban overriding this group of methods on a tensor subclass by default (weights_only=True) + - ensure that a tensor subclass that doesn't override any of these can be unpickled (weights_only=True) + + We achieve this by overriding all methods of a tensor subclass to raise a RuntimeError + when called. We then try to unpickle a tensor subclass with weights_only=False and ensure that + only the RuntimeErrors that we expect are thrown. + + We then load with weights_only=True and ensure that the load fails while any risky method is still + overridden, resetting the risky methods to their non-overridden versions one at a time in a loop and calling load. + The final weights_only load call succeeds once all the risky methods are no longer overridden.
+ ''' + subclass = TestWrapperSubclassSpoof if wrapper else TestEmptySubclassSpoof + t = subclass(torch.randn(2, 3)) + # To trigger setattr for the non-wrapper case + if not wrapper: + t.foo = 'bar' + inp = {'weight': t} + + with TemporaryFileName() as f: + torch.save(inp, f) + loaded = torch.load(f, weights_only=True) + self.assertEqual(loaded['weight'], inp['weight']) + + restore_methods = dict() + methods = [func for func in dir(subclass) if callable(getattr(subclass, func))] + for method in methods: + if method != "__class__": + restore_methods[method] = getattr(subclass, method) + setattr(subclass, method, self._create_bad_func(method)) + # These additional methods might be called during getattr or setattr + # but are not in methods above (not defined on tensor base class) + subclass.__get__ = self._create_bad_func("__get__") + subclass.__set__ = self._create_bad_func("__set__") + subclass.__getattr__ = self._create_bad_func("__getattr__") + restore_methods["__get__"] = None + restore_methods["__getattr__"] = None + restore_methods["__set__"] = None + + try: + # Check that weights_only=False load raises the RuntimeErrors we expect + with self.assertRaisesRegex(RuntimeError, "running __getattribute__"): + torch.load(f, weights_only=False) + subclass.__getattribute__ = restore_methods['__getattribute__'] + with self.assertRaisesRegex(RuntimeError, "running __setstate__"): + torch.load(f, weights_only=False) + subclass.__setstate__ = restore_methods['__setstate__'] + with self.assertRaisesRegex(RuntimeError, "running __setattr__"): + torch.load(f, weights_only=False) + subclass.__setattr__ = restore_methods['__setattr__'] + # should finally work + torch.load(f, weights_only=False) + + # Check that weights_only=True catches that risky methods are overridden + subclass.__setstate__ = self._create_bad_func("__setstate__") + subclass.__getattribute__ = self._create_bad_func("__getattribute__") + subclass.__setattr__ = self._create_bad_func("__setattr__") + with self.assertRaisesRegex(pickle.UnpicklingError, + "methods: __getattribute__=True __getattr__=True __get__=True " + "__setattr__=True __set__=True __setstate__=True"): + torch.load(f, weights_only=True) + risky_methods = ['__get__', '__set__', '__getattr__', '__setattr__', '__getattribute__', '__setstate__'] + for i, meth in enumerate(risky_methods): + setattr(subclass, meth, restore_methods[meth]) + if i != len(risky_methods) - 1: + # When the given methods are not all back to default, load should still throw + # but reflect which methods are no longer overridden + with self.assertRaisesRegex(pickle.UnpicklingError, f"{meth}=False"): + torch.load(f, weights_only=True) + else: + # When the given methods are all back to default, weights_only load should finally work + loaded = torch.load(f, weights_only=True) + finally: + for method, func in restore_methods.items(): + setattr(subclass, method, func) + a = subclass(torch.randn(2, 3)) + + @skipIfTorchDynamo("name 'SYNTHETIC_LOCAL' is not defined") + def test_safe_globals_for_weights_only(self): + ''' + Tests import semantics for tensor subclasses and the {add/get/clear}_safe_globals APIs + ''' + # Needed to prevent UnboundLocalError: local variable 'TwoTensor' referenced before assignment + global TwoTensor + t = TwoTensor(torch.randn(2, 3), torch.randn(2, 3)) + p = torch.nn.Parameter(t) + sd = OrderedDict([('t', t), ('p', p)]) + + with tempfile.NamedTemporaryFile() as f: + torch.save(sd, f) + # unimport TwoTensor + try: + del sys.modules['torch.testing._internal.two_tensor'] + + # Loading
tensor subclass with weights_only=True should fail + # if tensor subclass has not been imported + with self.assertRaisesRegex(pickle.UnpicklingError, + "expect `torch.testing._internal.two_tensor` to be present in `sys.modules`"): + f.seek(0) + sd = torch.load(f, weights_only=True) + + # Loading tensor subclass with weights_only=True should work + # if target methods are not overridden and user has imported the subclass + from torch.testing._internal.two_tensor import TwoTensor + f.seek(0) + sd = torch.load(f, weights_only=True) + self.assertEqual(sd['t'], t) + self.assertEqual(sd['p'], p) + + # Loading tensor subclass with weights_only=True should fail + # if __setstate__ is overridden + f.seek(0) + restore_setstate = TwoTensor.__setstate__ + try: + TwoTensor.__setstate__ = lambda self, state: self.__dict__.update(state) + with self.assertRaisesRegex(pickle.UnpicklingError, "__setstate__=True"): + torch.load(f, weights_only=True) + + # Loading tensor subclass with overridden __setstate__ with weights_only=True should work + # if the class is marked safe + f.seek(0) + torch.serialization.add_safe_globals([TwoTensor]) + self.assertTrue(torch.serialization.get_safe_globals() == [TwoTensor]) + sd = torch.load(f, weights_only=True) + self.assertEqual(sd['t'], t) + self.assertEqual(sd['p'], p) + + # Should fail again when safe globals are cleared + torch.serialization.clear_safe_globals() + f.seek(0) + with self.assertRaisesRegex(pickle.UnpicklingError, "__setstate__=True"): + torch.load(f, weights_only=True) + finally: + TwoTensor.__setstate__ = restore_setstate + finally: + from torch.testing._internal.two_tensor import TwoTensor + + + def test_tensor_subclass_parent_module_method_spoofing(self): + ''' + Tests that weights_only load does not call any methods of the parent module + that contains the tensor subclass. + + We achieve this by overriding all methods of a module we add to sys.modules to raise a RuntimeError + when called. We then try to unpickle a tensor subclass with weights_only=True and ensure that + no RuntimeErrors are thrown.
+ ''' + # Simulates user doing `import spoof_mod` where `spoof_mod` contains `TestEmptySubclass` + class SpoofModule(ModuleType): + pass + + spoof_mod = SpoofModule('bla') + spoof_mod.TestEmptySubclass = TestEmptySubclass + inp = {'weight': TestEmptySubclass(torch.randn(2, 3))} + TestEmptySubclass.__module__ = 'spoof_mod' + sys.modules['spoof_mod'] = spoof_mod + + try: + with TemporaryFileName() as f: + torch.save(inp, f) + torch.load(f, weights_only=True) + restore_methods = dict() + methods = [func for func in dir(SpoofModule) if callable(getattr(SpoofModule, func))] + for method in methods: + if method != "__class__": + restore_methods[method] = getattr(SpoofModule, method) + setattr(SpoofModule, method, self._create_bad_func(method)) + SpoofModule.__get__ = self._create_bad_func("__get__") + SpoofModule.__getattr__ = self._create_bad_func("__getattr__") + loaded = torch.load(f, weights_only=True) + self.assertEqual(loaded['weight'], inp['weight']) + finally: + TestEmptySubclass.__module__ = __name__ + del sys.modules['spoof_mod'] + + def test_rebuild_from_type_v2_spoof(self): + t = RebuildFromTypeV2Spoof(torch.randn(2, 3), False) + inp = {'weight': t} + + with TemporaryFileName() as f: + torch.save(inp, f) + # subclass will be pushed onto unpickler's stack as a string + # and only gets converted to the type if it is argument 1 to _rebuild_from_type_v2 + with self.assertRaisesRegex(TypeError, "'str' object is not callable"): + loaded = torch.load(f, weights_only=True) + + instantiate_device_type_tests(TestBothSerialization, globals()) instantiate_parametrized_tests(TestSubclassSerialization) diff --git a/test/test_torch.py b/test/test_torch.py index 81da78f9a882..c8cff93bd1bf 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -29,6 +29,8 @@ from functools import partial from torch import multiprocessing as mp from torch.testing import make_tensor +from torch.testing._internal.common_optimizers import ( + optim_db, optims, _get_optim_inputs_including_global_cliquey_kwargs) from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] TEST_WITH_TORCHINDUCTOR, TEST_WITH_ROCM, run_tests, IS_JETSON, @@ -5877,8 +5879,13 @@ def _run_scaling_case(self, device, run, unskipped, skipped, atol=1e-7, optimize self.assertEqual(c, s, atol=atol, rtol=1e-05) - # Compares no scaling + no autocasting against scaling + autocasting. - def _grad_scaling_autocast_test(self, *, device="cuda", atol=1e-3, optimizer_ctor=torch.optim.SGD, optimizer_kwargs=None): + @onlyNativeDeviceTypes + @parametrize("foreach, fused", [(None, None), (True, None), (None, True)]) + @optims( + [optim for optim in optim_db if optim.optim_cls in [torch.optim.AdamW, torch.optim.Adam, torch.optim.SGD]], + dtypes=[torch.float32] + ) + def test_grad_scaling_autocast(self, device, dtype, optim_info, foreach, fused): try_pickle = False def run(device, data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_api): @@ -5902,6 +5909,9 @@ def run(device, data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_ optimizer.step() return scaler + optimizer_ctor = optim_info.optim_cls + + # Compares no scaling + no autocasting against scaling + autocasting. # NOTE(mkozuki): With the current way of testing, `torch.optim.Adam` is failing regardless of `foreach` and `fused`. # Giving some flexibility to this test might help.
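# [editor's note: illustrative sketch, not part of this patch] The comparison
# this test performs exercises the canonical GradScaler training loop, using
# the same torch.GradScaler / torch.autocast APIs seen elsewhere in this diff;
# roughly:
#
#     scaler = torch.GradScaler(device=device)
#     for input, target in data:
#         optimizer.zero_grad()
#         with torch.autocast(device_type=device, dtype=torch.half):
#             loss = loss_fn(model(input), target)
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)   # step is skipped if unscaled grads contain inf/nan
#         scaler.update()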
context = contextlib.nullcontext @@ -5911,71 +5921,51 @@ def run(device, data, model, optimizer, scaler, loss_fn, skip_iter, try_scaling_ with context(): # sets atol=1e-3 because we're comparing pure fp32 arithmetic vs a mixture of fp16 and fp32 self._run_scaling_case( - device, run, unskipped=3, skipped=1, atol=atol, optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs, + device, run, unskipped=3, skipped=1, atol=1e-3, + optimizer_ctor=optimizer_ctor, optimizer_kwargs={"foreach": foreach, "fused": fused}, ) # this will be picked up by try_pickle within run(): try_pickle = True self._run_scaling_case( - device, run, unskipped=3, skipped=1, atol=atol, optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs, + device, run, unskipped=3, skipped=1, atol=1e-3, + optimizer_ctor=optimizer_ctor, optimizer_kwargs={"foreach": foreach, "fused": fused}, ) - @onlyNativeDeviceTypes - def test_grad_scaling_autocast(self, device): - device = torch.device(device) - for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam, torch.optim.AdamW): - self._grad_scaling_autocast_test(device=device.type, optimizer_ctor=optimizer_ctor) - - @onlyNativeDeviceTypes - def test_grad_scaling_autocast_foreach(self, device): - device = torch.device(device) - for optimizer_ctor in (torch.optim.SGD, torch.optim.Adam, torch.optim.AdamW): - self._grad_scaling_autocast_test(device=device.type, optimizer_ctor=optimizer_ctor, optimizer_kwargs={"foreach": True}) - - @onlyNativeDeviceTypes - def test_grad_scaling_autocast_fused(self, device): - device = torch.device(device) - for optimizer_ctor in (torch.optim.Adam, torch.optim.AdamW): - self._grad_scaling_autocast_test(device=device.type, optimizer_ctor=optimizer_ctor, optimizer_kwargs={"fused": True}) - # Make sure that the parameters become nonsense when scaled gradients are finite # but they get invalidated before `optimizer.step`, after `GradScaler.unscale_` @onlyNativeDeviceTypes - def test_params_invalidated_with_grads_invalidated_between_unscale_and_step(self, device): - device = torch.device(device) - for optimizer_ctor, optimizer_kwargs in product( - (torch.optim.Adam, torch.optim.AdamW), - ( - {"foreach": False, "fused": False}, - {"foreach": True, "fused": False}, - {"foreach": False, "fused": True}, - ), - ): - with self.subTest(optimizer=optimizer_ctor, optimizer_kwargs=optimizer_kwargs): - self._test_grads_invalidated_between_unscale_and_step(device.type, optimizer_ctor, optimizer_kwargs) - - def _test_grads_invalidated_between_unscale_and_step(self, device, optimizer_ctor, optimizer_kwargs): - model, _, optimizer, _, data, loss_fn, _ = _create_scaling_case( - device, optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs, - ) - scaler = torch.GradScaler(device=device, init_scale=128.0) + @optims( + [optim for optim in optim_db if optim.optim_cls in [torch.optim.AdamW, torch.optim.Adam, torch.optim.SGD]], + dtypes=[torch.float32] + ) + def test_params_invalidated_with_grads_invalidated_between_unscale_and_step(self, device, dtype, optim_info): + optimizer_ctor = optim_info.optim_cls + all_optim_inputs = _get_optim_inputs_including_global_cliquey_kwargs( + device, dtype, optim_info, skip=("differentiable",)) + + for optim_input in all_optim_inputs: + model, _, optimizer, _, data, loss_fn, _ = _create_scaling_case( + device, optimizer_ctor=optimizer_ctor, optimizer_kwargs=optim_input.kwargs, + ) + scaler = torch.GradScaler(device=device, init_scale=128.0) - for input, target in data: - optimizer.zero_grad() - with 
torch.autocast(device_type=device, dtype=torch.half): - output = model(input) - loss = loss_fn(output, target) - scaler.scale(loss).backward() - scaler.unscale_(optimizer) + for input, target in data: + optimizer.zero_grad() + with torch.autocast(device_type=device, dtype=torch.half): + output = model(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.unscale_(optimizer) - # deliberately break grads - for j, param in enumerate(model.parameters()): - param.grad.copy_(torch.inf if j % 2 else torch.nan) + # deliberately break grads + for j, param in enumerate(model.parameters()): + param.grad.copy_(torch.inf if j % 2 else torch.nan) - scaler.step(optimizer) - scaler.update() + scaler.step(optimizer) + scaler.update() - self.assertTrue(all((p.isnan().any() or p.isinf().any()) for p in model.parameters())) + self.assertTrue(all((p.isnan().any() or p.isinf().any()) for p in model.parameters())) @onlyNativeDeviceTypes def test_grad_scale_will_not_overflow(self, device): diff --git a/test/test_utils.py b/test/test_utils.py index b151b5141a28..66d66b8874f1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1,37 +1,52 @@ # Owner(s): ["module: unknown"] -import sys import os +import random import re import shutil -import random import subprocess +import sys import tempfile -import traceback import textwrap +import traceback import unittest import warnings -from typing import Any, List, Dict +from typing import Any, Dict, List + import torch +import torch.cuda import torch.nn as nn +import torch.utils.cpp_extension import torch.utils.data -from torch.utils.data import DataLoader +from torch.autograd._functions.utils import check_onnx_broadcast +from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings from torch.testing._internal.common_cuda import TEST_MULTIGPU from torch.testing._internal.common_device_type import ( - ops, - onlyCPU, instantiate_device_type_tests, + onlyCPU, + ops, ) from torch.testing._internal.common_methods_invocations import op_db -import torch.cuda -from torch.utils._pytree import tree_any, tree_all_only -from torch.utils.checkpoint import checkpoint, checkpoint_sequential, get_device_states, _infer_device_type +from torch.testing._internal.common_utils import ( # type: ignore[attr-defined] + IS_FBCODE, + IS_SANDCASTLE, + IS_WINDOWS, + load_tests, +) from torch.utils._device import set_device -from torch.utils._traceback import report_compile_source_on_error, format_traceback_short, CapturedTraceback -import torch.utils.cpp_extension -from torch.autograd._functions.utils import check_onnx_broadcast -from torch.onnx.symbolic_opset9 import _prepare_onnx_paddings -from torch.testing._internal.common_utils import load_tests, IS_FBCODE, IS_SANDCASTLE, IS_WINDOWS # type: ignore[attr-defined] +from torch.utils._pytree import tree_all_only, tree_any +from torch.utils._traceback import ( + CapturedTraceback, + format_traceback_short, + report_compile_source_on_error, +) +from torch.utils.checkpoint import ( + _infer_device_type, + checkpoint, + checkpoint_sequential, + get_device_states, +) +from torch.utils.data import DataLoader # load_tests from torch.testing._internal.common_utils is used to automatically filter tests for # sharding on sandcastle. 
This line silences flake warnings @@ -40,11 +55,10 @@ HAS_CUDA = torch.cuda.is_available() -from torch.testing._internal.common_utils import TestCase, run_tests +from torch.testing._internal.common_utils import run_tests, TestCase class RandomDatasetMock(torch.utils.data.Dataset): - def __getitem__(self, index): return torch.tensor([torch.rand(1).item(), random.uniform(0, 1)]) @@ -53,7 +67,6 @@ def __len__(self): class TestCheckpoint(TestCase): - # This runs checkpoint_sequential on each of the nets in # module_lists_to_compare, and compares them against the uncheckpointed model. # To compare, it checks outputs as well as input gradients and parameter gradients @@ -101,9 +114,7 @@ def _check_checkpoint_sequential( # Test whether checkpoint is being triggered or not. For this, we check # the number of times forward pass happens def test_checkpoint_trigger(self): - class Net(nn.Module): - def __init__(self): super().__init__() self.counter = 0 @@ -112,7 +123,7 @@ def forward(self, input_var): self.counter += 1 # For reentrant, need to have autograd actually # pack a tensor to trigger recomp - ret = input_var * torch.tensor(2.) + ret = input_var * torch.tensor(2.0) return ret # checkpointed @@ -122,13 +133,15 @@ def forward(self, input_var): for m in modules: self.assertEqual(m.counter, 0) input_var = torch.randn(3, 4, requires_grad=True) - out = checkpoint_sequential(modules, 2, input_var, use_reentrant=use_reentrant) + out = checkpoint_sequential( + modules, 2, input_var, use_reentrant=use_reentrant + ) for m in modules: self.assertEqual(m.counter, 1) out.sum().backward() - for m in modules[:(len(modules) // 2)]: + for m in modules[: (len(modules) // 2)]: self.assertEqual(m.counter, 2) - for m in modules[(len(modules) // 2):]: + for m in modules[(len(modules) // 2) :]: self.assertEqual(m.counter, 1) def test_checkpoint_valid(self): @@ -138,7 +151,7 @@ def test_checkpoint_valid(self): nn.Linear(50, 20), nn.ReLU(), nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) input_var = torch.randn(1, 100, requires_grad=True) @@ -147,20 +160,33 @@ def test_checkpoint_valid(self): chunks = 2 modules = list(model.children()) out = checkpoint_sequential(modules, chunks, input_var, use_reentrant=True) - with self.assertRaisesRegex(RuntimeError, "torch.utils.checkpoint is incompatible"): + with self.assertRaisesRegex( + RuntimeError, "torch.utils.checkpoint is incompatible" + ): torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) # works with use_reentrant=False, and grads are the same out = model(input_var) grads_no_checkpoint = torch.autograd.grad( - outputs=[out], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, + ) + out_checkpoint = checkpoint_sequential( + modules, chunks, input_var, use_reentrant=False ) - out_checkpoint = checkpoint_sequential(modules, chunks, input_var, use_reentrant=False) # check outputs are the same self.assertEqual(out_checkpoint, out) grads_checkpoint = torch.autograd.grad( - outputs=[out_checkpoint], grad_outputs=[torch.ones(1, 5)], inputs=[input_var], create_graph=True, + outputs=[out_checkpoint], + grad_outputs=[torch.ones(1, 5)], + inputs=[input_var], + create_graph=True, ) self.assertEqual(grads_no_checkpoint, grads_checkpoint) @@ -173,7 +199,7 @@ def test_checkpoint(self): nn.Linear(50, 20), nn.ReLU(), 
nn.Linear(20, 5), - nn.ReLU() + nn.ReLU(), ) # Compare uncheckpointed model with its checkpointed counterparts @@ -247,7 +273,7 @@ def forward(self): def test_checkpoint_rng_cpu(self): for _ in range(5): - inp = torch.randn(20000, device='cpu').requires_grad_() + inp = torch.randn(20000, device="cpu").requires_grad_() phase1 = torch.nn.Dropout() phase2 = torch.nn.Dropout() @@ -272,10 +298,10 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_rng_cuda(self): for _ in range(5): - inp = torch.randn(20000, device='cuda').requires_grad_() + inp = torch.randn(20000, device="cuda").requires_grad_() phase1 = torch.nn.Dropout() phase2 = torch.nn.Dropout() @@ -300,9 +326,9 @@ def run_fn(input): self.assertEqual(grad_with_checkpointing, grad_no_checkpointing) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_checkpoint_not_preserve_rng_state_and_without_reentrant(self): - inp = torch.randn(2, device='cuda').requires_grad_() + inp = torch.randn(2, device="cuda").requires_grad_() layer = torch.nn.Dropout() def run_fn(input): @@ -312,9 +338,7 @@ def run_fn(input): out.sum().backward() # This should run without error - def test_checkpoint_non_tensor(self): - def run_fn(tensor1, tensor2): if tensor2 is None: return tensor1 @@ -349,7 +373,9 @@ def foo(t1, t2, scale, t3): res[1].sum().backward(retain_graph=True) res[4].sum().backward(retain_graph=True) res[6].sum().backward() - with self.assertRaisesRegex(RuntimeError, "Trying to backward through the graph a second time"): + with self.assertRaisesRegex( + RuntimeError, "Trying to backward through the graph a second time" + ): res[6].sum().backward() t1_grad = t1.grad t2_grad = t2.grad @@ -387,6 +413,7 @@ def test_checkpoint_partial_grad(self): def run_fn(tensor1, tensor2): # tensor 2 is used for other application logic return tensor1, tensor2 + input_var = torch.randn(1, 4, requires_grad=True) input_var2 = torch.randn(1, 4, requires_grad=False) out = checkpoint(run_fn, input_var, input_var2, use_reentrant=True) @@ -394,11 +421,12 @@ def run_fn(tensor1, tensor2): def run_fn2(tensor1, tensor2): return tensor1 + input_var = torch.randn(1, 4, requires_grad=False) input_var2 = torch.randn(1, 4, requires_grad=True) with self.assertRaisesRegex( RuntimeError, - r"none of output has requires_grad=True, this checkpoint\(\) is not necessary" + r"none of output has requires_grad=True, this checkpoint\(\) is not necessary", ): out = checkpoint(run_fn2, input_var, input_var2, use_reentrant=True) out.sum().backward() @@ -430,13 +458,13 @@ def hook(_unused): def test_fn(x): # The main property of this function is that it contains multiple # operations that save gradients in a chain. 
- x = x ** 2 + x = x**2 track(x, 2) - x = x ** 2 + x = x**2 track(x, 1) - x = x ** 2 + x = x**2 track(x, 0) - x = x ** 2 + x = x**2 return x.sum() fn(test_fn) @@ -450,20 +478,32 @@ def test_fn(x): non_retain_stats = _do_test(lambda fn: fn(x).backward(), True) # In a retain_grad backward, buffers get preserved - _unused_retain_stats = _do_test(lambda fn: fn(x).backward(retain_graph=True), False) + _unused_retain_stats = _do_test( + lambda fn: fn(x).backward(retain_graph=True), False + ) # In a regular backward with checkpoint, buffers get eagerly freed - checkpoint_non_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True) + checkpoint_non_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward(), True + ) # In a retain_grad backward with checkpoint, buffers get eagerly freed - checkpoint_retain_stats = _do_test(lambda fn: checkpoint(fn, x, use_reentrant=False).backward(retain_graph=True), True) + checkpoint_retain_stats = _do_test( + lambda fn: checkpoint(fn, x, use_reentrant=False).backward( + retain_graph=True + ), + True, + ) self.assertEqual(non_retain_stats, checkpoint_non_retain_stats) self.assertEqual(non_retain_stats, checkpoint_retain_stats) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") def test_get_device_states_recursive(self): - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } device_ids, device_states = get_device_states(inp) self.assertEqual(2, len(device_ids)) self.assertEqual(2, len(device_states)) @@ -473,7 +513,7 @@ def test_get_device_states_recursive(self): self.assertTrue(isinstance(device_states[1], torch.Tensor)) def test_infer_device_state_recursive_meta(self): - inp = {'foo' : torch.rand(10, device="meta")} + inp = {"foo": torch.rand(10, device="meta")} device_type = _infer_device_type(inp) self.assertEqual("meta", device_type) @@ -481,19 +521,28 @@ def test_infer_device_state_recursive_meta(self): def test_infer_device_state_recursive_multi_cuda(self): # Check that no warning is issued for either cuda:0, cuda:1 or # cuda:0, cuda:0 cases since they are both the same device type - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:1")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:1")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="cuda:0")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="cuda:0")], + } with warnings.catch_warnings(): warnings.simplefilter("error") device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) # Check that a warning is issued for cuda:0, meta and that it includes # device type information - inp = {'foo' : torch.rand(10, device="cuda:0"), 'bar': [torch.rand(10, device="meta")]} + inp = { + "foo": torch.rand(10, device="cuda:0"), + "bar": [torch.rand(10, device="meta")], + } with warnings.catch_warnings(record=True) as w: device_type = _infer_device_type(inp) self.assertEqual("cuda", device_type) @@ -503,7 +552,7 @@ def test_infer_device_state_recursive_multi_cuda(self): "Tensor arguments, excluding CPU tensors, are detected on at least two types of devices" in warning_msg ) - self.assertTrue("Device types: 
[\'cuda\', \'meta\']" in warning_msg) + self.assertTrue("Device types: ['cuda', 'meta']" in warning_msg) self.assertTrue("first device type: cuda" in warning_msg) @@ -517,11 +566,13 @@ def setUp(self): def test_random_seed(self): def run(): - dataloader = torch.utils.data.DataLoader(RandomDatasetMock(), - batch_size=2, - num_workers=4, - shuffle=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader = torch.utils.data.DataLoader( + RandomDatasetMock(), + batch_size=2, + num_workers=4, + shuffle=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) return next(iter(dataloader)) torch.manual_seed(2018) @@ -534,37 +585,47 @@ def test_single_keep(self): # self.dataset is a Tensor here; technically not a valid input because # not a Dataset subclass, but needs to stay working so add ignore's # for type checking with mypy - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=False) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=False, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_single_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=0, - drop_last=True) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=0, + drop_last=True, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) - @unittest.skip("FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN") + @unittest.skip( + "FIXME: Intermittent CUDA out-of-memory error on Windows and time-out under ASAN" + ) def test_multi_keep(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=False, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=False, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 2) def test_multi_drop(self): - dataloader : DataLoader = DataLoader(self.dataset, # type: ignore[arg-type] - batch_size=self.batch_size, - num_workers=2, - drop_last=True, - timeout=self.MAX_TIMEOUT_IN_SECOND) + dataloader: DataLoader = DataLoader( + self.dataset, # type: ignore[arg-type] + batch_size=self.batch_size, + num_workers=2, + drop_last=True, + timeout=self.MAX_TIMEOUT_IN_SECOND, + ) dataiter = iter(dataloader) self.assertEqual(len(list(dataiter)), 1) @@ -572,14 +633,20 @@ def test_multi_drop(self): test_dir = os.path.abspath(os.path.dirname(str(__file__))) -@unittest.skipIf('SKIP_TEST_BOTTLENECK' in os.environ.keys(), 'SKIP_TEST_BOTTLENECK is set') +@unittest.skipIf( + "SKIP_TEST_BOTTLENECK" in os.environ.keys(), "SKIP_TEST_BOTTLENECK is set" +) class TestBottleneck(TestCase): def _run(self, command, timeout=30): """Returns (return-code, stdout, stderr)""" import subprocess - p = subprocess.Popen(command, stdout=subprocess.PIPE, # noqa: P204 - stderr=subprocess.PIPE, shell=True) + p = subprocess.Popen( + command, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True, + ) try: output, err = p.communicate(timeout=timeout) except subprocess.TimeoutExpired: @@ -590,67 +657,108 @@ def _run(self, command, timeout=30): err_str = err.decode("ascii") return (rc, output_str, 
err_str) - def _run_bottleneck(self, test_file, scriptargs=''): + def _run_bottleneck(self, test_file, scriptargs=""): curdir = os.path.dirname(os.path.abspath(__file__)) - filepath = f'{curdir}/{test_file}' - if scriptargs != '': - scriptargs = f' {scriptargs}' + filepath = f"{curdir}/{test_file}" + if scriptargs != "": + scriptargs = f" {scriptargs}" rc, out, err = self._run( - f'{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}') + f"{sys.executable} -m torch.utils.bottleneck {filepath}{scriptargs}" + ) return rc, out, err def _check_run_args(self): # Check that this fails due to missing args - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py') - self.assertEqual(rc, 2, atol=0, rtol=0, msg=self._fail_msg('Missing args should error', out + err)) + rc, out, err = self._run_bottleneck("bottleneck_test/test_args.py") + self.assertEqual( + rc, + 2, + atol=0, + rtol=0, + msg=self._fail_msg("Missing args should error", out + err), + ) # This should succeed - rc, out, err = self._run_bottleneck('bottleneck_test/test_args.py', '--foo foo --bar bar') - self.assertEqual(rc, 0, atol=0, rtol=0, msg=self._fail_msg('Should pass args to script', out + err)) + rc, out, err = self._run_bottleneck( + "bottleneck_test/test_args.py", "--foo foo --bar bar" + ) + self.assertEqual( + rc, + 0, + atol=0, + rtol=0, + msg=self._fail_msg("Should pass args to script", out + err), + ) def _fail_msg(self, msg, output): - return f'{msg}, output was:\n{output}' + return f"{msg}, output was:\n{output}" def _check_environment_summary(self, output): - results = re.search('Environment Summary', output) - self.assertIsNotNone(results, self._fail_msg('Should have Environment Summary', output)) + results = re.search("Environment Summary", output) + self.assertIsNotNone( + results, self._fail_msg("Should have Environment Summary", output) + ) # Up to five lines away from the heading, there should be the version number - results = re.search(r'Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+', output) - self.assertIsNotNone(results, self._fail_msg('Should have PyTorch version', output)) + results = re.search( + r"Environment Summary.*(\n.*){,5}\nPyTorch \d+\.\d+", output + ) + self.assertIsNotNone( + results, self._fail_msg("Should have PyTorch version", output) + ) def _check_cprof_summary(self, output): - results = re.search('cProfile output', output) - self.assertIsNotNone(results, self._fail_msg('Should have cProfile output', output)) + results = re.search("cProfile output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have cProfile output", output) + ) # This assumes that after the cProfile output section we have # the autograd profiler output - results = re.search(r'cProfile output.*(\n.*){6,50}\n.*autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between cProfile and autograd prof out not in [6, 50] lines', output)) + results = re.search( + r"cProfile output.*(\n.*){6,50}\n.*autograd profiler output", output + ) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between cProfile and autograd prof out not in [6, 50] lines", + output, + ), + ) def _check_autograd_summary(self, output): - results = re.search('autograd profiler output', output) - self.assertIsNotNone(results, self._fail_msg('Should have autograd profiler output', output)) + results = re.search("autograd profiler output", output) + self.assertIsNotNone( + results, self._fail_msg("Should have autograd profiler output", output) + ) # This assumes that 
after the autograd profiler output is the end of the # output. - results = re.search(r'autograd profiler output.*(\n.*){6,100}', output) - self.assertIsNotNone(results, self._fail_msg( - 'Distance between autograd prof output and end of output not in [6, 100] lines', output)) + results = re.search(r"autograd profiler output.*(\n.*){6,100}", output) + self.assertIsNotNone( + results, + self._fail_msg( + "Distance between autograd prof output and end of output not in [6, 100] lines", + output, + ), + ) def _check_cuda(self, output): if HAS_CUDA: - results = re.search('CUDA mode', output) - self.assertIsNotNone(results, self._fail_msg('Should tell users CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNotNone( + results, self._fail_msg("Should tell users CUDA", output) + ) else: - results = re.search('CUDA mode', output) - self.assertIsNone(results, self._fail_msg('Should not tell users about CUDA', output)) + results = re.search("CUDA mode", output) + self.assertIsNone( + results, self._fail_msg("Should not tell users about CUDA", output) + ) - @unittest.skipIf(HAS_CUDA, 'CPU-only test') + @unittest.skipIf(HAS_CUDA, "CPU-only test") def test_bottleneck_cpu_only(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -658,10 +766,10 @@ def test_bottleneck_cpu_only(self): self._check_cprof_summary(out) self._check_cuda(out) - @unittest.skipIf(not HAS_CUDA, 'No CUDA') + @unittest.skipIf(not HAS_CUDA, "No CUDA") def test_bottleneck_cuda(self): - rc, out, err = self._run_bottleneck('bottleneck_test/test_cuda.py') - self.assertEqual(rc, 0, msg=f'Run failed with\n{err}') + rc, out, err = self._run_bottleneck("bottleneck_test/test_cuda.py") + self.assertEqual(rc, 0, msg=f"Run failed with\n{err}") self._check_run_args() self._check_environment_summary(out) @@ -677,7 +785,7 @@ def test_bottleneck_cuda(self): class TestCollectEnv(TestCase): def test_smoke(self): info_output = get_pretty_env_info() - self.assertTrue(info_output.count('\n') >= 17) + self.assertTrue(info_output.count("\n") >= 17) class TestONNXUtils(TestCase): @@ -688,7 +796,6 @@ def test_prepare_onnx_paddings(self): self.assertEqual(paddings, [0, 3, 1, 0, 4, 2]) def test_check_onnx_broadcast(self): - def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): broadcast = True fail = False @@ -741,7 +848,6 @@ def try_check_onnx_broadcast(dims1, dims2, expect_broadcast, expect_fail): class TestHipify(TestCase): - def test_import_hipify(self): from torch.utils.hipify import hipify_python # noqa: F401 @@ -774,15 +880,19 @@ def test_quote_escape(self): self.assertEqual(self.trie.quote(orig_chars[i]), quoted_strs[i]) def test_export_trie_to_regex(self): - words_to_add = ["__CUDACC__", "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", "CUDA_ERROR_ARRAY_IS_MAPPED", - "CUDA_ERROR_NOT_MAPPED", "CUDA_ERROR_INVALID_SOURCE"] + words_to_add = [ + "__CUDACC__", + "CUDA_ERROR_CONTEXT_ALREADY_CURRENT", + "CUDA_ERROR_ARRAY_IS_MAPPED", + "CUDA_ERROR_NOT_MAPPED", + "CUDA_ERROR_INVALID_SOURCE", + ] for word in words_to_add: self.trie.add(word) regex = self.trie.export_to_regex() expected_regex = r"(?:CUDA_ERROR_(?:ARRAY_IS_MAPPED|CONTEXT_ALREADY_CURRENT|INVALID_SOURCE|NOT_MAPPED)|__CUDACC__)" self.assertEqual(regex, expected_regex) - def 
test_prefix_words_export_trie_to_regex(self): # test case where some nodes have both children and are also leaf nodes. words_to_add = ["apple", "app", "ban", "banana"] @@ -800,7 +910,6 @@ def test_single_export_trie_to_regex(self): expected_regex = "cudaErrorInvalidMemcpyDirection" self.assertEqual(regex, expected_regex) - def test_char_export_trie_to_regex(self): self.trie.add("a") self.assertEqual(self.trie.export_to_regex(), "a") @@ -811,6 +920,7 @@ def test_special_char_export_trie_to_regex(self): self.trie.add(r"c*") self.assertEqual(self.trie.export_to_regex(), r"c\*") + class TestAssert(TestCase): def test_assert_true(self): # verify assertions work as expected @@ -845,14 +955,16 @@ def test_load_standalone(self): build_dir = tempfile.mkdtemp() try: src_path = os.path.join(build_dir, "main.cpp") - src = textwrap.dedent("""\ + src = textwrap.dedent( + """\ #include #include int main() { auto x = torch::eye(3); std::cout << x << std::endl; } - """) + """ + ) with open(src_path, "w") as f: f.write(src) @@ -866,8 +978,7 @@ def test_load_standalone(self): ext = ".exe" if IS_WINDOWS else "" self.assertEqual( - exec_path, - os.path.join(build_dir, f"standalone_load_test{ext}") + exec_path, os.path.join(build_dir, f"standalone_load_test{ext}") ) for shell in [True, False]: @@ -880,12 +991,14 @@ def test_load_standalone(self): self.assertEqual( # Windows prints "\r\n" for newlines. textwrap.dedent(r.stdout.decode("utf-8")).replace("\r\n", "\n"), - textwrap.dedent("""\ + textwrap.dedent( + """\ 1 0 0 0 1 0 0 0 1 [ CPUFloatType{3,3} ] - """) + """ + ), ) finally: @@ -930,30 +1043,30 @@ def tearDown(self): def test_external_module_register(self): # Built-in module with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('cuda', torch.cuda) + torch._register_device_module("cuda", torch.cuda) # Wrong device type with self.assertRaisesRegex(RuntimeError, "Expected one of cpu"): - torch._register_device_module('dummmy', DummyPrivateUse1Module) + torch._register_device_module("dummmy", DummyPrivateUse1Module) with self.assertRaises(AttributeError): torch.privateuseone.is_available() # type: ignore[attr-defined] - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) torch.privateuseone.is_available() # type: ignore[attr-defined] # No supporting for override with self.assertRaisesRegex(RuntimeError, "The runtime module of"): - torch._register_device_module('privateuseone', DummyPrivateUse1Module) + torch._register_device_module("privateuseone", DummyPrivateUse1Module) def test_external_module_register_with_renamed_backend(self): - torch.utils.rename_privateuse1_backend('foo') + torch.utils.rename_privateuse1_backend("foo") with self.assertRaisesRegex(RuntimeError, "has already been set"): - torch.utils.rename_privateuse1_backend('dummmy') + torch.utils.rename_privateuse1_backend("dummmy") custom_backend_name = torch._C._get_privateuse1_backend_name() - self.assertEqual(custom_backend_name, 'foo') + self.assertEqual(custom_backend_name, "foo") with self.assertRaises(AttributeError): torch.foo.is_available() # type: ignore[attr-defined] @@ -961,65 +1074,69 @@ def test_external_module_register_with_renamed_backend(self): with self.assertRaisesRegex(AssertionError, "Tried to use AMP with the"): with torch.autocast(device_type=custom_backend_name): pass - torch._register_device_module('foo', DummyPrivateUse1Module) + torch._register_device_module("foo", 
DummyPrivateUse1Module) torch.foo.is_available() # type: ignore[attr-defined] with torch.autocast(device_type=custom_backend_name): pass - self.assertEqual(torch._utils._get_device_index('foo:1'), 1) + self.assertEqual(torch._utils._get_device_index("foo:1"), 1) self.assertEqual(torch._utils._get_device_index(torch.device("foo:2")), 2) + class TestRenderUtils(TestCase): def test_basic(self): self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100,)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100,)), dim=0)""", ) self.assertExpectedInline( - torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {'dim': 0}), - '''torch.sum(tensor([...], size=(100, 100)), dim=0)''' + torch._utils.render_call(torch.sum, [torch.randn(100, 100)], {"dim": 0}), + """torch.sum(tensor([...], size=(100, 100)), dim=0)""", ) + class TestDeviceUtils(TestCase): def test_basic(self): - with torch.device('meta') as dev: + with torch.device("meta") as dev: x = torch.empty(3, 3) - self.assertEqual(x.device.type, 'meta') - self.assertEqual(dev, torch.device('meta')) + self.assertEqual(x.device.type, "meta") + self.assertEqual(dev, torch.device("meta")) def test_decorator(self): - @set_device('meta') + @set_device("meta") def f(): return torch.empty(3, 3) - self.assertEqual(f().device.type, 'meta') + + self.assertEqual(f().device.type, "meta") def test_decorator_generator(self): - @set_device('meta') + @set_device("meta") def f(): yield torch.empty(3, 3) yield torch.empty(3, 3) + r1, r2 = list(f()) - self.assertEqual(r1.device.type, 'meta') - self.assertEqual(r2.device.type, 'meta') + self.assertEqual(r1.device.type, "meta") + self.assertEqual(r2.device.type, "meta") def test_nn_module(self): - with torch.device('meta'): + with torch.device("meta"): m = nn.Linear(40, 50) - self.assertEqual(m.weight.device.type, 'meta') + self.assertEqual(m.weight.device.type, "meta") def test_set_default_device(self): try: - torch.set_default_device('meta') + torch.set_default_device("meta") r = torch.empty(2, 2) finally: torch.set_default_device(None) - self.assertEqual(r.device.type, 'meta') + self.assertEqual(r.device.type, "meta") def test_get_default_device(self): - torch.set_default_device('meta') - self.assertEqual(torch.get_default_device().type, 'meta') + torch.set_default_device("meta") + self.assertEqual(torch.get_default_device().type, "meta") torch.set_default_device(None) @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") @@ -1048,7 +1165,7 @@ def test_device_mode_ops(self, device, dtype, op): # very incomplete if tree_any( lambda x: isinstance(x, torch.Tensor), - (sample.input, sample.args, sample.kwargs) + (sample.input, sample.args, sample.kwargs), ): continue # Many OpInfos will explicitly pass in a device. DeviceContext @@ -1057,11 +1174,11 @@ def test_device_mode_ops(self, device, dtype, op): # NB: Can't pass None to sample_inputs, the function can't # handle it. 
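# [editor's note: minimal sketch, not part of this patch] The DeviceContext
# behavior this test relies on is that factory functions pick up the ambient
# device installed by the `torch.device(...)` context manager:
#
#     with torch.device("meta"):
#         t = torch.empty(3, 3)
#     assert t.device.type == "meta"  # no real storage is allocated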
kwargs = sample.kwargs.copy() - kwargs.pop('device', None) - with torch.device('meta'): + kwargs.pop("device", None) + with torch.device("meta"): r = func(sample.input, *sample.args, **kwargs) self.assertTrue( - tree_all_only(torch.Tensor, lambda x: x.device.type == 'meta', r) + tree_all_only(torch.Tensor, lambda x: x.device.type == "meta", r) ) @@ -1070,22 +1187,22 @@ def test_device_mode_ops(self, device, dtype, op): class TestCppExtensionUtils(TestCase): def test_cpp_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('c++')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("c++")) def test_cc_compiler_is_ok(self): - self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform('cc')) + self.assertTrue(torch.utils.cpp_extension.check_compiler_ok_for_platform("cc")) class TestTraceback(TestCase): def test_basic(self): - source = '''\ + source = """\ def f(x): def g(x): raise RuntimeError # HEYA x = x * 3 return g(x) + 1 -''' +""" out: Dict[str, Any] = {} scope = {"__compile_source__": source} @@ -1095,29 +1212,36 @@ def g(x): with report_compile_source_on_error(): out["f"](1) except RuntimeError as e: - self.assertIn("HEYA", ''.join(traceback.format_tb(e.__traceback__))) + self.assertIn("HEYA", "".join(traceback.format_tb(e.__traceback__))) def test_format_traceback_short(self): try: raise RuntimeError except RuntimeError as e: - self.assertRegex(format_traceback_short(e.__traceback__), r'.*test_utils.py:\d+ in test_format_traceback_short') + self.assertRegex( + format_traceback_short(e.__traceback__), + r".*test_utils.py:\d+ in test_format_traceback_short", + ) def test_captured_traceback(self): - self.assertIn('test_captured_traceback', ''.join(CapturedTraceback.extract().format())) + self.assertIn( + "test_captured_traceback", "".join(CapturedTraceback.extract().format()) + ) def test_captured_traceback_format_all(self): - rs = CapturedTraceback.format_all([CapturedTraceback.extract(), CapturedTraceback.extract()]) + rs = CapturedTraceback.format_all( + [CapturedTraceback.extract(), CapturedTraceback.extract()] + ) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) def test_captured_traceback_format_all_cached(self): tb = CapturedTraceback.extract() tb.format() # cached rs = CapturedTraceback.format_all([tb, CapturedTraceback.extract()]) self.assertEqual(len(rs), 2) - self.assertIn('test_captured_traceback_format_all', ''.join(rs[0])) + self.assertIn("test_captured_traceback_format_all", "".join(rs[0])) -if __name__ == '__main__': +if __name__ == "__main__": run_tests() diff --git a/test/test_xpu.py b/test/test_xpu.py index 74cc891a9e62..a3838f1d5a05 100644 --- a/test/test_xpu.py +++ b/test/test_xpu.py @@ -1,6 +1,7 @@ # Owner(s): ["module: intel"] import sys +import tempfile import unittest import torch @@ -270,6 +271,40 @@ def convert_boolean_tensors(x): self.assertEqual(expect, actual) + def test_serialization_array_with_storage(self): + x = torch.randn(5, 5).xpu() + y = torch.zeros(2, 5, dtype=torch.int, device="xpu") + q = [x, y, x, y.storage()] + with tempfile.NamedTemporaryFile() as f: + torch.save(q, f) + f.seek(0) + q_copy = torch.load(f) + self.assertEqual(q_copy, q, atol=0, rtol=0) + q_copy[0].fill_(5) + self.assertEqual(q_copy[0], q_copy[2], atol=0, rtol=0) + self.assertEqual(q_copy[0].dtype, torch.float) + self.assertEqual(q_copy[1].dtype, torch.int) + 
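# [editor's note: illustrative sketch, not part of this patch] The aliasing
# asserted in this XPU test mirrors ordinary CPU serialization semantics:
# tensors saved together keep sharing storage after a round trip, e.g.
#
#     import io, torch
#     x = torch.randn(5, 5)
#     buf = io.BytesIO()
#     torch.save([x, x], buf)
#     buf.seek(0)
#     a, b = torch.load(buf)
#     a.fill_(5)
#     assert torch.equal(a, b)  # both entries still view one shared storage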
self.assertEqual(q_copy[2].dtype, torch.float) + self.assertTrue(isinstance(q_copy[3], torch.storage.TypedStorage)) + self.assertTrue(isinstance(q_copy[3]._untyped_storage, torch.UntypedStorage)) + q_copy[1].fill_(10) + y.fill_(10) + self.assertEqual(q_copy[3], y.storage()) + + def test_serialization_array_with_empty(self): + x = [ + torch.randn(4, 4).xpu(), + torch.tensor([], dtype=torch.float, device=torch.device("xpu")), + ] + with tempfile.NamedTemporaryFile() as f: + torch.save(x, f) + f.seek(0) + x_copy = torch.load(f) + for original, copy in zip(x, x_copy): + self.assertEqual(copy, original) + self.assertIs(type(copy), type(original)) + self.assertEqual(copy.get_device(), original.get_device()) + instantiate_device_type_tests(TestXpu, globals(), only_for="xpu") diff --git a/test/torch_np/numpy_tests/core/test_indexing.py b/test/torch_np/numpy_tests/core/test_indexing.py index f0d9023ddbff..77844e77a6e0 100644 --- a/test/torch_np/numpy_tests/core/test_indexing.py +++ b/test/torch_np/numpy_tests/core/test_indexing.py @@ -740,7 +740,7 @@ def _get_multi_index(self, arr, indices): try: indx = np.array(indx, dtype=np.intp) except ValueError: - raise IndexError + raise IndexError from None in_indices[i] = indx elif indx.dtype.kind != "b" and indx.dtype.kind != "i": raise IndexError( @@ -902,7 +902,7 @@ def _get_multi_index(self, arr, indices): arr = arr.reshape(arr.shape[:ax] + mi.shape + arr.shape[ax + 1 :]) except ValueError: # too many dimensions, probably - raise IndexError + raise IndexError from None ax += mi.ndim continue diff --git a/test/torch_np/numpy_tests/core/test_multiarray.py b/test/torch_np/numpy_tests/core/test_multiarray.py index 38e2df73d5b8..bf9aab8ebcee 100644 --- a/test/torch_np/numpy_tests/core/test_multiarray.py +++ b/test/torch_np/numpy_tests/core/test_multiarray.py @@ -409,7 +409,7 @@ def make_array(size, offset, strides): try: r = np.ndarray([size], dtype=int, buffer=x, offset=offset * x.itemsize) except Exception as e: - raise RuntimeError(e) # noqa: TRY200 + raise RuntimeError(e) # noqa: B904 r.strides = strides = strides * x.itemsize return r @@ -6304,7 +6304,7 @@ def test_flat_element_deletion(self): except TypeError: pass except Exception: - raise AssertionError + raise AssertionError from None class TestConversion(TestCase): diff --git a/test/torch_np/numpy_tests/core/test_scalar_methods.py b/test/torch_np/numpy_tests/core/test_scalar_methods.py index addc550ed337..2e763c6636a8 100644 --- a/test/torch_np/numpy_tests/core/test_scalar_methods.py +++ b/test/torch_np/numpy_tests/core/test_scalar_methods.py @@ -132,7 +132,7 @@ def test_roundtrip(self, ftype, frac_vals, exp_vals): df = np.longdouble(d) except (OverflowError, RuntimeWarning): # the values may not fit in any float type - raise SkipTest("longdouble too small on this platform") # noqa: TRY200 + raise SkipTest("longdouble too small on this platform") # noqa: B904 assert_equal(nf / df, f, f"{n}/{d}") diff --git a/test/torch_np/numpy_tests/lib/test_function_base.py b/test/torch_np/numpy_tests/lib/test_function_base.py index fa1168840635..d0eda87b0108 100644 --- a/test/torch_np/numpy_tests/lib/test_function_base.py +++ b/test/torch_np/numpy_tests/lib/test_function_base.py @@ -1435,7 +1435,7 @@ def test_keywords_no_func_code(self): try: vectorize(random.randrange) # Should succeed except Exception: - raise AssertionError # noqa: TRY200 + raise AssertionError # noqa: B904 def test_keywords2_ticket_2100(self): # Test kwarg support: enhancement ticket 2100 diff --git 
a/test/torch_np/numpy_tests/linalg/test_linalg.py b/test/torch_np/numpy_tests/linalg/test_linalg.py index 616c7b95f5c9..3a5c21745e24 100644 --- a/test/torch_np/numpy_tests/linalg/test_linalg.py +++ b/test/torch_np/numpy_tests/linalg/test_linalg.py @@ -1958,7 +1958,7 @@ def test_xerbla_override(self): pid = os.fork() except (OSError, AttributeError): # fork failed, or not running on POSIX - raise SkipTest("Not POSIX or fork failed.") # noqa: TRY200 + raise SkipTest("Not POSIX or fork failed.") # noqa: B904 if pid == 0: # child; close i/o file handles diff --git a/third_party/cpp-httplib b/third_party/cpp-httplib new file mode 160000 index 000000000000..3b6597bba913 --- /dev/null +++ b/third_party/cpp-httplib @@ -0,0 +1 @@ +Subproject commit 3b6597bba913d51161383657829b7e644e59c006 diff --git a/third_party/cub b/third_party/cub deleted file mode 160000 index d106ddb991a5..000000000000 --- a/third_party/cub +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d106ddb991a56c3df1b6d51b2409e36ba8181ce4 diff --git a/third_party/ideep b/third_party/ideep index 8a6cc4e09dc5..55ca0191687a 160000 --- a/third_party/ideep +++ b/third_party/ideep @@ -1 +1 @@ -Subproject commit 8a6cc4e09dc509f04f83c085e38786b1fb44e14d +Subproject commit 55ca0191687aaf19aca5cdb7881c791e3bea442b diff --git a/third_party/kineto b/third_party/kineto index 3a81076cc970..327ac5052cf2 160000 --- a/third_party/kineto +++ b/third_party/kineto @@ -1 +1 @@ -Subproject commit 3a81076cc97092666f319846f32f36b73ce2293e +Subproject commit 327ac5052cf25238fc769ab421c680d19b848eb3 diff --git a/third_party/mkl-dnn.BUILD b/third_party/mkl-dnn.BUILD index dac4f9e3e8cf..9a688a52b1cf 100644 --- a/third_party/mkl-dnn.BUILD +++ b/third_party/mkl-dnn.BUILD @@ -63,9 +63,9 @@ template_rule( out = "include/oneapi/dnnl/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "3", - "@DNNL_VERSION_MINOR@": "3", - "@DNNL_VERSION_PATCH@": "6", - "@DNNL_VERSION_HASH@": "86e6af5974177e513fd3fee58425e1063e7f1361", + "@DNNL_VERSION_MINOR@": "4", + "@DNNL_VERSION_PATCH@": "2", + "@DNNL_VERSION_HASH@": "1137e04ec0b5251ca2b4400a4fd3c667ce843d67", }, ) diff --git a/third_party/zstd b/third_party/zstd deleted file mode 160000 index aec56a52fbab..000000000000 --- a/third_party/zstd +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aec56a52fbab207fc639a1937d1e708a282edca8 diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml index 776125e84a7f..1fb92aa083ae 100644 --- a/tools/autograd/derivatives.yaml +++ b/tools/autograd/derivatives.yaml @@ -2816,9 +2816,9 @@ output_differentiability: [True, False] query, key, value: _scaled_dot_product_flash_attention_for_cpu_backward(grad, query, key, value, output, logsumexp, dropout_p, is_causal, attn_mask, scale) -- name: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) +- name: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None, SymInt? window_size_left=None, SymInt? 
window_size_right=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) output_differentiability: [True, False, False, False, False] - query, key, value: _flash_attention_backward_symint(grad, query, key, value, output, softmax_logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale) + query, key, value: _flash_attention_backward_symint(grad, query, key, value, output, softmax_logsumexp, cum_seq_q, cum_seq_k, max_q, max_k, dropout_p, is_causal, philox_seed, philox_offset, scale, window_size_left, window_size_right) - name: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None, int? window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) output_differentiability: [True, False, False, False, False, False] diff --git a/tools/pyi/gen_pyi.py b/tools/pyi/gen_pyi.py index d5f6837cba01..a7eb81341eb5 100644 --- a/tools/pyi/gen_pyi.py +++ b/tools/pyi/gen_pyi.py @@ -1130,6 +1130,18 @@ def replace_special_case(hint: str) -> str: ) ) ], + "xpu": [ + "def xpu({}) -> Tensor: ...".format( + ", ".join( + [ + "self", + "device: Optional[Union[_device, _int, str]] = None", + "non_blocking: _bool = False", + "memory_format: torch.memory_format = torch.preserve_format", + ] + ) + ) + ], "cpu": [ "def cpu(self, memory_format: torch.memory_format = torch.preserve_format) -> Tensor: ..." ], diff --git a/tools/stats/upload_test_stats_intermediate.py b/tools/stats/upload_test_stats_intermediate.py new file mode 100644 index 000000000000..77cab472367b --- /dev/null +++ b/tools/stats/upload_test_stats_intermediate.py @@ -0,0 +1,29 @@ +import argparse +import sys + +from tools.stats.test_dashboard import upload_additional_info +from tools.stats.upload_test_stats import get_tests + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Upload test stats to Rockset") + parser.add_argument( + "--workflow-run-id", + required=True, + help="id of the workflow to get artifacts from", + ) + parser.add_argument( + "--workflow-run-attempt", + type=int, + required=True, + help="which retry of the workflow this is", + ) + args = parser.parse_args() + + print(f"Workflow id is: {args.workflow_run_id}") + + test_cases = get_tests(args.workflow_run_id, args.workflow_run_attempt) + + # Flush stdout so that any errors in Rockset upload show up last in the logs. 
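The `xpu` stub added to gen_pyi.py above mirrors the existing `cuda` overload. A minimal sketch of the call shapes it advertises (this assumes an XPU-enabled build; the device string is illustrative):

    # Sketch only: exercises the signature generated by the stub above.
    import torch

    t = torch.randn(2, 2)
    t1 = t.xpu()                                     # default XPU device
    t2 = t.xpu("xpu:0", non_blocking=True)           # explicit device, async copy
    t3 = t.xpu(memory_format=torch.preserve_format)  # keep the source layout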
+ sys.stdout.flush() + + upload_additional_info(args.workflow_run_id, args.workflow_run_attempt, test_cases) diff --git a/tools/testing/test_selections.py b/tools/testing/test_selections.py index 311eac59eb28..3e43edd50247 100644 --- a/tools/testing/test_selections.py +++ b/tools/testing/test_selections.py @@ -12,7 +12,7 @@ IS_MEM_LEAK_CHECK = os.getenv("PYTORCH_TEST_CUDA_MEM_LEAK_CHECK", "0") == "1" BUILD_ENVIRONMENT = os.getenv("BUILD_ENVIRONMENT", "") -USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT +USE_3_PROCS = "sm86" in BUILD_ENVIRONMENT or "cuda" not in BUILD_ENVIRONMENT # NUM_PROCS_FOR_SHARDING_CALC must remain consistent across all shards of a job # to ensure that sharding is consistent, NUM_PROCS is the actual number of procs diff --git a/torch/_C/__init__.pyi.in b/torch/_C/__init__.pyi.in index ac70396c468e..0599da2117fb 100644 --- a/torch/_C/__init__.pyi.in +++ b/torch/_C/__init__.pyi.in @@ -1196,6 +1196,7 @@ def _has_storage(x: Tensor) -> _bool: ... def _construct_storage_from_data_pointer(data_ptr: _int, device: torch.device, size: _int) -> Storage: ... def _should_allow_numbers_as_tensors(func_name: str) -> _bool: ... def _group_tensors_by_device_and_dtype(nested_tensorlists: List[List[Optional[Tensor]]], with_indices: _bool = False) -> Dict[Tuple[torch.device, str], Tuple[List[List[Optional[Tensor]]], List[_int]]]: ... +def _check_tp_alloc_is_default(cls: Type) -> _bool: ... # NB: There is no Capsule type in typing, see # https://code.activestate.com/lists/python-dev/139675/ diff --git a/torch/_C/_distributed_c10d.pyi b/torch/_C/_distributed_c10d.pyi index 28d790e3d690..74a73a3ddaa4 100644 --- a/torch/_C/_distributed_c10d.pyi +++ b/torch/_C/_distributed_c10d.pyi @@ -210,6 +210,20 @@ class PrefixStore(Store): @property def underlying_store(self) -> Store: ... +class _ControlCollectives: + def barrier(self, key: str, timeout: timedelta, blocking: bool) -> None: ... + def broadcast_send(self, key: str, data: str, timeout: timedelta) -> None: ... + def broadcast_recv(self, key: str, timeout: timedelta) -> str: ... + def gather_send(self, key: str, data: str, timeout: timedelta) -> None: ... + def gather_recv(self, key: str, timeout: timedelta) -> str: ... + def scatter_send(self, key: str, data: str, timeout: timedelta) -> None: ... + def scatter_recv(self, key: str, timeout: timedelta) -> str: ... + def all_gather(self, key: str, data: str, timeout: timedelta) -> str: ... + def all_sum(self, key: str, data: str, timeout: timedelta) -> int: ... + +class _StoreCollectives(_ControlCollectives): + def __init__(self, store: Store, rank: int, world_size: int) -> None: ... + class _DistributedBackendOptions: def __init__(self): ... @property diff --git a/torch/_custom_ops.py b/torch/_custom_ops.py index c13b0aaf339a..c09a8ae68543 100644 --- a/torch/_custom_ops.py +++ b/torch/_custom_ops.py @@ -250,7 +250,7 @@ def impl_abstract(qualname, *, func=None): """ import torch.library - return torch.library.impl_abstract(qualname, func, _stacklevel=2) + return torch.library.register_fake(qualname, func, _stacklevel=2) def impl_save_for_backward(qualname, *, func=None): diff --git a/torch/_dynamo/__init__.py b/torch/_dynamo/__init__.py index b9c4fbfd7b6e..98496b5fc5de 100644 --- a/torch/_dynamo/__init__.py +++ b/torch/_dynamo/__init__.py @@ -62,7 +62,7 @@ # Wrap manual_seed with the disable decorator. # Can't do it at its implementation due to dependency issues. 
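The `impl_abstract` shim above now forwards to `torch.library.register_fake`. For readers unfamiliar with that API, a minimal sketch of what the forwarded call does (the `mylib::twice` op name is hypothetical and assumes the op was already defined via `torch.library`):

    import torch

    @torch.library.register_fake("mylib::twice")
    def _(x):
        # A fake impl only describes output metadata; it does no real compute.
        return torch.empty_like(x)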
- torch.manual_seed = disable(torch.manual_seed) + torch.manual_seed = torch._disable_dynamo(torch.manual_seed) # Add the new manual_seed to the builtin registry. torch.jit._builtins._register_builtin(torch.manual_seed, "aten::manual_seed") diff --git a/torch/_dynamo/compiled_autograd.py b/torch/_dynamo/compiled_autograd.py index e8e61042d474..f9cf03947a8c 100644 --- a/torch/_dynamo/compiled_autograd.py +++ b/torch/_dynamo/compiled_autograd.py @@ -319,3 +319,12 @@ def disable(): if prior: compiled_autograd_enabled = True torch._C._dynamo.compiled_autograd.set_autograd_compiler(prior) + + +# Return to the starting state of a new process +def reset() -> None: + global compiled_autograd_enabled + compiled_autograd_enabled = False + assert compiled_autograd_enabled_count == 0 + torch._C._dynamo.compiled_autograd.set_autograd_compiler(None) + torch._C._dynamo.compiled_autograd.set_verbose_logging(False) diff --git a/torch/_dynamo/comptime.py b/torch/_dynamo/comptime.py index 7b876258bd48..23000c464fdb 100644 --- a/torch/_dynamo/comptime.py +++ b/torch/_dynamo/comptime.py @@ -126,8 +126,7 @@ def _i_will_not_complain_if_bc_breaks_VariableTracker(self): return self.__variable def __repr__(self): - # TODO: The default repr is pretty bad, do better - return repr(self.__variable) + return self.__variable.debug_repr() # TODO: API for adding a custom guard @@ -188,6 +187,9 @@ def __get_tx(self, stacklevel): tx = tx.parent return tx + def print(self, val, *, file=None): + print(repr(val), file=file) + def print_disas(self, *, file=None, stacklevel=0): """ Print the current series of opcodes being executed (not including @@ -275,9 +277,9 @@ def _i_will_not_complain_if_bc_breaks_InstructionTranslator(self): class _Comptime: @staticmethod - def __call__(fn): - """fn gets called at compile time in TorchDynamo, does nothing otherwise""" - return + def __call__(fn, fallback_fn=lambda: None): + """fn gets called at compile time in TorchDynamo, calls fallback_fn otherwise""" + fallback_fn() # Convenience wrappers that are more compact to use @@ -285,6 +287,10 @@ def __call__(fn): def graph_break(): comptime(lambda ctx: ctx.graph_break()) + @staticmethod + def print(e): + comptime(lambda ctx: ctx.print(ctx.get_local("e")), lambda: print(e)) + @staticmethod def print_graph(): comptime(lambda ctx: ctx.print_graph()) diff --git a/torch/_dynamo/convert_frame.py b/torch/_dynamo/convert_frame.py index 38795341be21..d5c24a67d9e2 100644 --- a/torch/_dynamo/convert_frame.py +++ b/torch/_dynamo/convert_frame.py @@ -312,7 +312,7 @@ def profile_wrapper(*args, **kwargs): retval = prof.runcall(func, *args, **kwargs) profile_latency = time.time() - start_ts prof.disable() - log.info( + log.warning( "### Cprofile for %s trace id [%s] took %.3f seconds ###", func.__name__, trace_id, @@ -322,7 +322,7 @@ def profile_wrapper(*args, **kwargs): try: prof.dump_stats(profile_path) except PermissionError: - log.info("Cannot write to %s", str(profile_path)) + log.warning("Cannot write to %s", str(profile_path)) svg_path = profile_path.with_suffix(".svg") try: gprof2dot_process = subprocess.Popen( @@ -341,9 +341,9 @@ def profile_wrapper(*args, **kwargs): ["dot", "-Tsvg", "-o", str(svg_path)], stdin=gprof2dot_process.stdout, ) - log.info("Generated SVG from profile at %s", str(svg_path)) + log.warning("Generated SVG from profile at %s", str(svg_path)) except FileNotFoundError: - log.info( + log.warning( "Failed to generate SVG from profile -- dumping stats instead."
"Try installing gprof2dot and dot for a better visualization" ) @@ -773,6 +773,22 @@ def format_guard_failures(): "".join(CapturedTraceback.extract(skip=2 + skip).format()), ) # -4: -2 as above, plus trace_structured frames + # + # NB: the frame looks like this: + # + # # handled by skip argument + # torch/_dynamo/convert_frame.py:1069 in catch_errors + # torch/_dynamo/convert_frame.py:910 in _convert_frame + # torch/_dynamo/convert_frame.py:464 in _convert_frame_assert + # torch/_utils_internal.py:70 in wrapper_function + # + # # 2 current frame and context lib + # env/lib/python3.10/contextlib.py:79 in inner + # torch/_dynamo/convert_frame.py:776 in _compile + # + # # 2 extra here + # torch/_logging/_internal.py:1064 in trace_structured + # torch/_dynamo/convert_frame.py:780 in torch._logging.trace_structured( "dynamo_start", lambda: { diff --git a/torch/_dynamo/debug_utils.py b/torch/_dynamo/debug_utils.py index 67dd492fe851..4b4b37a34da9 100644 --- a/torch/_dynamo/debug_utils.py +++ b/torch/_dynamo/debug_utils.py @@ -360,7 +360,7 @@ def same_two_models( fp64_ref = run_fwd_maybe_bwd(fp64_model, fp64_examples, only_fwd) except Exception: if require_fp64: - raise RuntimeError("Could not generate fp64 outputs") # noqa: TRY200 + raise RuntimeError("Could not generate fp64 outputs") # noqa: B904 log.warning("Could not generate fp64 outputs") try: diff --git a/torch/_dynamo/eval_frame.py b/torch/_dynamo/eval_frame.py index bb90d2842145..db35c0f631e8 100644 --- a/torch/_dynamo/eval_frame.py +++ b/torch/_dynamo/eval_frame.py @@ -150,7 +150,10 @@ def __init__(self, mod: torch.nn.Module, dynamo_ctx): def _initialize(self): # Do this stuff in constructor to lower overhead slightly - if isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( + if isinstance(self.dynamo_ctx, DisableContext): + # No need to check trace rules + self.forward = self.dynamo_ctx(self._orig_mod.__call__) + elif isinstance(self._orig_mod.forward, types.MethodType) and trace_rules.check( self._orig_mod.forward ): # This may be a torch.nn.* instance in trace_rules.py which @@ -353,14 +356,9 @@ def get_compiler_config(): # User has wrapped the class with compile/disable decorator. Apply # disable to init/call method. cls_obj = fn - if isinstance(self, DisableContext): - # Disable on init is useful for reconstruction of bytecodes where we - # want to prevent Dynamo from tracing into the init function. Check - # test_reconstruction in test_model_output.py. - cls_obj.__init__ = self(cls_obj.__init__) cls_obj.__call__ = self(cls_obj.__call__) if issubclass(cls_obj, torch.nn.Module): - # NN module variable tracker directly inlines the _call_impl. Disable it. + # NN module variable tracker directly inlines the _call_impl. 
cls_obj._call_impl = self(cls_obj._call_impl) return cls_obj @@ -383,12 +381,8 @@ def get_compiler_config(): callback = self.callback - if isinstance(self, DisableContext): - is_jit_tracing = always_false - is_fx_tracing = always_false - else: - is_jit_tracing = torch._C._is_tracing - is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing + is_jit_tracing = torch._C._is_tracing + is_fx_tracing = torch.fx._symbolic_trace.is_fx_tracing @functools.wraps(fn) def _fn(*args, **kwargs): @@ -424,10 +418,7 @@ def _fn(*args, **kwargs): cleanup() # hooks to properly handle inlining - if isinstance(self, DisableContext): - _fn._torchdynamo_disable = True # type: ignore[attr-defined] - else: - _fn._torchdynamo_inline = fn # type: ignore[attr-defined] + _fn._torchdynamo_inline = fn # type: ignore[attr-defined] # Save the function pointer to find the original callable while nesting # of decorators. @@ -519,6 +510,53 @@ class DisableContext(_TorchDynamoContext): def __init__(self): super().__init__(callback=None) + def __call__(self, fn): + # Earlier this code was in the base class _TorchDynamoContext. But we + # moved it here to have better code organization. For disable, we just + # want the callback to be None. We don't have to check trace_rules or + # create any wrapper. + fn = innermost_fn(fn) + + if isinstance(fn, torch.nn.Module): + mod = fn + new_mod = OptimizedModule(mod, self) + new_mod._torchdynamo_orig_callable = mod.forward + return new_mod + + if inspect.isclass(fn): + # User has wrapped the class with compile/disable decorator. Apply + # disable to init/call method. + cls_obj = fn + # Disable on init is useful for reconstruction of bytecodes where we + # want to prevent Dynamo from tracing into the init function. Check + # test_reconstruction in test_model_output.py. + cls_obj.__init__ = self(cls_obj.__init__) + cls_obj.__call__ = self(cls_obj.__call__) + if issubclass(cls_obj, torch.nn.Module): + # NN module variable tracker directly inlines the _call_impl. Disable it. + cls_obj._call_impl = self(cls_obj._call_impl) + return cls_obj + + assert callable(fn) + + callback = self.callback + + @functools.wraps(fn) + def _fn(*args, **kwargs): + prior = set_eval_frame(callback) + try: + return fn(*args, **kwargs) + finally: + set_eval_frame(prior) + + _fn._torchdynamo_disable = True # type: ignore[attr-defined] + + # Save the function pointer to find the original callable while nesting + # of decorators. 
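For context, `DisableContext` backs the public `torch._dynamo.disable` decorator whose handling is being reorganized here; a small usage sketch:

    import torch

    @torch._dynamo.disable
    def helper(x):
        # Work we never want Dynamo to trace into.
        return x.sum().item()

    @torch.compile
    def f(x):
        return x * helper(x)  # Dynamo graph-breaks around helper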
+ _fn._torchdynamo_orig_callable = fn # type: ignore[attr-defined] + + return _fn + def _optimize_catch_errors( compile_fn, @@ -1349,7 +1387,7 @@ def graph_with_interpreter(*args): )(*example_fake_inputs) except CondOpArgsMismatchError as e: # Wrap the internal error to the user-facing error - raise UserError( # noqa: TRY200 + raise UserError( # noqa: B904 UserErrorType.DYNAMIC_CONTROL_FLOW, str(e), case_name="cond_operands", diff --git a/torch/_dynamo/guards.py b/torch/_dynamo/guards.py index 0e714cb1a542..a8fc77b92c11 100644 --- a/torch/_dynamo/guards.py +++ b/torch/_dynamo/guards.py @@ -1124,6 +1124,26 @@ def HASATTR(self, guard: Guard): else: self._produce_guard_code(guard, [code]) + def NOT_PRESENT_IN_GENERIC_DICT(self, guard: Guard, attr=None) -> None: + assert attr is not None + ref = self.arg_ref(guard) + val = self.get(guard.name) + assert isinstance(val, torch.nn.Module) + + base_manager = self.get_guard_manager(guard) + + mod_dict_source = f"{guard.name}.__dict__" + mod_generic_dict_manager = base_manager.get_generic_dict_manager( + source=mod_dict_source, + example_value=val.__dict__, + guard_manager_enum=GuardManagerType.GUARD_MANAGER, + ) + + code = f"not ___dict_contains({attr!r}, {ref}.__dict__)" + mod_generic_dict_manager.add_dict_contains_guard( + False, attr, get_verbose_code_parts(code, guard) + ) + def TYPE_MATCH(self, guard: Guard) -> None: # ___check_type_id is same as `id(type(x)) == y` t = type(self.get(guard.name)) @@ -1719,8 +1739,12 @@ def TENSOR_MATCH(self, guard: Guard, value=None): # For FSDP modules, we must use TENSOR_MATCH because FSDP module is # traced using UnspecializedNNModuleVariable and therefore lifts the # params as inputs. + # For numpy tensors, always use TENSOR_MATCH because __from_numpy leads + # to a new tensor every time and therefore id differs.
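The new `NOT_PRESENT_IN_GENERIC_DICT` guard leans on a basic attribute-lookup fact: a monkeypatched method lands in the instance `__dict__`, while an unpatched `forward` is found only on the class. A quick demonstration of the property being guarded on:

    import torch.nn as nn

    m = nn.Linear(2, 2)
    assert "forward" not in m.__dict__  # unpatched: resolved on type(m)
    m.forward = lambda x: x + 1         # monkeypatch the instance
    assert "forward" in m.__dict__      # the guard above would now fail, forcing a recompile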
if ( - guard.is_nn_module() and not guard.is_fsdp_module() + guard.is_nn_module() + and not guard.is_fsdp_module() + and not isinstance(guard.originating_source, NumpyTensorSource) ) or match_on_id_for_tensor(guard): self.ID_MATCH(guard) else: diff --git a/torch/_dynamo/mutation_guard.py b/torch/_dynamo/mutation_guard.py index e40fe21f32f4..c4a588888e11 100644 --- a/torch/_dynamo/mutation_guard.py +++ b/torch/_dynamo/mutation_guard.py @@ -103,7 +103,11 @@ def is_dynamic_nn_module(obj, is_export): # 1) Input signature problem because params are lifted as inputs # 2) nn module stack info changes # 3) adjust failing tests - if config.inline_inbuilt_nn_modules and not is_export: + if ( + isinstance(obj, torch.nn.Module) + and config.inline_inbuilt_nn_modules + and not is_export + ): return True dyn = GenerationTracker.dynamic_classes.get(type(obj)) or GenerationTracker.check( obj diff --git a/torch/_dynamo/source.py b/torch/_dynamo/source.py index abf3b75cb2f8..33c464da5bd3 100644 --- a/torch/_dynamo/source.py +++ b/torch/_dynamo/source.py @@ -628,3 +628,7 @@ def is_from_defaults(source: Source): if isinstance(source, ChainedSource): return is_from_defaults(source.base) return False + + +def is_cell_contents(source: Source): + return isinstance(source, AttrSource) and source.member == "cell_contents" diff --git a/torch/_dynamo/symbolic_convert.py b/torch/_dynamo/symbolic_convert.py index d6fb3e2145b7..093809703405 100644 --- a/torch/_dynamo/symbolic_convert.py +++ b/torch/_dynamo/symbolic_convert.py @@ -1504,6 +1504,14 @@ def SET_ADD(self, inst): assert obj.mutable_local return obj.call_method(self, "add", [v], {}) + def SET_UPDATE(self, inst): + v = self.pop() + assert inst.argval > 0 + obj = self.stack[-inst.arg] + assert isinstance(obj, SetVariable) + assert obj.mutable_local + obj.call_method(self, "update", [v], {}) + def LIST_APPEND(self, inst): v = self.pop() assert inst.argval > 0 @@ -2494,7 +2502,7 @@ def inline_call_( sub_locals, closure_cells = func.bind_args(parent, args, kwargs) except TypeError as e: # Wrap the general TypeError during bind_args() to the internal ArgsMismatchError with detailed info - raise ArgsMismatchError( # noqa: TRY200 + raise ArgsMismatchError( # noqa: B904 "{reason}.\n func = {func}, args = {args}, kwargs = {kwargs}".format( reason=str(e), func=f"'{func.get_name()}' {func.get_filename()}:{func.get_code().co_firstlineno}", diff --git a/torch/_dynamo/testing.py b/torch/_dynamo/testing.py index b4c022e8d8c2..9e9abe84228b 100644 --- a/torch/_dynamo/testing.py +++ b/torch/_dynamo/testing.py @@ -349,6 +349,12 @@ def xfailIfPy312(fn): return fn +def skipIfPy312(fn): + if sys.version_info >= (3, 12): + return unittest.skip(fn) + return fn + + # Controls tests generated in test/inductor/test_torchinductor_dynamic_shapes.py # and test/dynamo/test_dynamic_shapes.py def expectedFailureDynamic(fn): diff --git a/torch/_dynamo/trace_rules.py b/torch/_dynamo/trace_rules.py index 6441ac3b0e84..8a2c12ee4e84 100644 --- a/torch/_dynamo/trace_rules.py +++ b/torch/_dynamo/trace_rules.py @@ -373,6 +373,7 @@ "torch._add_relu_", "torch._add_relu", "torch._addmm_activation", + "torch._aminmax", "torch._amp_foreach_non_finite_check_and_unscale_", "torch._amp_update_scale_", "torch._assert_async", diff --git a/torch/_dynamo/utils.py b/torch/_dynamo/utils.py index 9c050d84a5ee..fcfbde1a6a79 100644 --- a/torch/_dynamo/utils.py +++ b/torch/_dynamo/utils.py @@ -1751,7 +1751,7 @@ def get_fake_value(node, tx, allow_non_graph_fake=False): elif isinstance( cause, 
torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode ): - raise UserError( # noqa: TRY200 + raise UserError( # noqa: B904 UserErrorType.CONSTRAINT_VIOLATION, "Tried to use data-dependent value in the subsequent computation. " "This can happen when we encounter unbounded dynamic value that is unknown during tracing time. " @@ -2642,3 +2642,11 @@ def get_locals_to_steal(maybe_gm): def set_locals_to_steal(gm, locals_to_steal): gm.meta["locals_to_steal"] = locals_to_steal + + +class Lit: + def __init__(self, s): + self.s = s + + def __repr__(self): + return self.s diff --git a/torch/_dynamo/variables/base.py b/torch/_dynamo/variables/base.py index 80f4995a9fc0..1b2c05649f1b 100644 --- a/torch/_dynamo/variables/base.py +++ b/torch/_dynamo/variables/base.py @@ -172,6 +172,13 @@ def visit( def __repr__(self): return f"{self.__class__.__name__}()" + def debug_repr(self): + # Intended to be overridden to provide more info + try: + return repr(self.as_python_constant()) + except NotImplementedError: + return repr(self) + def python_type(self): """ Abstract method to be implemented by subclasses of VariableTracker. diff --git a/torch/_dynamo/variables/builder.py b/torch/_dynamo/variables/builder.py index cc01ee150113..41b9fbd836ae 100644 --- a/torch/_dynamo/variables/builder.py +++ b/torch/_dynamo/variables/builder.py @@ -58,6 +58,7 @@ FloatTensorSource, GetItemSource, GradSource, + is_cell_contents, is_constant_source, is_from_defaults, is_from_optimizer_source, @@ -1166,6 +1167,7 @@ def wrap_literal(self, value): # NN modules on the fly) or self.source.guard_source().is_nn_module() or is_from_defaults(self.source) + or is_cell_contents(self.source) ): self.install_guards(GuardBuilder.CONSTANT_MATCH) return ConstantVariable.create(value=value, source=self.source) @@ -2377,6 +2379,8 @@ def create(tx, value) -> VariableTracker: return PlacementVariable(value) elif DeviceMeshVariable.is_device_mesh(value): return DeviceMeshVariable(value) + elif isinstance(value, re.Pattern): + return RegexPatternVariable(value) unimplemented( f"Unexpected type in sourceless builder {value_type.__module__}.{value_type.__qualname__}" ) @@ -2397,6 +2401,7 @@ def make_type_handlers(): ) handlers[dict] = lambda tx, value: ConstDictVariable( {create(tx, k): create(tx, v) for k, v in value.items()}, + type(value), mutable_local=MutableLocal(), ) handlers[list] = lambda tx, value: ListVariable( @@ -2408,6 +2413,7 @@ def make_type_handlers(): handlers[torch.Size] = lambda tx, value: SizeVariable( [create(tx, x) for x in value] ) + handlers[collections.OrderedDict] = handlers[dict] handlers[immutable_dict] = handlers[dict] handlers[immutable_list] = handlers[list] handlers[types.ModuleType] = lambda tx, value: PythonModuleVariable(value) diff --git a/torch/_dynamo/variables/builtin.py b/torch/_dynamo/variables/builtin.py index cbe6079ab907..118e5a4addf9 100644 --- a/torch/_dynamo/variables/builtin.py +++ b/torch/_dynamo/variables/builtin.py @@ -787,27 +787,29 @@ def call_self_handler(tx, args, kwargs): def constant_fold_handler(tx, args, kwargs): # fast path - return builder( - tx, - fn( + try: + res = fn( *[x.as_python_constant() for x in args], - ), - ) + ) + except Exception as exc: + unimplemented(f"constant fold exception: {repr(exc)}") + return builder(tx, res) else: def constant_fold_handler(tx, args, kwargs): # path with a runtime check if check_unspec_or_constant_args(args, kwargs): - return builder( - tx, - fn( + try: + res = fn( *[x.as_python_constant() for x in args], **{ k: v.as_python_constant() for 
k, v in kwargs.items() }, - ), - ) + ) + except Exception as exc: + unimplemented(f"constant fold exception: {repr(exc)}") + return builder(tx, res) handlers.append(constant_fold_handler) @@ -1400,10 +1402,15 @@ def check_type(ty): def call_issubclass(self, tx, left_ty, right_ty): """Checks if first arg is subclass of right arg""" - left_ty = left_ty.as_python_constant() - right_ty = right_ty.as_python_constant() + try: + left_ty_py = left_ty.as_python_constant() + right_ty_py = right_ty.as_python_constant() + except NotImplementedError: + unimplemented( + f"call_issubclass args not constant left_ty: {left_ty}, right_ty: {right_ty}" + ) - return variables.ConstantVariable(issubclass(left_ty, right_ty)) + return variables.ConstantVariable(issubclass(left_ty_py, right_ty_py)) def call_super(self, tx, a, b): return variables.SuperVariable(a, b) @@ -1570,7 +1577,7 @@ def call_getattr( ) and trace_rules.is_aten_op_or_tensor_method(member): return TorchInGraphFunctionVariable(member, **options) elif isinstance(obj, (PythonModuleVariable, DummyModule)): - if obj.is_torch: + if obj.is_torch or name not in obj.value.__dict__: member = getattr(obj.value, name) else: member = obj.value.__dict__[name] diff --git a/torch/_dynamo/variables/dicts.py b/torch/_dynamo/variables/dicts.py index 723cf6ac77ef..0724a80621f7 100644 --- a/torch/_dynamo/variables/dicts.py +++ b/torch/_dynamo/variables/dicts.py @@ -150,6 +150,15 @@ def make_hashable(key): def as_proxy(self): return {k.vt.as_proxy(): v.as_proxy() for k, v in self.items.items()} + def debug_repr(self): + return ( + "{" + + ", ".join( + f"{k.vt.debug_repr()}: {v.debug_repr()}" for k, v in self.items.items() + ) + + "}" + ) + def as_python_constant(self): return { k.vt.as_python_constant(): v.as_python_constant() @@ -195,7 +204,7 @@ def reconstruct(self, codegen): def getitem_const(self, arg: VariableTracker): key = ConstDictVariable._HashableTracker(arg) if key not in self.items: - raise KeyError(arg.value) + unimplemented(f"dict KeyError: {arg.value}") return self.items[key] def call_method( @@ -315,6 +324,11 @@ def is_python_constant(self): return False return super().is_python_constant() + def debug_repr(self): + return ( + f"defaultdict({self.default_factory.debug_repr()}, {super().debug_repr()})" + ) + @staticmethod def is_supported_arg(arg): if isinstance(arg, variables.BuiltinVariable): @@ -358,6 +372,12 @@ def __init__( items = dict.fromkeys(items, SetVariable._default_value()) super().__init__(items, **kwargs) + def debug_repr(self): + if not self.items: + return "set()" + else: + return "{" + ",".join(k.vt.debug_repr() for k in self.items.keys()) + "}" + @property def set_items(self): return set(self.items.keys()) @@ -387,6 +407,8 @@ def call_method( args: List[VariableTracker], kwargs: Dict[str, VariableTracker], ) -> "VariableTracker": + from . 
import ListVariable, TupleVariable + + # We forward the calls to the dictionary model if name == "add": assert not kwargs @@ -406,6 +428,24 @@ return variables.UserFunctionVariable( polyfill.set_isdisjoint ).call_function(tx, [self, args[0]], {}) + elif ( + name == "update" + and len(args) == 1 + and isinstance( + args[0], + ( + SetVariable, + ListVariable, + TupleVariable, + ), + ) + and self.mutable_local + ): + if isinstance(args[0], (ListVariable, TupleVariable)): + arg = SetVariable(args[0].unpack_var_sequence(tx)) + else: + arg = args[0] + return super().call_method(tx, "update", (arg,), kwargs) return super().call_method(tx, name, args, kwargs) def getitem_const(self, arg: VariableTracker): diff --git a/torch/_dynamo/variables/lists.py b/torch/_dynamo/variables/lists.py index d51b4daff347..e0fe96dfa336 100644 --- a/torch/_dynamo/variables/lists.py +++ b/torch/_dynamo/variables/lists.py @@ -21,6 +21,7 @@ is_namedtuple, istype, iter_contains, + Lit, namedtuple_fields, odict_values, set_example_value, @@ -71,6 +72,9 @@ def modified(self, items, **kwargs): def value(self): return self.as_python_constant() + def debug_repr_helper(self, prefix, suffix): + return prefix + ", ".join(i.debug_repr() for i in self.items) + suffix + def as_python_constant(self): return self.python_type()([x.as_python_constant() for x in self.items]) @@ -166,6 +170,9 @@ def __init__(self, items, **kwargs): assert stop is not None super().__init__([start, stop, step], **kwargs) + def debug_repr(self): + return self.debug_repr_helper("range(", ")") + def python_type(self): return range @@ -274,6 +281,9 @@ def python_type(self): def __repr__(self): return f"{self.__class__.__name__}(length={len(self.items)})" + def debug_repr(self): + return self.debug_repr_helper("[", "]") + def reconstruct(self, codegen): codegen.foreach(self.items) codegen.append_output(create_instruction("BUILD_LIST", arg=len(self.items))) @@ -316,6 +326,9 @@ class DequeVariable(CommonListMethodsVariable): def python_type(self): return collections.deque + def debug_repr(self): + return self.debug_repr_helper("deque([", "])") + def reconstruct(self, codegen): assert "deque" not in codegen.tx.f_globals codegen.append_output( @@ -374,6 +387,9 @@ class TupleVariable(BaseListVariable): def python_type(self): return tuple + def debug_repr(self): + return self.debug_repr_helper("(", ")") + def reconstruct(self, codegen): codegen.foreach(self.items) codegen.append_output(create_instruction("BUILD_TUPLE", arg=len(self.items))) @@ -410,6 +426,9 @@ def __init__( self.proxy = proxy super().__init__(items, **kwargs) + def debug_repr(self): + return self.debug_repr_helper("torch.Size([", "])") + def python_type(self): return torch.Size @@ -529,6 +548,9 @@ def get_item_dyn(self, tx, arg: VariableTracker): assert isinstance(index, (int, torch.SymInt)) return self.items[index] + def call_hasattr(self, tx, name: str) -> "VariableTracker": + return variables.ConstantVariable.create(hasattr(torch.Size, name)) + class NamedTupleVariable(TupleVariable): _nonvar_fields = { @@ -540,6 +562,9 @@ def __init__(self, items, tuple_cls, **kwargs): super().__init__(items, **kwargs) self.tuple_cls = tuple_cls + def debug_repr(self): + return repr(self.tuple_cls(*(Lit(x.debug_repr()) for x in self.items))) + def python_type(self): return self.tuple_cls @@ -610,6 +635,9 @@ def __init__(self, items, **kwargs): super().__init__([start, stop, step], **kwargs) + def debug_repr(self): + return self.debug_repr_helper("slice(", ")") + def as_proxy(self): return
slice(*self._as_proxy()) @@ -750,6 +778,15 @@ def __init__(self, items, *, user_cls: type, user_cls_source: Source, **kwargs): assert istype(user_cls, type) assert isinstance(user_cls_source, Source) + def debug_repr(self): + # The constructor is safe as no methods, including __init__, are + # allowed to be overridden + # NB: This is guaranteed to print like a list, as __repr__ cannot be + # overridden, this is... well, it's OK I guess (consistent with + # eager), but it could be misleading. You will have to query type + # instead for details. + return repr(self.user_cls([Lit(x.debug_repr()) for x in self.items])) + def python_type(self): return self.user_cls diff --git a/torch/_dynamo/variables/nn_module.py b/torch/_dynamo/variables/nn_module.py index bcded32ab4fb..f71767a7b7cb 100644 --- a/torch/_dynamo/variables/nn_module.py +++ b/torch/_dynamo/variables/nn_module.py @@ -77,6 +77,39 @@ def record_nn_module_stack(module_key: str, source, tx, mod: torch.nn.Module): del tx.nn_module_stack[module_key] +def guard_to_detect_forward_monkeypatching(source, mod): + # Users sometimes patch the forward method of an nn module instance to + # perform optimizations like quantization. Though this is not good + # software practice, Python allows it, and Dynamo needs to detect + # such patching. + # + # One way to do this is to add an ID_MATCH guard on every function + # getting inlined (https://github.com/pytorch/pytorch/pull/124975). But + # this increased guard overhead by around 20%. + # + # To keep the guard overhead down, we just guard on `forward` not being + # present in the mod __dict__. The common case of patching the forward + # method adds `forward` to the instance __dict__, whereas the unpatched + # `forward` sits in type(mod).__dict__ + if source: + if "forward" in mod.__dict__ and callable(mod.__dict__["forward"]): + # Monkeypatched forward method, add an ID_MATCH guard on forward function + fwd = mod.__dict__["forward"] + forward_source = AttrSource(source, "forward") + if type(fwd) is types.MethodType: + forward_source = AttrSource(forward_source, "__func__") + install_guard(forward_source.make_guard(GuardBuilder.CLOSURE_MATCH)) + else: + # Common case - check that the forward key is absent in mod __dict__ + install_guard( + source.make_guard( + functools.partial( + GuardBuilder.NOT_PRESENT_IN_GENERIC_DICT, attr="forward" + ) + ) + ) + + class NNModuleVariable(VariableTracker): _nonvar_fields = { "module_type", @@ -216,6 +249,9 @@ def var_getattr(self, tx, name): # if we can't find a __getattr__, just raise the AttributeError raise + if name == "forward": + guard_to_detect_forward_monkeypatching(self.source, base) + if name == "__class__" and not object_member: return variables.UserDefinedClassVariable(base.__class__, source=source) @@ -746,6 +782,8 @@ def call_function( else: source = None + guard_to_detect_forward_monkeypatching(self.source, mod) + ctx = ( record_nn_module_stack(str(id(mod)), self.source, tx, mod) if self.source diff --git a/torch/_dynamo/variables/sdpa.py b/torch/_dynamo/variables/sdpa.py index 0a6af76690df..c5b0d9f586c8 100644 --- a/torch/_dynamo/variables/sdpa.py +++ b/torch/_dynamo/variables/sdpa.py @@ -65,9 +65,9 @@ def var_getattr(self, tx, name: str) -> VariableTracker: getattr_static(torch._C._SDPAParams, name) except AttributeError: # Using raise from is too verbose here - raise Unsupported( # noqa: TRY200 + raise Unsupported( f"Unsupported torch._C._SDPAParams attribute {name}" - ) + ) from None proxy =
GetAttrVariable.create_getattr_proxy(self.as_proxy(), name) if self.source is not None: diff --git a/torch/_dynamo/variables/tensor.py b/torch/_dynamo/variables/tensor.py index e1d5cee368da..a1adbcf614bc 100644 --- a/torch/_dynamo/variables/tensor.py +++ b/torch/_dynamo/variables/tensor.py @@ -149,6 +149,10 @@ def __init__( _is_name_set = self.proxy.node.op == "placeholder" self._is_name_set: bool = _is_name_set + def debug_repr(self): + # TODO: strip off fake tensor from repr here + return repr(self.proxy.node.meta["example_value"]) + def as_proxy(self): return self.proxy @@ -969,6 +973,9 @@ class SymNodeVariable(VariableTracker): *VariableTracker._nonvar_fields, } + def debug_repr(self): + return repr(self.sym_num) + @classmethod def create(cls, tx, proxy, sym_num=None, **options): if sym_num is None: @@ -1012,7 +1019,7 @@ def evaluate_expr(self, output_graph=None): try: return guard_scalar(self.sym_num) except GuardOnDataDependentSymNode as e: - raise UserError( # noqa: TRY200 + raise UserError( # noqa: B904 UserErrorType.ANTI_PATTERN, f"Consider annotating your code using torch._check*(). {str(e)}", case_name="constrain_as_size_example", diff --git a/torch/_dynamo/variables/torch.py b/torch/_dynamo/variables/torch.py index 47705cdc07e1..8e7089f08059 100644 --- a/torch/_dynamo/variables/torch.py +++ b/torch/_dynamo/variables/torch.py @@ -83,6 +83,7 @@ torch._assert, torch._utils._get_device_index, torch._C._get_cublas_allow_tf32, + torch._C._is_any_autocast_enabled, torch.cuda.get_device_properties, torch.cuda.is_available, torch.distributed.is_available, diff --git a/torch/_export/__init__.py b/torch/_export/__init__.py index a1c233acf01e..105a7ee2594b 100644 --- a/torch/_export/__init__.py +++ b/torch/_export/__init__.py @@ -149,7 +149,10 @@ def capture_pre_autograd_graph( kwargs = {} if export_api_rollout_check(): - log.warning("Using torch.export._trace._export") + @lru_cache + def print_export_warning(): + log.warning("Using torch.export._trace._export") + print_export_warning() module = torch.export._trace._export(f, args, kwargs, dynamic_shapes=dynamic_shapes, pre_dispatch=True).module() else: log_export_usage(event="export.private_api", flags={"capture_pre_autograd_graph"}) @@ -343,6 +346,7 @@ def aot_compile( options: Optional[Dict[str, Any]] = None, remove_runtime_assertions: bool = False, disable_constraint_solver: bool = False, + same_signature: bool = True, ) -> str: """ Note: this function is not stable yet @@ -393,6 +397,7 @@ def aot_compile( kwargs, dynamic_shapes, disable_constraint_solver=disable_constraint_solver, + same_signature=same_signature, # Disabling this flag, because instead we can rely on the mapping # dynamo_flat_name_to_original_fqn which is coming from Dynamo. restore_fqn=False, diff --git a/torch/_export/non_strict_utils.py b/torch/_export/non_strict_utils.py index 42c20aa55500..aff3d444c960 100644 --- a/torch/_export/non_strict_utils.py +++ b/torch/_export/non_strict_utils.py @@ -110,7 +110,9 @@ def make_fake_params_buffers( return faked_params_buffers # type: ignore[return-value] -def make_fake_inputs(nn_module, args, kwargs, dynamic_shapes): +def make_fake_inputs( + nn_module, args, kwargs, dynamic_shapes, _is_torch_jit_trace=False +): """ Given an nn module, example inputs, and constraints, return a new fake mode, fake inputs created in that mode whose dynamic shape dimensions are constrained @@ -127,7 +129,7 @@ def make_fake_inputs(nn_module, args, kwargs, dynamic_shapes): # - [post-tracing] guards.py processes input shape equalities. 
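The `@lru_cache` wrapper introduced around the export warning above is the standard emit-once idiom: caching a zero-argument function means its body runs only on the first call. Distilled:

    import functools
    import logging

    log = logging.getLogger(__name__)

    @functools.lru_cache(maxsize=None)  # zero-arg function: body runs exactly once
    def print_export_warning():
        log.warning("Using torch.export._trace._export")

    for _ in range(3):
        print_export_warning()  # the warning is logged only on the first iteration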
constraints = torch.export.dynamic_shapes._process_dynamic_shapes( - nn_module, args, kwargs, dynamic_shapes + nn_module, args, kwargs, dynamic_shapes, _is_torch_jit_trace=_is_torch_jit_trace ) constraints = constraints or [] t_constraints: Dict[int, Dict[int, Constraint]] = defaultdict(dict) @@ -136,13 +138,6 @@ def make_fake_inputs(nn_module, args, kwargs, dynamic_shapes): if constraint.shared is not None: t_constraints[constraint.shared.t_id][constraint.shared.dim] = constraint - code = nn_module.forward.__code__ - co_fields = { - "co_name": code.co_name, - "co_filename": code.co_filename, - "co_firstlineno": code.co_firstlineno, - } - context = torch._guards.TracingContext.try_get() if context is not None: # This occurs when we are exporting within dynamo. There already exists @@ -153,11 +148,22 @@ def make_fake_inputs(nn_module, args, kwargs, dynamic_shapes): len(constraints) == 0 ), "Found constraints when tracing with a toplevel tracing context." fake_mode = context.fake_mode - else: + elif not _is_torch_jit_trace: + code = nn_module.forward.__code__ + co_fields = { + "co_name": code.co_name, + "co_filename": code.co_filename, + "co_firstlineno": code.co_firstlineno, + } fake_mode = FakeTensorMode( shape_env=ShapeEnv(tracked_fakes=[], co_fields=co_fields), allow_non_fake_inputs=True, ) + else: + fake_mode = FakeTensorMode( + shape_env=ShapeEnv(tracked_fakes=[]), + allow_non_fake_inputs=True, + ) if fake_mode.shape_env is None or fake_mode.shape_env.tracked_fakes is None: raise ValueError( "Detected fake_mode does not have a shape_env with tracked fakes. " @@ -166,7 +172,11 @@ def make_fake_inputs(nn_module, args, kwargs, dynamic_shapes): ) with fake_mode: - original_signature = inspect.signature(nn_module.forward) + # FIXME(ycao) ScriptMethod doesn't have signature, I am using an empty one to unblock + if not _is_torch_jit_trace: + original_signature = inspect.signature(nn_module.forward) + else: + original_signature = None sources: Dict[Tuple[int, int], List[Source]] = defaultdict(list) fake_args, fake_kwargs = tree_map_with_path( lambda kp, val: fakify(fake_mode, kp, val, t_constraints, sources), @@ -215,6 +225,7 @@ def produce_guards_and_solve_constraints( equalities_inputs: EqualityConstraint, original_signature: inspect.Signature, _disable_forced_specializations: Optional[bool] = False, + _is_torch_jit_trace=False, ): """ Given a fake mode, sources pairs corresponding to equal dynamic shape dimensions, @@ -259,9 +270,13 @@ def produce_guards_and_solve_constraints( ) dim_constraints.remove_redundant_dynamic_results() forced_specializations = dim_constraints.forced_specializations() - msg = dim_constraints.prettify_results( - original_signature, constraint_violation_error, forced_specializations - ) + if not _is_torch_jit_trace: + msg = dim_constraints.prettify_results( + original_signature, constraint_violation_error, forced_specializations + ) + else: + # FIXME(ycao): This is a hack to get around missing signature from ScriptMethod + msg = "dummy constraint violation message" if constraint_violation_error: constraint_violation_error.args = (constraint_violation_error.args[0] + msg,) elif forced_specializations: diff --git a/torch/_export/passes/replace_set_grad_with_hop_pass.py b/torch/_export/passes/replace_set_grad_with_hop_pass.py index e362ee354771..91104c17c38d 100644 --- a/torch/_export/passes/replace_set_grad_with_hop_pass.py +++ b/torch/_export/passes/replace_set_grad_with_hop_pass.py @@ -60,6 +60,12 @@ def _replace_with_hop(node: torch.fx.Node): 
set_grad_node.meta.get("nn_module_stack", {}) ) output_node = next(iter(reversed(sub_gm.graph.nodes)), None) + # The split_module pass intentionally doesn't add an output node + # if the graph doesn't return anything. + # TODO (tmanlaibaatar) Figure out if this is the right behaviour + # for split_module + if isinstance(output_node, torch.fx.Node) and output_node.op != "output": + output_node = None if output_node is not None: assert len(output_node.args) == 1 output_args = output_node.args[0] @@ -106,9 +112,7 @@ def _replace_with_hop(node: torch.fx.Node): f"repalce_set_grad_with_hop_pass doesnt' support output type {type(output_args)}" ) else: - raise NotImplementedError( - "Cannot replace a call_module with a hop if it has no output. This module will gets DCEed." - ) + node.graph.erase_node(node) sub_graph.erase_node(set_grad_node) @@ -164,6 +168,7 @@ def _maybe_inline_or_replace_with_hop(node: torch.fx.Node): else node ), ) + new_gm.recompile() return new_gm return gm diff --git a/torch/_export/tools.py b/torch/_export/tools.py new file mode 100644 index 000000000000..d76392993bd2 --- /dev/null +++ b/torch/_export/tools.py @@ -0,0 +1,139 @@ +import logging +import warnings +from typing import Any, Dict, Iterable, Optional, Tuple + +import torch +import torch.export +import torch.export._trace +from torch._utils_internal import log_export_usage + +log = logging.getLogger(__name__) + +__all__ = ["report_exportability"] + + +def _generate_inputs_for_submodules( + model: torch.nn.Module, + target_submodules: Iterable[str], + args: Tuple[Any, ...], + kwargs: Optional[Dict[str, Any]] = None, +) -> Dict[str, Tuple[Any, Any]]: + """ + Generate inputs for the targeted submodules in the given model. Note that if two submodules refer to the same obj, this + function doesn't work. + + Args: + model: root model. + args: args to the root model. + target_submodules: submodules that we want to generate inputs for. + + Returns: + A dict that maps from submodule name to its inputs. + """ + kwargs = kwargs or {} + + handles = [] + results = {} + submodule_to_names = {mod: name for name, mod in model.named_modules()} + + def pre_forward(module, module_args, module_kwargs): + results[submodule_to_names[module]] = (module_args, module_kwargs) + + try: + for name, mod in model.named_modules(): + if name in target_submodules: + handles.append( + mod.register_forward_pre_hook(pre_forward, with_kwargs=True) + ) + model(*args, **kwargs) + except Exception as e: + warnings.warn( + f"Failed to generate submodule inputs because of the following error:\n{e}" + ) + finally: + for h in handles: + h.remove() + return results + + +def report_exportability( + mod: torch.nn.Module, + args: Tuple[Any, ...], + kwargs: Optional[Dict[str, Any]] = None, + *, + strict: bool = True, + pre_dispatch: bool = False, +) -> Dict[str, Optional[Exception]]: + """ + Report exportability issues for a module in one shot. + + Args: + mod: root module. + args: args to the root module. + kwargs: kwargs to the root module. + Returns: + A dict that maps from submodule name to the exception that was raised when trying to export it. + `None` means the module is exportable without issue.
+ Sample output: + { + '': UnsupportedOperatorException(func=), + 'submod_1': UnsupportedOperatorException(func=), + 'submod_2': None + } + """ + + log_export_usage(event="export.report_exportability") + + kwargs = kwargs or {} + + all_submod_names = [name for name, _ in mod.named_modules() if name != ""] + submod_inputs = _generate_inputs_for_submodules(mod, all_submod_names, args, kwargs) + + report: Dict[str, Optional[Exception]] = {} + + def try_export(module, module_name, args, kwargs): + nonlocal submod_inputs, report, strict, pre_dispatch + + if args is not None or kwargs is not None: + try: + torch.export._trace._export( + module, + args, + kwargs, + strict=strict, + pre_dispatch=pre_dispatch, + ) + report[module_name] = None + log.info("Successfully exported `%s`", module_name) + return + except Exception as e: + short_msg = repr(e).split("\n")[0] + log.warning( + "Failed exporting `%s` with exception: %s", module_name, short_msg + ) + report[module_name] = e + + for name, submod in module.named_children(): + sub_module_name = name if module_name == "" else f"{module_name}.{name}" + + submod_args, submod_kwargs = submod_inputs.get( + sub_module_name, (None, None) + ) + + try_export(submod, sub_module_name, submod_args, submod_kwargs) + + return + + try_export(mod, "", args, kwargs) + + unique_issues = set() + for exception in report.values(): + if exception is not None: + key = repr(exception).split("\\n")[0] + unique_issues.add(key) + + log.warning("Found %d export issues:", len(unique_issues)) + for issue in unique_issues: + log.warning(issue) + + return report diff --git a/torch/_export/utils.py b/torch/_export/utils.py index 59648ccadab2..19fc4e9bdc4d 100644 --- a/torch/_export/utils.py +++ b/torch/_export/utils.py @@ -118,7 +118,7 @@ def get_keystr(key_path: KeyPath) -> str: sympy.Eq(node_dim.node.expr, arg_dim), symbol ) if solution is None: - raise RuntimeError( # noqa: TRY200 + raise RuntimeError( # noqa: B904 f"Expected input {node.name}.shape[{j}] = {arg_dim} to be " f"of the form {node_dim.node.expr}, where {symbol} is an integer" ) diff --git a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py index f0d39fd1e858..0b6e02da80d2 100644 --- a/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py @@ -83,21 +83,6 @@ def _force_contiguous(x): return x -def _compute_output_meta_with_inductor_strides(fw_module, fwd_output_strides): - out = [n.meta["val"] for n in (list(fw_module.graph.nodes)[-1].args[0])] - # will only be set for inductor - if not fwd_output_strides: - return out - with TracingContext.get().fake_mode.shape_env.suppress_guards(): - for i in range(len(out)): - if not isinstance(out[i], Tensor): - continue - if all(s1 == s2 for s1, s2 in zip(out[i].stride(), fwd_output_strides[i])): - continue - out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) - return out - - # See Note [Tangents must be contiguous, Part 2] def coerce_runtime_tangent(x, metadata_tensor): if not isinstance(x, torch.Tensor): @@ -141,7 +126,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, fw_metadata=fw_metadata @@ -150,7 +134,6 @@ def aot_dispatch_base( ( fw_module, updated_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, updated_flat_args, aot_config, 
fw_metadata=fw_metadata @@ -187,12 +170,12 @@ def aot_dispatch_base( # Create a wrapper to set up the rng functionalize and fakified out bits compiled_fw = functionalized_rng_wrapper.post_compile( - compiled_fw, aot_config, fw_metadata=fw_metadata + compiled_fw, aot_config, runtime_metadata=fw_metadata ) compiled_fw = fakified_out_wrapper.post_compile( compiled_fw, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # Why do we need to pass in num_fw_outs_saved_for_bw? # See Note: [Partitioner handling for Subclasses, Part 2] @@ -205,7 +188,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not hasattr(compiled_fw_func, "_boxed_call"): @@ -218,7 +201,7 @@ def aot_dispatch_base( ).post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) return compiled_fn @@ -420,7 +403,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = fakified_out_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -432,7 +414,6 @@ def aot_dispatch_autograd( ( fw_module, adjusted_flat_args, - aot_config, fw_metadata, ) = functionalized_rng_wrapper.pre_compile( fw_module, adjusted_flat_args, aot_config, fw_metadata=fw_metadata @@ -457,16 +438,16 @@ def aot_dispatch_autograd( ).post_compile( compiled_fw_func, aot_config, # not used - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) compiled_fw_func = functionalized_rng_wrapper.post_compile( - compiled_fw_func, aot_config, fw_metadata=fw_metadata + compiled_fw_func, aot_config, runtime_metadata=fw_metadata ) compiled_fw_func = fakified_out_wrapper.post_compile( compiled_fw_func, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) # NB: It's important to compile backwards ahead of time, as this may @@ -1032,7 +1013,7 @@ def backward(ctx, *args): ).post_compile( CompiledFunction.apply, aot_config, - fw_metadata=fw_metadata, + runtime_metadata=fw_metadata, ) if not config.debug_assert: diff --git a/torch/_functorch/_aot_autograd/runtime_wrappers.py b/torch/_functorch/_aot_autograd/runtime_wrappers.py index 934b783f6fc8..c1b9a3b29f2e 100644 --- a/torch/_functorch/_aot_autograd/runtime_wrappers.py +++ b/torch/_functorch/_aot_autograd/runtime_wrappers.py @@ -73,7 +73,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: """ Process the inputs to the compiler_fn. You can pass in extra metadata via kwargs. Args: @@ -82,15 +82,15 @@ def pre_compile( aot_config: AOTConfig passed in at compile time fw_metadata: ViewAndMutationMeta generated from flat_fn and flat_args """ - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata - def post_compile(self, compiled_fn, aot_config, *, fw_metadata): + def post_compile(self, compiled_fn, aot_config, *, runtime_metadata) -> Callable: """ Given an output of the compiler, wrap it with information received from prologue. Args: compiled_fn: Callable after calling compiler_fn aot_config: AOTConfig after calling prologue - fw_metadata: ViewAndMutationMeta after calling prologue + runtime_metadata: ViewAndMutationMeta after calling all wrappers's pre_compile steps. 
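The renames above reshape the `CompilerWrapper` contract: `pre_compile` no longer threads `aot_config` through its return value, and `post_compile` receives the final metadata under the `runtime_metadata` keyword. A no-op wrapper under the new contract would look like this (the class name is illustrative; the import path is assumed from the file being edited):

    from torch._functorch._aot_autograd.runtime_wrappers import CompilerWrapper

    class NoopWrapper(CompilerWrapper):
        def pre_compile(self, flat_fn, flat_args, aot_config, *, fw_metadata):
            # aot_config is consumed read-only and is no longer returned.
            return flat_fn, flat_args, fw_metadata

        def post_compile(self, compiled_fn, aot_config, *, runtime_metadata):
            # runtime_metadata is fixed once every wrapper's pre_compile has run.
            return compiled_fn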
Example: def wrapped_compiled_fn(args): @@ -101,28 +101,6 @@ def wrapped_compiled_fn(args): """ return compiled_fn - def create( - self, - flat_fn, - flat_args: List[Tensor], - aot_config: AOTConfig, - *, - fw_metadata: ViewAndMutationMeta, - compiler_fn, - ): - ( - wrapped_flat_fn, - new_flat_args, - new_aot_config, - new_fw_metadata, - ) = self.pre_compile(flat_fn, flat_args, aot_config, fw_metadata=fw_metadata) - compiled_fn = compiler_fn( - wrapped_flat_fn, new_flat_args, new_aot_config, fw_metadata=new_fw_metadata - ) - return self.post_compile( - compiled_fn, new_aot_config, fw_metadata=new_fw_metadata - ) - # The wrapper created by this function handles all of the runtime aliasing and mutation "epilogue" logic # that needs to run after the compiled function. @@ -143,11 +121,11 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): return _create_runtime_wrapper( compiled_fn, - runtime_metadata=fw_metadata, + runtime_metadata=runtime_metadata, indices_of_inps_to_detach=self.indices_of_inps_to_detach, trace_joint=self.trace_joint, keep_input_mutations=aot_config.keep_inference_input_mutations, @@ -421,7 +399,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: if config.functionalize_rng_ops: # Update example inputs for the fw_compiler fake_mode = detect_fake_mode() @@ -430,27 +408,27 @@ def pre_compile( # We are not clearing flat_args here because # 1) There is a check in the debug compiler at the end # 2) It does not matter as these are fake tensors - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): @wraps(compiled_fn) def wrapper(runtime_args: List[Any]): - if fw_metadata.is_rng_op_functionalized: + if runtime_metadata.is_rng_op_functionalized: # Add the seed and offset to args seed, offset = CUDARngStateHelper.get_torch_state_as_tuple() runtime_args.extend([seed, offset]) out = compiled_fn(runtime_args) out = self._functionalized_rng_runtime_epilogue( - fw_metadata, + runtime_metadata, out, # TODO: this won't be right for the backward when we convert the call_compiled_backward to use the wrapper - fw_metadata.num_forward_returns, + runtime_metadata.num_forward_returns, ) return out return compiled_fn(runtime_args) @@ -493,7 +471,7 @@ def pre_compile( aot_config, *, fw_metadata, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: tracing_context = torch._guards.TracingContext.try_get() if tracing_context and tracing_context.fakify_first_call: self.out_metas = [ @@ -501,22 +479,25 @@ def pre_compile( ] else: self.needs_post_compile = False - return fw_module, flat_args, aot_config, fw_metadata + return fw_module, flat_args, fw_metadata def _compute_output_meta_with_inductor_strides(self): out = self.out_metas fwd_output_strides = self.fwd_output_strides if not fwd_output_strides: return out - with TracingContext.get().fake_mode.shape_env.suppress_guards(): - for i in range(len(out)): - if not isinstance(out[i], Tensor): - continue - if all( - s1 == s2 for s1, s2 in zip(out[i].stride(), fwd_output_strides[i]) - ): - continue - out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) + + from torch.fx.experimental.symbolic_shapes import statically_known_true + + for i in range(len(out)): + if not isinstance(out[i], Tensor): + continue + if 
all( + statically_known_true(s1 == s2) + for s1, s2 in zip(out[i].stride(), fwd_output_strides[i]) + ): + continue + out[i] = out[i].as_strided(out[i].shape, fwd_output_strides[i]) return out # To be called post compile @@ -528,7 +509,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.needs_post_compile: assert self.fwd_output_strides is not None @@ -575,19 +556,19 @@ def pre_compile( fw_only=self.fw_only, # type: ignore[arg-type] ) self.maybe_subclass_meta = subclass_meta - return new_flat_fn, new_flat_args, aot_config, fw_metadata + return new_flat_fn, new_flat_args, fw_metadata def post_compile( self, compiled_fn, _aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if self.maybe_subclass_meta is None: return compiled_fn - subclass_metas = fw_metadata.subclass_fw_graph_out_meta + subclass_metas = runtime_metadata.subclass_fw_graph_out_meta @wraps(compiled_fn) def inner_fn(args: List[Any]): @@ -713,7 +694,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: # Use information about whether or not flat_fn mutates its arguments # or not to handle dupe args @@ -740,7 +721,7 @@ def pre_compile( if ok: self.needs_post_compile = False - return flat_fn, leaf_flat_args, aot_config, fw_metadata + return flat_fn, leaf_flat_args, fw_metadata if requires_subclass_dispatch(leaf_flat_args, fw_metadata): raise RuntimeError( @@ -865,14 +846,14 @@ def wrapped_flat_fn(*args): ref_fw_metadata == updated_fw_metadata ), f"ref_metadata={str(ref_fw_metadata)}, actual_metadata={str(updated_fw_metadata)}" - return wrapped_flat_fn, deduped_flat_args, aot_config, updated_fw_metadata + return wrapped_flat_fn, deduped_flat_args, updated_fw_metadata def post_compile( self, compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn @@ -932,6 +913,8 @@ def debugged_compiled_fn(args): # would cause us to hit that path more frequently). @dataclass class AOTSyntheticBaseWrapper(CompilerWrapper): + # Currently, the only reason we need to plumb this bool is because + # the synthetic base code prohibits more cases in the autograd case than the inference case. trace_joint: bool # TODO: refactor trace_joint needs_post_compile: bool = True aliased_arg_idx_with_metadata_mutations: List[int] = field(default_factory=list) @@ -943,7 +926,7 @@ def pre_compile( aot_config: AOTConfig, *, fw_metadata: ViewAndMutationMeta, - ): + ) -> Tuple[Callable, List[Tensor], ViewAndMutationMeta]: is_inference = not self.trace_joint flat_args_with_synthetic_bases, synthetic_base_info = merge_view_inputs( flat_args, @@ -954,7 +937,7 @@ def pre_compile( # Happy path: we don't need synthetic bases if synthetic_base_info is None: self.needs_post_compile = False - return flat_fn, flat_args, aot_config, fw_metadata + return flat_fn, flat_args, fw_metadata # export path: ban synthetic bases for now, add later if requested. 
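The switch to `statically_known_true` above changes how ambiguous stride comparisons resolve: instead of suppressing guards around an equality check, the query returns False whenever the equality cannot be proven from what is already known, and the output metadata is conservatively restrided. A distilled form of the check:

    from torch.fx.experimental.symbolic_shapes import statically_known_true

    def strides_already_match(meta_out, expected_strides):
        # True only if every stride equality is provable without adding
        # guards; an undecidable symbolic comparison yields False instead.
        return all(
            statically_known_true(s1 == s2)
            for s1, s2 in zip(meta_out.stride(), expected_strides)
        )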
if requires_subclass_dispatch(flat_args, fw_metadata): @@ -1050,7 +1033,6 @@ def wrapped_flat_fn(*args): return ( wrapped_flat_fn, flat_args_with_synthetic_bases, - aot_config, fw_metadata_updated, ) @@ -1059,7 +1041,7 @@ def post_compile( compiled_fn, aot_config: AOTConfig, *, - fw_metadata: ViewAndMutationMeta, + runtime_metadata: ViewAndMutationMeta, ): if not self.needs_post_compile: return compiled_fn diff --git a/torch/_functorch/aot_autograd.py b/torch/_functorch/aot_autograd.py index f1ba67794bc7..379518fb958c 100644 --- a/torch/_functorch/aot_autograd.py +++ b/torch/_functorch/aot_autograd.py @@ -670,23 +670,26 @@ def convert(idx, x): aot_dispatch_base_graph if aot_config.is_export else aot_dispatch_base ) - wrappers = [ + # Wrappers that edit fw_metadata + fw_metadata_wrappers = [ AOTDedupeWrapper(), AOTSyntheticBaseWrapper(trace_joint=needs_autograd), # Add more passes here ] - for wrapper in wrappers: - flat_fn, fake_flat_args, aot_config, fw_metadata = wrapper.pre_compile( + for wrapper in fw_metadata_wrappers: + flat_fn, fake_flat_args, fw_metadata = wrapper.pre_compile( flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata ) + # Once all fw_metadata_wrappers have run, runtime_metadata is fixed + runtime_metadata = fw_metadata compiled_fn = compiler_fn( - flat_fn, fake_flat_args, aot_config, fw_metadata=fw_metadata + flat_fn, fake_flat_args, aot_config, fw_metadata=runtime_metadata ) - for wrapper in reversed(wrappers): + for wrapper in reversed(fw_metadata_wrappers): compiled_fn = wrapper.post_compile( - compiled_fn, aot_config, fw_metadata=fw_metadata + compiled_fn, aot_config, runtime_metadata=runtime_metadata ) if aot_config.is_export: diff --git a/torch/_functorch/compile_utils.py b/torch/_functorch/compile_utils.py index ffa37e59f04d..c9c750835a9f 100644 --- a/torch/_functorch/compile_utils.py +++ b/torch/_functorch/compile_utils.py @@ -1,6 +1,8 @@ # mypy: ignore-errors +from typing import Callable + import torch import torch.fx as fx from torch.utils import _pytree as pytree @@ -9,7 +11,7 @@ aten = torch.ops.aten -def get_aten_target(node): +def get_aten_target(node: fx.Node) -> Callable: if hasattr(node.target, "overloadpacket"): return node.target.overloadpacket return node.target diff --git a/torch/_functorch/config.py b/torch/_functorch/config.py index 5749477c6e98..c559951f3809 100644 --- a/torch/_functorch/config.py +++ b/torch/_functorch/config.py @@ -136,6 +136,10 @@ # of tensors in question. fake_tensor_propagate_real_tensors = False +# Controls the default graph output format used by draw_graph +# Supported formats are defined here https://graphviz.org/docs/outputs/ +torch_compile_graph_format = os.environ.get("TORCH_COMPILE_GRAPH_FORMAT", "svg") + if TYPE_CHECKING: from torch.utils._config_typing import * # noqa: F401, F403 diff --git a/torch/_functorch/partitioners.py b/torch/_functorch/partitioners.py index ba549e5bd6e2..0956ee7e367c 100644 --- a/torch/_functorch/partitioners.py +++ b/torch/_functorch/partitioners.py @@ -1,5 +1,3 @@ -# mypy: ignore-errors - import copy import functools import heapq @@ -9,7 +7,10 @@ import operator import os from collections import defaultdict -from typing import List, Optional, Set, Tuple, TYPE_CHECKING, Union +from dataclasses import dataclass, replace +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import sympy import torch import torch._inductor.inductor_prims @@ -28,19 +29,84 @@ from . 
import config from .compile_utils import fx_graph_cse, get_aten_target -if TYPE_CHECKING: - import sympy - AOT_PARTITIONER_DEBUG = config.debug_partitioner log = logging.getLogger(__name__) +aten = torch.ops.aten +prims = torch.ops.prims + + +@dataclass +class OpTypes: + """Class for keeping track of different operator categories""" + + fusible_ops: Set[Callable] + compute_intensive_ops: Set[Callable] + random_ops: Set[Callable] + view_ops: Set[Callable] + recomputable_ops: Set[Callable] + + def is_fusible(self, node: fx.Node): + return get_aten_target(node) in self.fusible_ops + + def is_compute_intensive(self, node: fx.Node): + return get_aten_target(node) in self.compute_intensive_ops + + def is_random(self, node: fx.Node): + return get_aten_target(node) in self.random_ops + + def is_view(self, node: fx.Node): + return get_aten_target(node) in self.view_ops + + def is_recomputable(self, node: fx.Node): + return get_aten_target(node) in self.recomputable_ops + + +@dataclass +class NodeInfo: + # Be careful about iterating over these explicitly, as their order may not + # be deterministic + inputs: List[fx.Node] + _required_fw_nodes: Set[fx.Node] + required_bw_nodes: Set[fx.Node] + unclaimed_nodes: Set[fx.Node] + fw_order: Dict[fx.Node, int] + + @functools.cached_property + def required_fw_nodes(self) -> List[fx.Node]: + return sorted( + (n for n in self._required_fw_nodes), key=lambda n: self.fw_order[n] + ) + + def is_required_fw(self, n: fx.Node) -> bool: + return n in self._required_fw_nodes -def must_recompute(node): + def is_required_bw(self, n: fx.Node) -> bool: + return n in self.required_bw_nodes + + def is_unclaimed(self, n: fx.Node) -> bool: + return n in self.unclaimed_nodes + + def get_fw_order(self, n: fx.Node) -> int: + assert n in self._required_fw_nodes, f"Node {n} not in fw nodes!" + return self.fw_order[n] + + +@dataclass +class MinCutOptions: + ban_if_used_far_apart: bool + ban_if_long_fusible_chains: bool + ban_if_materialized_backward: bool + ban_if_not_in_allowlist: bool + ban_if_reduction: bool + + +def must_recompute(node: fx.Node) -> bool: return node.meta.get("recompute", False) -def has_recomputable_ops(fx_g): +def has_recomputable_ops(fx_g: fx.GraphModule) -> bool: found = False for node in fx_g.graph.nodes: if must_recompute(node): @@ -48,7 +114,7 @@ def has_recomputable_ops(fx_g): return False -def has_recomputable_rng_ops(fx_g): +def has_recomputable_rng_ops(fx_g: fx.GraphModule) -> bool: for node in fx_g.graph.nodes: if ( must_recompute(node) @@ -59,7 +125,7 @@ def has_recomputable_rng_ops(fx_g): return False -def sym_node_size(node): +def sym_node_size(node: fx.Node) -> int: if isinstance(node.meta["val"], (torch.SymInt, torch.SymBool)): return 1 assert isinstance(node.meta["val"], torch.SymFloat) @@ -74,7 +140,9 @@ def __repr__(self): InvalidNode = InvalidNodeBase() -def _extract_graph_with_inputs_outputs(joint_graph, inputs, outputs): +def _extract_graph_with_inputs_outputs( + joint_graph: fx.Graph, inputs: List[fx.Node], outputs: List[fx.Node] +) -> fx.Graph: """ Given a graph, extracts out a subgraph that takes the specified nodes as inputs and returns the specified outputs. 
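The hunk above replaces the partitioner's previous ad-hoc sets and per-node attributes with typed helpers (`OpTypes`, `NodeInfo`, `MinCutOptions`). As a rough illustration of the classification idiom they rely on — not part of the patch; the toy function `f` and the two-line allowlist below are invented for the example — this standalone sketch shows how comparing a node's ATen overload packet against a set of packets classifies nodes of a `make_fx`-traced graph:

# Standalone sketch (not part of the patch): classify FX nodes by their
# ATen overload packet, the same membership test that OpTypes.is_fusible
# and friends perform. The function `f` and the tiny allowlist are invented.
import torch
import torch.fx as fx
from torch.fx.experimental.proxy_tensor import make_fx

aten = torch.ops.aten

def get_aten_target(node: fx.Node):
    # Mirrors torch._functorch.compile_utils.get_aten_target: collapse
    # overloads like aten.mul.Tensor down to their packet, aten.mul.
    if hasattr(node.target, "overloadpacket"):
        return node.target.overloadpacket
    return node.target

fusible_ops = {aten.mul, aten.add, aten.relu}  # tiny stand-in allowlist

def f(x):
    return (x * x + x).relu()

gm = make_fx(f)(torch.randn(4))
for node in gm.graph.nodes:
    if node.op == "call_function":
        print(f"{node.name}: fusible={get_aten_target(node) in fusible_ops}")

Note the design choice visible in `NodeInfo`: `_required_fw_nodes` stays a set for O(1) membership tests (`is_required_fw`), while the `required_fw_nodes` cached property sorts by `fw_order`, so any pass that iterates over required forward nodes sees a deterministic order despite the unordered underlying container.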
@@ -136,36 +204,38 @@ def _extract_graph_with_inputs_outputs(joint_graph, inputs, outputs): return new_graph -def _is_primal(node): +def _is_primal(node: fx.Node) -> bool: return ( node.op == "placeholder" - and "tangents" not in node.target + and "tangents" not in str(node.target) and not _is_bwd_seed_offset(node) and not _is_fwd_seed_offset(node) ) -def _is_tangent(node): - return node.op == "placeholder" and "tangents" in node.target +def _is_tangent(node: fx.Node) -> bool: + return node.op == "placeholder" and "tangents" in str(node.target) -def _is_bwd_seed_offset(node): +def _is_bwd_seed_offset(node: fx.Node) -> bool: return node.op == "placeholder" and ( - "bwd_seed" in node.target or "bwd_base_offset" in node.target + "bwd_seed" in str(node.target) or "bwd_base_offset" in str(node.target) ) -def _is_fwd_seed_offset(node): +def _is_fwd_seed_offset(node: fx.Node) -> bool: return node.op == "placeholder" and ( - "fwd_seed" in node.target or "fwd_base_offset" in node.target + "fwd_seed" in str(node.target) or "fwd_base_offset" in str(node.target) ) -def _is_backward_state(node): +def _is_backward_state(node: fx.Node) -> bool: return node.op == "placeholder" and isinstance(node.meta.get("val"), BackwardState) -def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs): +def _extract_fwd_bwd_outputs( + joint_module: fx.GraphModule, *, num_fwd_outputs +) -> Tuple[List[fx.Node], List[fx.Node]]: outputs = pytree.arg_tree_leaves( *(node.args for node in joint_module.graph.find_nodes(op="output")) ) @@ -174,7 +244,7 @@ def _extract_fwd_bwd_outputs(joint_module: fx.GraphModule, *, num_fwd_outputs): return fwd_outputs, bwd_outputs -def _remove_by_name(saved_values, name): +def _remove_by_name(saved_values: List[fx.Node], name: str): for saved_value in saved_values: if saved_value.name == name: saved_values.remove(saved_value) @@ -182,8 +252,12 @@ def _remove_by_name(saved_values, name): def _extract_fwd_bwd_modules( - joint_module: fx.GraphModule, saved_values, saved_sym_nodes, *, num_fwd_outputs -): + joint_module: fx.GraphModule, + saved_values: List[fx.Node], + saved_sym_nodes: List[fx.Node], + *, + num_fwd_outputs: int, +) -> Tuple[fx.GraphModule, fx.GraphModule]: fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs( joint_module, num_fwd_outputs=num_fwd_outputs ) @@ -359,14 +433,10 @@ def default_partition( ) -def _prod(x): - s = 1 - for i in x: - s *= i - return s +INT_INF = int(1e6) -def _tensor_nbytes(numel, dtype): +def _tensor_nbytes(numel: int, dtype) -> int: return numel * dtype.itemsize @@ -374,10 +444,7 @@ def _size_of(node: fx.Node) -> int: if "val" in node.meta: val = node.meta["val"] if isinstance(val, py_sym_types): - if isinstance(val, torch.SymInt): - return 1 - else: - return 999999 + return 1 # NB: The fallback values here are meaningless, maybe we should respect # torch._inductor.config.unbacked_symint_fallback (but this is a # layering violation) @@ -391,28 +458,18 @@ def _size_of(node: fx.Node) -> int: return _tensor_nbytes(hint_int(val.numel(), fallback=4098), val.dtype) raise RuntimeError(f"Unknown metadata type {type(val)}") - - # Only needed since we don't always trace with fake tensors. - if "tensor_meta" in node.meta: - metadata = node.meta["tensor_meta"] - # TODO: What is to_size_hint suppose to be? 
- numel = _prod(map(to_size_hint, metadata.shape)) # noqa: F821 - dtype = metadata.dtype - else: - return 0 - - return _tensor_nbytes(numel, dtype) + raise RuntimeError("We should always have `val` metadata on the nodes") # Used for some investigative purposes -def _count_ops(graph): +def _count_ops(graph: fx.Graph): from collections import defaultdict - cnt = defaultdict(int) + cnt: Dict[str, int] = defaultdict(int) for node in graph.nodes: if node.op == "call_function": cnt[node.target.__name__] += 1 - print(sorted(cnt.items(), key=operator.itemgetter(1), reverse=True)) + print(sorted(cnt.items(), key=lambda x: x[1], reverse=True)) @functools.lru_cache(None) @@ -433,14 +490,14 @@ def pointwise_ops(): return ops -def sort_depths(args, depth_map): +def sort_depths(args, depth_map: Dict[fx.Node, int]) -> List[Tuple[fx.Node, int]]: arg_depths = { arg: depth_map[arg] for arg in args if isinstance(arg, torch.fx.node.Node) } - return sorted(arg_depths.items(), key=operator.itemgetter(1), reverse=True) + return sorted(arg_depths.items(), key=lambda x: x[1], reverse=True) -def reordering_to_mimic_autograd_engine(gm): +def reordering_to_mimic_autograd_engine(gm: fx.GraphModule) -> fx.GraphModule: """ This pass finds the first bwd node in the graph (by looking at users of tangents) and then reorders the graph by walking from this node to all the @@ -464,7 +521,7 @@ def reordering_to_mimic_autograd_engine(gm): """ new_graph = fx.Graph() - env = {} + env: Dict[fx.Node, fx.Node] = {} # Add new placeholder nodes in the order specified by the inputs for node in gm.graph.find_nodes(op="placeholder"): @@ -517,7 +574,12 @@ def insert_node_in_graph(node): return new_gm -def functionalize_rng_ops(joint_module, fw_module, bw_module, num_sym_nodes): +def functionalize_rng_ops( + joint_module: fx.GraphModule, + fw_module: fx.GraphModule, + bw_module: fx.GraphModule, + num_sym_nodes: int, +) -> Tuple[fx.GraphModule, fx.GraphModule]: # During user-driven activation checkpointing, we have to ensure that a rng # op in fwd yields the same output as the recomputed rng op in the bwd. To # do this, we use functionalize wrappers to wrap the random ops and share @@ -591,11 +653,15 @@ def get_sample_rng_state(device): run_and_save_rng = torch._prims.rng_prims.run_and_save_rng_state run_with_rng_state = torch._prims.rng_prims.run_with_rng_state - + bw_tangent_start_node = None for node in bw_module.graph.find_nodes(op="placeholder"): if "tangent" in node.name: bw_tangent_start_node = node break + if bw_tangent_start_node is None: + raise RuntimeError( + "Couldn't find tangent node in graph inputs. This is unexpected, please file a bug if you see this" + ) fw_rng_state_outputs = [] for base_node, node_pair in recomputable_rng_ops_map.items(): @@ -665,7 +731,7 @@ def get_sample_rng_state(device): return fw_module, bw_module -def cleanup_recompute_tags(joint_module): +def cleanup_recompute_tags(joint_module: fx.GraphModule) -> fx.GraphModule: """ If there are two consecutive checkpointed blocks with no operator in between, we would still want to stash the tensor at the boundary of @@ -683,332 +749,50 @@ def cleanup_recompute_tags(joint_module): return joint_module -def min_cut_rematerialization_partition( - joint_module: fx.GraphModule, - _joint_inputs, - compiler="inductor", - recomputable_ops=None, - *, - num_fwd_outputs, -) -> Tuple[fx.GraphModule, fx.GraphModule]: - """ - Partitions the joint graph such that the backward recomputes the forward. - Recomputing helps in trading off memory bandwidth with computation. 
- - To create the fwd and bwd graph, we copy the joint graph, manually set the - outputs to just original forward or backward outputs. And then we run the - resulting graphs through dead code elimination. - - .. warning:: - This API is experimental and likely to change. - - Args: - joint_module(fx.GraphModule): The joint forward and backward graph. This - is the result of AOT Autograd tracing. - _joint_inputs: The inputs to the joint graph. This is unused. - compiler: This option determines the default set of recomputable ops. - Currently, there are two options: ``nvfuser`` and ``inductor``. - recomputable_ops: This is an optional set of recomputable ops. If this - is not None, then this set of ops will be used instead of the - default set of ops. - num_fwd_outputs: The number of outputs from the forward graph. - - Returns: - Returns the generated forward and backward Fx graph modules. - """ - try: - import networkx as nx - except ImportError as e: - raise RuntimeError( - "Need networkx installed to perform smart recomputation " "heuristics" - ) from e - - joint_module.graph.eliminate_dead_code() - joint_module.recompile() - - fx_g = joint_module.graph - - # add the CSE pass - if config.cse: - cse_graph = fx_graph_cse(fx_g) - joint_module.graph = cse_graph - joint_graph = joint_module.graph - - graph_has_recomputable_ops = has_recomputable_ops(joint_module) - graph_has_recomputable_rng_ops = has_recomputable_rng_ops(joint_module) - if graph_has_recomputable_ops: - joint_module = cleanup_recompute_tags(joint_module) - - name_to_node = {} - for node in joint_module.graph.nodes: - name_to_node[node.name] = node - - def classify_nodes(joint_module): - required_bw_nodes = set() - for node in joint_module.graph.nodes: - if node.op == "placeholder" and "tangents" in node.target: - required_bw_nodes.add(node) - if node in required_bw_nodes: - required_bw_nodes.update(node.users) - - primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) - fwd_seed_offset_inputs = list( - filter(_is_fwd_seed_offset, joint_module.graph.nodes) - ) - inputs = primal_inputs + fwd_seed_offset_inputs - fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs( - joint_module, num_fwd_outputs=num_fwd_outputs - ) - required_bw_nodes.update( - o for o in bwd_outputs if o is not None and o.op != "output" - ) - forward_only_graph = _extract_graph_with_inputs_outputs( - joint_module.graph, inputs, fwd_outputs - ) - required_fw_nodes = { - name_to_node[node.name] - for node in forward_only_graph.nodes - if node.op != "output" - } - unclaimed_nodes = { - node - for node in joint_module.graph.nodes - if node not in required_fw_nodes and node not in required_bw_nodes - } - return ( - fwd_outputs, - required_fw_nodes, - required_bw_nodes, - unclaimed_nodes, - inputs, - ) - - ( - orig_fw_outputs, - required_fw_nodes, - required_bw_nodes, - unclaimed_nodes, - inputs, - ) = classify_nodes(joint_module) - - # networkx blows up on graphs with no required backward nodes - # Since there's nothing to partition anyway, and the default partitioner can "handle" - # this case, send our graph over to the default partitioner. 
- if len(required_bw_nodes) == 0: - return default_partition( - joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs - ) - - def is_fusible(a, b): - # We can perform "memory fusion" into a cat, but cat cannot be a - # producer to a fusion - if get_aten_target(b) == aten.cat: - return True - return get_aten_target(a) in fusible_ops and get_aten_target(b) in fusible_ops - - fw_order = 0 - for node in joint_module.graph.nodes: - if node in required_fw_nodes: - node.fw_order = fw_order - fw_order += 1 - - for node in reversed(joint_module.graph.nodes): - if node not in required_fw_nodes: - node.dist_from_bw = 0 - else: - node.dist_from_bw = int(1e9) - for user in node.users: - node.dist_from_bw = min(node.dist_from_bw, user.dist_from_bw + 1) - - aten = torch.ops.aten - prims = torch.ops.prims - - # compiler == "nvfuser" is the default set of recomputable ops - default_recomputable_ops = [ - aten.add, - aten.sub, - aten.div, - aten.atan2, - aten.mul, - aten.max, - aten.min, - aten.pow, - aten.remainder, - aten.fmod, - aten.__and__, - aten.__or__, - aten.__xor__, - aten.__lshift__, - aten.__rshift__, - aten.eq, - aten.ne, - aten.ge, - aten.gt, - aten.le, - aten.lt, - aten.abs, - aten.bitwise_not, - aten.ceil, - aten.floor, - aten.frac, - aten.neg, - aten.relu, - aten.round, - aten.silu, - aten.trunc, - aten.log, - aten.log10, - aten.log1p, - aten.log2, - aten.lgamma, - aten.exp, - aten.expm1, - aten.erf, - aten.erfc, - aten.cos, - aten.acos, - aten.cosh, - aten.sin, - aten.asin, - aten.sinh, - aten.tan, - aten.atan, - aten.tanh, - aten.atanh, - aten.sqrt, - aten.rsqrt, - aten.reciprocal, - aten.sigmoid, - aten.softplus, - aten.threshold, - aten.threshold_backward, - aten.clamp, - aten.where, - aten.lerp, - aten.addcmul, - aten.gelu, - aten.gelu_backward, - aten.sum, - aten.mean, - aten._grad_sum_to_size, - aten.sum_to_size, - aten.amax, - aten.to, - aten.type_as, - operator.getitem, - aten.squeeze, - aten.unsqueeze, - aten.rsub, - aten._to_copy, - ] # noqa: E501,B950 - view_ops = [aten.squeeze, aten.unsqueeze, aten.alias] - if compiler == "inductor": - default_recomputable_ops += [ - prims.div, - prims.convert_element_type, - aten.clone, - aten._to_copy, - aten.full_like, - prims.var, - prims.sum, - aten.var, - aten.std, - prims.broadcast_in_dim, - aten.select, - aten._unsafe_view, - aten.view, - aten.expand, - aten.slice, - aten.reshape, - aten.broadcast_tensors, - aten.scalar_tensor, - aten.ones, - aten.new_zeros, - aten.lift_fresh_copy, - aten.arange, - aten.triu, - aten.var_mean, - aten.isinf, - aten.any, - aten.full, - aten.as_strided, - aten.zeros, - aten.argmax, - aten.maximum, - prims.iota, - prims._low_memory_max_pool2d_offsets_to_indices, - ] # noqa: E501,B950 - view_ops += [ - aten.view, - aten.slice, - aten.t, - prims.broadcast_in_dim, - aten.expand, - aten.as_strided, - aten.permute, - ] - # Natalia said that we should allow recomputing indexing :) - default_recomputable_ops += [aten.index, aten.gather] - default_recomputable_ops += view_ops - - default_recomputable_ops += pointwise_ops() - - default_recomputable_ops += [ - aten.zeros_like, - ] - - default_recomputable_ops += [method_to_operator(m) for m in magic_methods] - recomputable_ops = ( - set(recomputable_ops) - if recomputable_ops is not None - else set(default_recomputable_ops) - ) - - random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like] - compute_intensive_ops = [ - aten.mm, - aten.convolution, - aten.convolution_backward, - aten.bmm, - aten.addmm, - aten._scaled_dot_product_flash_attention, - 
aten._scaled_dot_product_efficient_attention, - aten.upsample_bilinear2d, - ] # noqa: E501,B950 +def get_saved_values( + joint_graph: fx.Graph, + node_info: NodeInfo, + min_cut_options: MinCutOptions, + dont_ban=None, +): + if dont_ban is None: + dont_ban = set() + op_types = get_default_op_list() - fusible_ops = recomputable_ops | set(random_ops) if AOT_PARTITIONER_DEBUG: joint_module_ops = { str(node.target._overloadpacket) - for node in joint_module.graph.nodes + for node in joint_graph.nodes if node.op == "call_function" and hasattr(node.target, "_overloadpacket") } - ops_ignored = joint_module_ops - {str(i) for i in recomputable_ops} + ops_ignored = joint_module_ops - {str(i) for i in op_types.recomputable_ops} print("Ops banned from rematerialization: ", ops_ignored) print() - BAN_IF_USED_FAR_APART = config.ban_recompute_used_far_apart - BAN_IF_LONG_FUSIBLE_CHAINS = config.ban_recompute_long_fusible_chains - BAN_IF_MATERIALIZED_BACKWARDS = config.ban_recompute_materialized_backward - BAN_IF_NOT_IN_ALLOWLIST = config.ban_recompute_not_in_allowlist - BAN_IF_REDUCTION = config.ban_recompute_reductions + def is_fusible(a, b): + # We can perform "memory fusion" into a cat, but cat cannot be a + # producer to a fusion + if get_aten_target(b) == aten.cat: + return True + return op_types.is_fusible(a) and op_types.is_fusible(b) - if config.aggressive_recomputation: - BAN_IF_MATERIALIZED_BACKWARDS = False - BAN_IF_USED_FAR_APART = False - BAN_IF_LONG_FUSIBLE_CHAINS = False - BAN_IF_NOT_IN_ALLOWLIST = False + try: + import networkx as nx + except ImportError as e: + raise RuntimeError( + "Need networkx installed to perform smart recomputation " "heuristics" + ) from e def is_materialized_backwards(node): - if get_aten_target(node) in view_ops: + if op_types.is_view(node): return False cur_nodes = {node} while len(cur_nodes) > 0: cur = cur_nodes.pop() for user in cur.users: - if user not in required_fw_nodes and not is_fusible(cur, user): + if not node_info.is_required_fw(user) and not is_fusible(cur, user): return True - if get_aten_target(user) in view_ops: + if op_types.is_view(user): cur_nodes.add(user) return False @@ -1020,17 +804,15 @@ def should_ban_recomputation(node): return False if node.target in [aten.lift_fresh_copy.default, aten.lift_fresh.default]: return False - # NB: "recompute" == 0 means that must save this node. if node.meta.get("recompute", None) == 0: return True - if BAN_IF_NOT_IN_ALLOWLIST: - if get_aten_target(node) not in recomputable_ops: + if min_cut_options.ban_if_not_in_allowlist: + if not op_types.is_recomputable(node): return True else: - ignored_ops = random_ops + compute_intensive_ops - if get_aten_target(node) in ignored_ops: + if op_types.is_random(node) or op_types.is_compute_intensive(node): return True # If a node *must* be materialized in the backwards pass, then we @@ -1038,7 +820,9 @@ def should_ban_recomputation(node): # general, the assumption we make is that recomputing a node in the # backwards pass is "free". However, if a node must be materialized # in the backwards pass, then recomputing it is never free. - if is_materialized_backwards(node) and BAN_IF_MATERIALIZED_BACKWARDS: + if min_cut_options.ban_if_materialized_backward and is_materialized_backwards( + node + ): log.info("materialized backwards: %s %s", node, tuple(node.users)) return True @@ -1046,16 +830,15 @@ def should_ban_recomputation(node): # modification appears to have made this heuristic a lot less critical # for performance. 
# NB: As of PR #121692, this hack no longer seems necessary. - if not graph_has_recomputable_ops: - if compiler == "inductor" and node.dist_from_bw > config.max_dist_from_bw: - return True + if node.dist_from_bw < 1000 and node.dist_from_bw > config.max_dist_from_bw: + return True # If the output of an op is 4x smaller (arbitrary choice), # then we don't allow recomputation. The idea here is that for # things like reductions, saving the output of the reduction is very # cheap/small, and it makes sure we don't do things like recompute # normalizations in the backwards. - if BAN_IF_REDUCTION: + if min_cut_options.ban_if_reduction: input_tensors_size = sum( _size_of(i) for i in node.args if isinstance(i, fx.Node) ) @@ -1069,9 +852,14 @@ def is_materialized(node): return not all(is_fusible(node, user) for user in node.users) - def get_node_weight(node) -> int: + def get_node_weight(node) -> float: mem_sz = _size_of(node) + if isinstance(node.meta["val"], py_sym_types): + # We never want to save symfloats + if not isinstance(node.meta["val"], torch.SymInt): + return INT_INF + # Heuristic to bias towards nodes closer to the backwards pass # Complete guess about current value mem_sz = int(mem_sz * (1.1 ** max(min(node.dist_from_bw, 100), 1))) @@ -1084,6 +872,11 @@ def get_node_weight(node) -> int: banned_nodes = set() def ban_recomputation_if_allowed(node): + if op_types.is_view(node): + return False + if node in dont_ban: + return False + # breakpoint() # This bans recomputation of the node unless we've been forced not to by # user annotation # NB: "recompute" > 0 means that user annotation has asked us to @@ -1106,8 +899,8 @@ def ban_recomputation_if_allowed(node): if node.op == "output": continue - if node in required_bw_nodes: - if node not in inputs: + if node in node_info.required_bw_nodes: + if node not in node_info.inputs: nx_graph.add_edge(node.name + "_in", "sink", capacity=math.inf) continue # If someone saves a input for backward as-is and backward @@ -1126,7 +919,7 @@ def ban_recomputation_if_allowed(node): # If a node can't be recomputed (too expensive or involves randomness), # we prevent it from being recomputed by adding an inf edge to the source # We only need to ban nodes in the fw pass, as those are the only ones that would be recomputed. - if node in required_fw_nodes and should_ban_recomputation(node): + if node_info.is_required_fw(node) and should_ban_recomputation(node): ban_recomputation_if_allowed(node) # Checks if a node is actually a tuple. Can be simplified to just an isinstance check if we always use faketensors. @@ -1135,12 +928,13 @@ def ban_recomputation_if_allowed(node): ) or ("val" in node.meta and not isinstance(node.meta["val"], torch.Tensor)) if is_sym_node(node): - weight = sym_node_size(node) + weight = float(sym_node_size(node)) elif is_non_tensor_node: - weight = 0 if isinstance(node.meta.get("val"), BackwardState) else math.inf + weight = ( + 0.0 if isinstance(node.meta.get("val"), BackwardState) else math.inf + ) else: weight = get_node_weight(node) - # Creates the weights on the "node" edge nx_graph.add_edge(node.name + "_in", node.name + "_out", capacity=weight) for user in node.users: @@ -1168,35 +962,40 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: Finds the first unfusible node in the chain of nodes starting from `start_nodes` and returns its position. 
""" - sorted_nodes = [] + sorted_nodes: List[Tuple[int, fx.Node, bool]] = [] for n in start_nodes: - heapq.heappush(sorted_nodes, (n.fw_order, n, True)) + heapq.heappush(sorted_nodes, (node_info.get_fw_order(n), n, True)) while len(sorted_nodes) > 0: _, node, node_is_fusible = heapq.heappop(sorted_nodes) if not node_is_fusible: - return node.fw_order + return node_info.get_fw_order(node) for user in node.users: - if user in required_fw_nodes: - if user.fw_order > max_range: + if node_info.is_required_fw(user): + if node_info.get_fw_order(user) > max_range: continue heapq.heappush( - sorted_nodes, (user.fw_order, user, is_fusible(node, user)) + sorted_nodes, + (node_info.get_fw_order(user), user, is_fusible(node, user)), ) return max_range - if BAN_IF_USED_FAR_APART: - for used_node in required_fw_nodes: + if min_cut_options.ban_if_used_far_apart: + for used_node in node_info.required_fw_nodes: orders = [ - user.fw_order for user in used_node.users if user in required_fw_nodes + node_info.get_fw_order(user) + for user in used_node.users + if node_info.is_required_fw(user) + ] + fw_users = [ + user for user in used_node.users if node_info.is_required_fw(user) ] - fw_users = [user for user in used_node.users if user in required_fw_nodes] if len(orders) > 0: first_unfusible_use = find_first_unfusible(fw_users, max(orders)) for user in tuple(used_node.users): if ( - user in required_fw_nodes - and user.fw_order > first_unfusible_use + node_info.is_required_fw(user) + and node_info.get_fw_order(user) > first_unfusible_use and is_fusible(used_node, user) ): if user in banned_nodes: @@ -1204,10 +1003,10 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: log.info( "used above/below fusible %s:(%s) -> %s -> %s:(%s)", used_node, - used_node.fw_order, + node_info.get_fw_order(used_node), first_unfusible_use, user, - user.fw_order, + node_info.get_fw_order(user), ) ban_recomputation_if_allowed(user) @@ -1222,47 +1021,51 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: # Some models it improves perf on are cait_m36_384, mixer_b16_224, poolformer_m36 - if BAN_IF_LONG_FUSIBLE_CHAINS: + if min_cut_options.ban_if_long_fusible_chains: visited = set() for start_node in joint_graph.nodes: - if start_node not in required_fw_nodes: + if not node_info.is_required_fw(start_node): continue - fusible = [(start_node.fw_order, start_node)] - start_order = start_node.fw_order + fusible = [(node_info.get_fw_order(start_node), start_node)] + start_order = node_info.get_fw_order(start_node) while len(fusible) > 0: _, cur = heapq.heappop(fusible) if cur in visited: continue visited.add(cur) # 100 is arbitrary choice to try and prevent degenerate cases - if cur.fw_order > start_order + 100 and len(fusible) == 0: + if ( + node_info.get_fw_order(cur) > start_order + 100 + and len(fusible) == 0 + ): log.info( "too long %s %s %s %s", cur, start_node, - cur.fw_order, - start_node.fw_order, + node_info.get_fw_order(cur), + node_info.get_fw_order(start_node), ) ban_recomputation_if_allowed(cur) break for user in cur.users: if ( - user in required_fw_nodes + node_info.is_required_fw(user) and is_fusible(cur, user) and user not in banned_nodes ): - heapq.heappush(fusible, (user.fw_order, user)) + heapq.heappush(fusible, (node_info.get_fw_order(user), user)) try: cut_value, partition = nx.minimum_cut(nx_graph, "source", "sink") except Exception: print("Failed to compute min-cut on following graph:") print("\n".join(nx.readwrite.edgelist.generate_edgelist(nx_graph))) + 
visualize_min_cut_graph(nx_graph) raise reachable, non_reachable = partition - cutset = set() + cutset: Set[Tuple[str, str]] = set() for u, nbrs in ((n, nx_graph[n]) for n in reachable): cutset.update((u, v) for v in nbrs if v in non_reachable) @@ -1272,14 +1075,347 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: node_name = node_in[:-3] cut_nodes.add(node_name) + name_to_node = get_name_to_node(joint_graph) # To make this stuff deterministic - node_idx = {node: idx for idx, node in enumerate(joint_module.graph.nodes)} + node_idx = {node: idx for idx, node in enumerate(joint_graph.nodes)} saved_values = sorted( (name_to_node[node] for node in cut_nodes), key=lambda x: node_idx[x] ) + return saved_values, banned_nodes + + +def visualize_min_cut_graph(nx_graph): + import networkx as nx + import pydot + + dot_format = nx.nx_pydot.to_pydot(nx_graph).to_string() + dot_graph = pydot.graph_from_dot_data(dot_format)[0] + for edge in dot_graph.get_edges(): + weight = nx_graph[edge.get_source()][edge.get_destination()]["capacity"] + # Set edge label to weight + edge.set_label(str(weight)) + # Color edges with weight 'inf' as red + if weight == float("inf"): + edge.set_color("red") + print("Visualizing the failed graph to min_cut_failed.svg") + dot_graph.write_svg("min_cut_failed.svg") + + +def get_default_op_list() -> OpTypes: + default_recomputable_ops: List[Callable] = [ + aten.add, + aten.sub, + aten.div, + aten.atan2, + aten.mul, + aten.max, + aten.min, + aten.pow, + aten.remainder, + aten.fmod, + aten.__and__, + aten.__or__, + aten.__xor__, + aten.__lshift__, + aten.__rshift__, + aten.eq, + aten.ne, + aten.ge, + aten.gt, + aten.le, + aten.lt, + aten.abs, + aten.bitwise_not, + aten.ceil, + aten.floor, + aten.frac, + aten.neg, + aten.relu, + aten.round, + aten.silu, + aten.trunc, + aten.log, + aten.log10, + aten.log1p, + aten.log2, + aten.lgamma, + aten.exp, + aten.expm1, + aten.erf, + aten.erfc, + aten.cos, + aten.acos, + aten.cosh, + aten.sin, + aten.asin, + aten.sinh, + aten.tan, + aten.atan, + aten.tanh, + aten.atanh, + aten.sqrt, + aten.rsqrt, + aten.reciprocal, + aten.sigmoid, + aten.softplus, + aten.threshold, + aten.threshold_backward, + aten.clamp, + aten.where, + aten.lerp, + aten.addcmul, + aten.gelu, + aten.gelu_backward, + aten.sum, + aten.mean, + aten._grad_sum_to_size, + aten.sum_to_size, + aten.amax, + aten.to, + aten.type_as, + operator.getitem, + aten.squeeze, + aten.unsqueeze, + aten.rsub, + aten._to_copy, + ] # noqa: E501,B950 + recomputable_view_ops = [aten.squeeze, aten.unsqueeze, aten.alias] + recomputable_view_ops += [ + aten.view, + aten.slice, + aten.t, + prims.broadcast_in_dim, + aten.expand, + aten.as_strided, + aten.permute, + ] + view_ops = recomputable_view_ops + default_recomputable_ops += [ + prims.div, + prims.convert_element_type, + aten.clone, + aten._to_copy, + aten.full_like, + prims.var, + prims.sum, + aten.var, + aten.std, + prims.broadcast_in_dim, + aten.select, + aten._unsafe_view, + aten.view, + aten.expand, + aten.slice, + aten.reshape, + aten.broadcast_tensors, + aten.scalar_tensor, + aten.ones, + aten.new_zeros, + aten.lift_fresh_copy, + aten.arange, + aten.triu, + aten.var_mean, + aten.isinf, + aten.any, + aten.full, + aten.as_strided, + aten.zeros, + aten.argmax, + aten.maximum, + prims.iota, + prims._low_memory_max_pool2d_offsets_to_indices, + ] # noqa: E501,B950 + # Natalia said that we should allow recomputing indexing :) + default_recomputable_ops += [aten.index, aten.gather] + default_recomputable_ops += view_ops 
+ + default_recomputable_ops += pointwise_ops() + + default_recomputable_ops += [ + aten.zeros_like, + ] + + default_recomputable_ops += [method_to_operator(m) for m in magic_methods] + recomputable_ops = set(default_recomputable_ops) + + random_ops = [aten.native_dropout, aten.rand_like, aten.randn_like] + compute_intensive_ops = [ + aten.mm, + aten.convolution, + aten.convolution_backward, + aten.bmm, + aten.addmm, + aten._scaled_dot_product_flash_attention, + aten._scaled_dot_product_efficient_attention, + aten.upsample_bilinear2d, + ] # noqa: E501,B950 + + fusible_ops = recomputable_ops | set(random_ops) + return OpTypes( + set(fusible_ops), + set(compute_intensive_ops), + set(random_ops), + set(view_ops), + set(recomputable_ops), + ) + + +def get_name_to_node(graph: fx.Graph): + name_to_node = {} + for node in graph.nodes: + name_to_node[node.name] = node + return name_to_node + + +def choose_saved_values_set( + joint_graph: fx.Graph, node_info: NodeInfo, memory_budget=1 +) -> List[fx.Node]: + min_cut_options = MinCutOptions( + ban_if_used_far_apart=config.ban_recompute_used_far_apart, + ban_if_long_fusible_chains=config.ban_recompute_long_fusible_chains, + ban_if_materialized_backward=config.ban_recompute_materialized_backward, + ban_if_not_in_allowlist=config.ban_recompute_not_in_allowlist, + ban_if_reduction=config.ban_recompute_reductions, + ) + + if config.aggressive_recomputation: + min_cut_options = replace( + min_cut_options, + ban_if_used_far_apart=False, + ban_if_long_fusible_chains=False, + ban_if_materialized_backward=False, + ban_if_not_in_allowlist=False, + ) + + if memory_budget == 0: + return node_info.inputs + + runtime_optimized_saved_values, _ = get_saved_values( + joint_graph, + node_info, + min_cut_options, + ) + return runtime_optimized_saved_values + + +def min_cut_rematerialization_partition( + joint_module: fx.GraphModule, + _joint_inputs, + compiler="inductor", + *, + num_fwd_outputs, +) -> Tuple[fx.GraphModule, fx.GraphModule]: + """ + Partitions the joint graph such that the backward recomputes the forward. + Recomputing helps in trading off memory bandwidth with computation. + + To create the fwd and bwd graph, we copy the joint graph, manually set the + outputs to just original forward or backward outputs. And then we run the + resulting graphs through dead code elimination. + + .. warning:: + This API is experimental and likely to change. + + Args: + joint_module(fx.GraphModule): The joint forward and backward graph. This + is the result of AOT Autograd tracing. + _joint_inputs: The inputs to the joint graph. This is unused. + compiler: This option determines the default set of recomputable ops. + Currently, there are two options: ``nvfuser`` and ``inductor``. + recomputable_ops: This is an optional set of recomputable ops. If this + is not None, then this set of ops will be used instead of the + default set of ops. + num_fwd_outputs: The number of outputs from the forward graph. + + Returns: + Returns the generated forward and backward Fx graph modules. 
+ """ + + joint_module.graph.eliminate_dead_code() + joint_module.recompile() + + fx_g = joint_module.graph + + # add the CSE pass + if config.cse: + cse_graph = fx_graph_cse(fx_g) + joint_module.graph = cse_graph + joint_graph = joint_module.graph + + graph_has_recomputable_ops = has_recomputable_ops(joint_module) + graph_has_recomputable_rng_ops = has_recomputable_rng_ops(joint_module) + if graph_has_recomputable_ops: + joint_module = cleanup_recompute_tags(joint_module) + + def classify_nodes(joint_module): + name_to_node = get_name_to_node(joint_module.graph) + required_bw_nodes = set() + for node in joint_module.graph.nodes: + if node.op == "placeholder" and "tangents" in node.target: + required_bw_nodes.add(node) + if node in required_bw_nodes: + for user in node.users: + required_bw_nodes.add(user) + + primal_inputs = list(filter(_is_primal, joint_module.graph.nodes)) + fwd_seed_offset_inputs = list( + filter(_is_fwd_seed_offset, joint_module.graph.nodes) + ) + inputs = primal_inputs + fwd_seed_offset_inputs + fwd_outputs, bwd_outputs = _extract_fwd_bwd_outputs( + joint_module, num_fwd_outputs=num_fwd_outputs + ) + required_bw_nodes.update( + o for o in bwd_outputs if o is not None and o.op != "output" + ) + forward_only_graph = _extract_graph_with_inputs_outputs( + joint_module.graph, inputs, fwd_outputs + ) + required_fw_nodes: Set[fx.Node] = { + name_to_node[node.name] + for node in forward_only_graph.nodes + if node.op != "output" + } + unclaimed_nodes = { + node + for node in joint_module.graph.nodes + if node not in required_fw_nodes and node not in required_bw_nodes + } + fw_cnt = 0 + fw_order = {} + for node in joint_module.graph.nodes: + if node in required_fw_nodes: + fw_order[node] = fw_cnt + fw_cnt += 1 + return NodeInfo( + inputs, required_fw_nodes, required_bw_nodes, unclaimed_nodes, fw_order + ) + + node_info = classify_nodes(joint_module) + + # networkx blows up on graphs with no required backward nodes + # Since there's nothing to partition anyway, and the default partitioner can "handle" + # this case, send our graph over to the default partitioner. 
+ if len(node_info.required_bw_nodes) == 0: + return default_partition( + joint_module, _joint_inputs, num_fwd_outputs=num_fwd_outputs + ) + + for node in reversed(joint_module.graph.nodes): + if node.op == "output": + node.dist_from_bw = int(1e9) + elif not node_info.is_required_fw(node): + node.dist_from_bw = 0 + else: + node.dist_from_bw = int(1e9) + for user in node.users: + node.dist_from_bw = min(node.dist_from_bw, user.dist_from_bw + 1) + + saved_values = choose_saved_values_set(joint_graph, node_info, memory_budget=1) # save_for_backward on tensors and stashes symints in autograd .ctx saved_sym_nodes = list(filter(is_sym_node, saved_values)) saved_values = list(filter(lambda n: not is_sym_node(n), saved_values)) + # NB: saved_sym_nodes will be mutated to reflect the actual saved symbols fw_module, bw_module = _extract_fwd_bwd_modules( joint_module, @@ -1312,7 +1448,7 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: } remat_nodes = fw_module_nodes & bw_module_nodes - counts = defaultdict(int) + counts: Dict[str, int] = defaultdict(int) for node in fw_module.graph.nodes: if node.name in remat_nodes and hasattr(node.target, "_overloadpacket"): counts[str(node.target._overloadpacket)] += 1 @@ -1321,7 +1457,7 @@ def find_first_unfusible(start_nodes: List[fx.Node], max_range: int) -> int: ) print( "Count of Ops Rematerialized: ", - sorted(counts.items(), key=operator.itemgetter(1), reverse=True), + sorted(counts.items(), key=lambda x: x[1], reverse=True), ) return fw_module, bw_module @@ -1331,7 +1467,7 @@ def draw_graph( fname: str, figname: str = "fx_graph", clear_meta: bool = True, - prog: Union[str, List[str]] = None, + prog: Optional[Union[str, List[str]]] = None, parse_stack_trace: bool = False, dot_graph_shape: Optional[str] = None, ) -> None: @@ -1342,7 +1478,7 @@ def draw_graph( node.meta = {} base, ext = os.path.splitext(fname) if not ext: - ext = ".svg" + ext = "." 
+ config.torch_compile_graph_format print(f"Writing FX graph to file: {base}{ext}") g = graph_drawer.FxGraphDrawer( traced, @@ -1357,13 +1493,3 @@ def draw_graph( write_method(fname) else: write_method(fname, prog=prog) - - -def draw_joint_graph( - graph: torch.fx.GraphModule, - joint_inputs, - file_name: str = "full_graph.png", - dot_graph_shape: Optional[str] = None, -): - draw_graph(graph, file_name, dot_graph_shape=dot_graph_shape) - return default_partition(graph, joint_inputs) diff --git a/torch/_higher_order_ops/associative_scan.py b/torch/_higher_order_ops/associative_scan.py index 287e59ea0093..8b406f39a64d 100644 --- a/torch/_higher_order_ops/associative_scan.py +++ b/torch/_higher_order_ops/associative_scan.py @@ -110,16 +110,12 @@ def add(x: torch.Tensor, y: torch.Tensor): def trace_associative_scan( proxy_mode, func_overload, combine_fn: Callable, input: List[torch.Tensor], dim: int ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): sample_inputs = [ torch.full((), False, dtype=x.dtype, device=x.device) for x in itertools.chain(input, input) ] - combine_graph = reenter_make_fx(combine_fn, pre_dispatch=pre_dispatch)( - *sample_inputs - ) + combine_graph = reenter_make_fx(combine_fn)(*sample_inputs) outputs = None for node in combine_graph.graph.nodes: diff --git a/torch/_higher_order_ops/cond.py b/torch/_higher_order_ops/cond.py index 40aee90affcc..359feb192ae5 100644 --- a/torch/_higher_order_ops/cond.py +++ b/torch/_higher_order_ops/cond.py @@ -29,7 +29,6 @@ from torch._subclasses.fake_tensor import FakeTensorMode from torch.fx.experimental.proxy_tensor import ( _temp_remove_pre_dispatch_torch_function_mode, - disable_proxy_modes_tracing, ProxyTorchDispatchMode, track_tensor_tree, ) @@ -159,11 +158,8 @@ def trace_cond(proxy_mode, func_overload, pred, true_fn, false_fn, operands): isinstance(o, torch.Tensor) for o in operands ), "Cond operands must be a list of tensors" - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - - with disable_proxy_modes_tracing(): - true_graph = reenter_make_fx(true_fn, pre_dispatch)(*operands) - false_graph = reenter_make_fx(false_fn, pre_dispatch)(*operands) + true_graph = reenter_make_fx(true_fn)(*operands) + false_graph = reenter_make_fx(false_fn)(*operands) true_outs = [] false_outs = [] diff --git a/torch/_higher_order_ops/flex_attention.py b/torch/_higher_order_ops/flex_attention.py index 664bfe1c4dd0..f4586a0a57b0 100644 --- a/torch/_higher_order_ops/flex_attention.py +++ b/torch/_higher_order_ops/flex_attention.py @@ -6,6 +6,7 @@ from torch._higher_order_ops.utils import ( _has_potential_branch_input_mutation, autograd_not_implemented, + reenter_make_fx, UnsupportedAliasMutationException, ) from torch._ops import HigherOrderOperator @@ -178,7 +179,7 @@ def trace_flex_attention( torch.zeros((), dtype=query.dtype, requires_grad=query.requires_grad) ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] with TransformGetItemToIndex(): - score_graph = make_fx(score_mod)(*example_vals, *other_buffers) + score_graph = reenter_make_fx(score_mod)(*example_vals, *other_buffers) qualname = proxy_mode.tracer.get_fresh_qualname("sdpa_score") proxy_mode.tracer.root.register_module(qualname, score_graph) node_args = (query, key, value, score_graph, *other_buffers) @@ -405,17 +406,20 @@ def flex_attention_autograd( score_mod: Callable, *other_buffers: Tuple[torch.Tensor, ...], ) -> Tuple[torch.Tensor, torch.Tensor]: - input_requires_grad = any(t.requires_grad for t in (query, key, value)) - if 
torch.is_grad_enabled() and input_requires_grad: - example_vals = [ - torch.zeros((), dtype=query.dtype, requires_grad=input_requires_grad) - ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] - fw_graph, bw_graph = create_fw_bw_graph(score_mod, example_vals, other_buffers) - else: - fw_graph, bw_graph = score_mod, None - out, logsumexp = FlexAttentionAutogradOp.apply( - query, key, value, fw_graph, bw_graph, *other_buffers - ) + with TransformGetItemToIndex(): + input_requires_grad = any(t.requires_grad for t in (query, key, value)) + if torch.is_grad_enabled() and input_requires_grad: + example_vals = [ + torch.zeros((), dtype=query.dtype, requires_grad=input_requires_grad) + ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] + fw_graph, bw_graph = create_fw_bw_graph( + score_mod, example_vals, other_buffers + ) + else: + fw_graph, bw_graph = score_mod, None + out, logsumexp = FlexAttentionAutogradOp.apply( + query, key, value, fw_graph, bw_graph, *other_buffers + ) return out, logsumexp @@ -448,9 +452,10 @@ def sdpa_dense_backward( score_mod = torch.vmap(score_mod, in_dims=(0, None, 0, None, None) + in_dim_buffers) score_mod = torch.vmap(score_mod, in_dims=(0, 0, None, None, None) + in_dim_buffers) - post_mod_scores = score_mod(scores, b, h, m, n, *other_buffers).to( - working_precision - ) + with TransformGetItemToIndex(): + post_mod_scores = score_mod(scores, b, h, m, n, *other_buffers).to( + working_precision + ) softmax_scores = torch.exp(post_mod_scores - logsumexp.unsqueeze(-1)) @@ -484,9 +489,10 @@ def sdpa_dense_backward( in_dims=(0, 0, None, None, None, 0) + in_dim_buffers, out_dims=out_dims, ) - grad_scores, *_ = joint_score_mod( - scores, b, h, m, n, grad_score_mod, *other_buffers - ) + with TransformGetItemToIndex(): + grad_scores, *_ = joint_score_mod( + scores, b, h, m, n, grad_score_mod, *other_buffers + ) grad_scores = grad_scores.to(query.dtype) grad_query = grad_scores @ key @@ -523,8 +529,9 @@ def trace_flex_attention_backward( torch.zeros((), dtype=query.dtype, requires_grad=query.requires_grad) ] + [torch.zeros((), dtype=torch.int) for _ in range(4)] bw_example_vals = fw_example_vals + [torch.zeros((), dtype=query.dtype)] - fw_graph = make_fx(fw_graph)(*fw_example_vals, *other_buffers) - joint_graph = make_fx(joint_graph)(*bw_example_vals, *other_buffers) + with TransformGetItemToIndex(): + fw_graph = reenter_make_fx(fw_graph)(*fw_example_vals, *other_buffers) + joint_graph = reenter_make_fx(joint_graph)(*bw_example_vals, *other_buffers) proxy_mode.tracer.root.register_module("fw_graph", fw_graph) proxy_mode.tracer.root.register_module("joint_graph", joint_graph) node_args = ( diff --git a/torch/_higher_order_ops/map.py b/torch/_higher_order_ops/map.py index 6bef897dfa51..2bf88ea19565 100644 --- a/torch/_higher_order_ops/map.py +++ b/torch/_higher_order_ops/map.py @@ -230,8 +230,7 @@ def trace_map(proxy_mode, func_overload, f, xs, pos_args): example_input = _unstack_pytree(xs)[0] body_graph = f - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - body_graph = reenter_make_fx(body_graph, pre_dispatch)(*example_input, *pos_args) + body_graph = reenter_make_fx(body_graph)(*example_input, *pos_args) next_name = proxy_mode.tracer.get_fresh_qualname("body_graph_") diff --git a/torch/_higher_order_ops/utils.py b/torch/_higher_order_ops/utils.py index 32bb465041ce..0fcf22bcc338 100644 --- a/torch/_higher_order_ops/utils.py +++ b/torch/_higher_order_ops/utils.py @@ -1,3 +1,4 @@ +import functools from contextlib import contextmanager from dataclasses 
import dataclass from typing import Any, Callable @@ -76,16 +77,19 @@ def graph_with_interpreter(*args): return maybe_interpreted_fn -# We'll use the current decomposition table to make sure operators in subgraphs are -# decomposed properly. -# We also need to maybe run with interpreter for propagating stack_trace -def reenter_make_fx(fn, pre_dispatch=False): - decomp_table = torch.fx.experimental.proxy_tensor.CURRENT_DECOMPOSITION_TABLE - return make_fx( - _maybe_run_with_interpreter(fn), - decomposition_table=decomp_table, - pre_dispatch=pre_dispatch, - ) +def reenter_make_fx(fn): + from torch.fx.experimental.proxy_tensor import _CURRENT_MAKE_FX_TRACER + + @functools.wraps(fn) + def wrapped(*args): + assert ( + _CURRENT_MAKE_FX_TRACER is not None + ), "Cannot reenter make_fx when we're not under a make_fx tracing session" + return _CURRENT_MAKE_FX_TRACER.trace_subgraph( + _maybe_run_with_interpreter(fn), *args + ) + + return wrapped @contextmanager diff --git a/torch/_higher_order_ops/while_loop.py b/torch/_higher_order_ops/while_loop.py index 15bacb4bc194..b0ab00bdfac4 100644 --- a/torch/_higher_order_ops/while_loop.py +++ b/torch/_higher_order_ops/while_loop.py @@ -15,11 +15,7 @@ ) from torch._ops import HigherOrderOperator from torch._subclasses.fake_tensor import FakeTensorMode -from torch.fx.experimental.proxy_tensor import ( - disable_proxy_modes_tracing, - ProxyTorchDispatchMode, - track_tensor_tree, -) +from torch.fx.experimental.proxy_tensor import ProxyTorchDispatchMode, track_tensor_tree class WhileLoopOp(HigherOrderOperator): @@ -189,14 +185,8 @@ def while_loop_tracing(mode, cond_fn, body_fn, carried_inputs, additional_inputs def _trace_while_loop( proxy_mode, while_loop_op, cond_fn, body_fn, carried_inputs, additional_inputs ): - pre_dispatch = getattr(proxy_mode, "pre_dispatch", False) - with disable_proxy_modes_tracing(): - cond_graph = reenter_make_fx(cond_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) - body_graph = reenter_make_fx(body_fn, pre_dispatch)( - *carried_inputs, *additional_inputs - ) + cond_graph = reenter_make_fx(cond_fn)(*carried_inputs, *additional_inputs) + body_graph = reenter_make_fx(body_fn)(*carried_inputs, *additional_inputs) next_name = None i = 0 diff --git a/torch/_inductor/__init__.py b/torch/_inductor/__init__.py index 5ea494fc7b10..0d7cd8cece49 100644 --- a/torch/_inductor/__init__.py +++ b/torch/_inductor/__init__.py @@ -76,10 +76,12 @@ def aot_compile( flat_args_with_path, received_spec = pytree.tree_flatten_with_path( (args, kwargs or {}) ) - flat_example_inputs = tuple(x[1] for x in flat_args_with_path) + flat_example_inputs = tuple( + x[1] for x in flat_args_with_path if isinstance(x[1], torch.Tensor) + ) if in_spec is not None and received_spec != in_spec: - raise ValueError( # noqa: TRY200 + raise ValueError( # noqa: B904 "Trying to flatten user inputs with exported input tree spec: \n" f"{in_spec}\n" "but actually got inputs with tree spec of: \n" diff --git a/torch/_inductor/codecache.py b/torch/_inductor/codecache.py index 5ac418b847f8..70b467143111 100644 --- a/torch/_inductor/codecache.py +++ b/torch/_inductor/codecache.py @@ -734,11 +734,12 @@ def _get_tmp_dir_for_key(key: str) -> str: return os.path.join(FxGraphCache._get_tmp_dir(), key[1:3], key) @staticmethod - def _filter_symints(inputs: List[Any]) -> List[torch.SymInt]: + def _filter_backed_symints(inputs: List[Any]) -> List[torch.SymInt]: """ - Get the SymInt objects from the input list. + Get the backed SymInt objects from the input list. 
Note that we can never + have guards that depend on unbacked symint. """ - return [s for s in inputs if isinstance(s, torch.SymInt)] + return [s for s in inputs if isinstance(s, torch.SymInt) and has_hint(s)] @staticmethod def _get_shape_env() -> Optional[ShapeEnv]: @@ -764,8 +765,7 @@ def _lookup_graph( shape_env = FxGraphCache._get_shape_env() assert shape_env is not None - symints = FxGraphCache._filter_symints(example_inputs) - assert all(has_hint(s) for s in symints) + symints = FxGraphCache._filter_backed_symints(example_inputs) hints = [hint_int(s) for s in symints] def iterate_over_candidates() -> Generator[CompiledFxGraph, None, None]: @@ -885,7 +885,7 @@ def _save_graph( # Tensor arg with a symbolic shape will have a SymInt arg for the graph. shape_env = FxGraphCache._get_shape_env() assert shape_env is not None - symints = FxGraphCache._filter_symints(example_inputs) + symints = FxGraphCache._filter_backed_symints(example_inputs) disk_compiled_graph.guards_expr = shape_env.produce_guards_expression(symints) try: @@ -1961,6 +1961,14 @@ def _compile_consts_linux(consts: bytes) -> str: return consts_o def _compile_consts_darwin(consts: bytes) -> str: + if config.aot_inductor.debug_dump_consts_bin: + _, _binary_constants_path = write( + consts, + "bin", + specified_dir=specified_output_path, + ) + log.debug("binary constants path: %s", _binary_constants_path) + is_large_consts = len(consts) > 1024 consts_asm = "\t.section\t__DATA,__data\n" consts_asm += "\t.globl\t__binary_constants_bin_start\n" diff --git a/torch/_inductor/codegen/common.py b/torch/_inductor/codegen/common.py index 0d90e474d04b..8641f89a7d3a 100644 --- a/torch/_inductor/codegen/common.py +++ b/torch/_inductor/codegen/common.py @@ -601,6 +601,8 @@ class OverridesData: ) +# NB: if you add a new special function, don't forget to update +# torch._inductor.ops_handler too pointwise_overrides_data: Dict[str, OverridesData] = dict( airy_ai=OverridesData( type_promotion_kind=ELEMENTWISE_TYPE_PROMOTION_KIND.INT_TO_FLOAT, diff --git a/torch/_inductor/codegen/cpp.py b/torch/_inductor/codegen/cpp.py index e12a72d11601..a0beddbf9bd3 100644 --- a/torch/_inductor/codegen/cpp.py +++ b/torch/_inductor/codegen/cpp.py @@ -8,7 +8,7 @@ import sys from copy import copy, deepcopy from enum import Enum -from typing import Any, cast, Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, List, Optional, Set, Tuple, Union import sympy @@ -17,10 +17,9 @@ from torch._inductor import dependencies from torch._prims_common import is_float_dtype from torch.utils import _pytree as pytree -from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.functions import CeilDiv, FloorDiv, ModularIndexing from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT from torch.utils._sympy.value_ranges import bound_sympy, ValueRanges -from ..._dynamo.utils import counters from .. 
import codecache, config, ir, metrics from ..codegen.wrapper import WrapperCodeGen @@ -1503,6 +1502,7 @@ def __init__(self, args, num_threads): self.local_reduction_init = IndentedBuffer() self.local_reduction_stores = IndentedBuffer() self.is_reduction = False + self.non_parallel_reduction_prefix = IndentedBuffer() self.reduction_cse = CSE(self.newvar_prefix, self.suffix, name_prefix="tmp_acc") self.preloads = IndentedBuffer() self.poststores = IndentedBuffer() @@ -1517,6 +1517,7 @@ def _gen_parallel_reduction_buffers( dtype, reduction_combine_fn=reduction_combine, reduction_init_fn=reduction_init, + welford_weight_reciprocal_vec_fn=None, ): if config.cpp.dynamic_threads and not self.parallel_reduction_prefix: self.parallel_reduction_prefix.writeline( @@ -1553,6 +1554,15 @@ def _gen_parallel_reduction_buffers( "}", ], ) + if ( + reduction_type == "welford_reduce" + and welford_weight_reciprocal_vec_fn + and hasattr(self, "weight_recp_vec_range") + and "vec" in f"{acc_type}" + ): + self.local_reduction_init.writeline( + welford_weight_reciprocal_vec_fn(dtype, num_threads) + ) def get_reduction_var_pattern(self, line: str): return re.search("tmp_acc[0-9]+", line) @@ -1881,6 +1891,8 @@ def get_reduction_code_buffer(loops, buffer="prefix"): prefix = kernel.reduction_prefix if loop.parallel: prefix = prefix + kernel.parallel_reduction_prefix + else: + prefix = prefix + kernel.non_parallel_reduction_prefix return prefix def gen_loops(loops: List[LoopLevel], in_reduction=False): @@ -2319,9 +2331,25 @@ def reduction(self, dtype, src_dtype, reduction_type, value): self.reduction_prefix.writeline( f"{acc_type_vec} {acc_vec} = {self.reduction_init_vec(reduction_type, dtype)};" ) - self.stores.writeline( - f"{acc_vec} = {self.reduction_combine_vec(reduction_type, acc_vec, value)};" + # save the reciprocal of weights for welford reduce if using static shape + reduction_size = functools.reduce( + lambda x, y: x * y, self.ranges[self.reduction_depth :] ) + if reduction_type == "welford_reduce": + reduction_factor = ( + self.tiling_factor if self.tiling_idx >= self.reduction_depth else 1 + ) + self.weight_recp_vec_range = FloorDiv(reduction_size, reduction_factor) + self.non_parallel_reduction_prefix.writeline( + self.welford_weight_reciprocal_vec(dtype, None) + ) + self.stores.writeline( + f"{acc_vec} = {self.reduction_combine_vec(reduction_type, acc_vec, value, True)};" + ) + else: + self.stores.writeline( + f"{acc_vec} = {self.reduction_combine_vec(reduction_type, acc_vec, value)};" + ) self._gen_parallel_reduction_buffers( acc, acc_type, @@ -2335,6 +2363,7 @@ def reduction(self, dtype, src_dtype, reduction_type, value): dtype, reduction_combine_fn=self.reduction_combine_vec, reduction_init_fn=self.reduction_init_vec, + welford_weight_reciprocal_vec_fn=self.welford_weight_reciprocal_vec, ) tmpvar: Union[str, CSEVariable] if self.tiling_idx >= self.reduction_depth: @@ -2436,7 +2465,18 @@ def reduction_acc_type_vec(self, reduction_type, dtype): return vec_type - def reduction_combine_vec(self, reduction_type, var, next_value): + def welford_weight_reciprocal_vec(self, dtype, num_threads=None): + vec_num_range_thread = ( + CeilDiv(self.weight_recp_vec_range, num_threads) + if num_threads + else self.weight_recp_vec_range + ) + vec_num_range_thread_expr = cexpr_index(vec_num_range_thread) + return f"static WeightRecp<{self._get_vec_type(dtype)}> weight_recps({vec_num_range_thread_expr});" + + def reduction_combine_vec( + self, reduction_type, var, next_value, use_weight_recps=False + ): if reduction_type 
== "max": return f"at::vec::maximum({var}, {next_value})" elif reduction_type == "min": @@ -2448,7 +2488,10 @@ def reduction_combine_vec(self, reduction_type, var, next_value): elif reduction_type == "xor_sum": return f"{var} ^ {next_value}" elif reduction_type == "welford_reduce": - return f"welford_combine({var}, {next_value})" + if use_weight_recps: + return f"welford_combine({var}, {next_value}, &weight_recps)" + else: + return f"welford_combine({var}, {next_value})" elif reduction_type == "welford_combine": if isinstance(next_value, tuple): # When reading a value from Inductor IR we have a tuple of variable names @@ -3522,8 +3565,6 @@ def _can_fuse_horizontal_impl(self, node1, node2): return self._why_fuse_nodes(node1, node2) is not None def can_fuse_horizontal(self, node1, node2): - if node1.is_template() or node2.is_template(): - return False if ( len(node1.get_nodes()) + len(node2.get_nodes()) > config.cpp.max_horizontal_fusion_size @@ -3604,9 +3645,6 @@ def get_fusion_pair_priority(self, node1, node2): return 0 def can_fuse_vertical(self, node1, node2): - # TODO(jgong5): support vertical fusion for template nodes - if node1.is_template() or node2.is_template(): - return False return ( self._can_fuse_horizontal_impl(node1, node2) and not node1.is_reduction() ) or self.can_fuse_vertical_outer_loop(node1, node2) @@ -3663,42 +3701,6 @@ def codegen_node( if args_num > CppScheduling.MAX_FUSED_KERNEL_ARGS_NUM: self._set_flush_status(True) - def is_cpp_template(self, node: BaseSchedulerNode) -> bool: - return isinstance(node, SchedulerNode) and isinstance( - node.node, ir.CppTemplateBuffer - ) - - def codegen_template( - self, template_node: BaseSchedulerNode, epilogue_nodes: List[SchedulerNode] - ): - """ - Codegen a CPP template, possibly with fused epilogues - """ - counters["inductor"]["cpp_epilogue_fusion_counter"] += len(epilogue_nodes) - assert self.is_cpp_template( - template_node - ), "Template node passed to CppScheduler.codegen_template must be a SchedulerNode that wraps a CppTemplateBuffer" - template_node = cast(SchedulerNode, template_node) - _, (_, rnumel) = template_node.group - assert rnumel == () - ctb: ir.CppTemplateBuffer = cast(ir.CppTemplateBuffer, template_node.node) - epilogue_ir_nodes: List[ir.Buffer] = [n.node for n in epilogue_nodes] - assert all( - isinstance(n, ir.ComputedBuffer) for n in epilogue_ir_nodes - ), "Epilogue nodes must all be instances of ir.ComputedBuffer" - kernel, render = ctb.make_kernel_render(ctb, epilogue_nodes=epilogue_ir_nodes) - with kernel: - for node in [template_node, *epilogue_nodes]: - node.mark_run() - src_code = render() - - with V.set_kernel_handler(kernel): - node_schedule = [template_node, *epilogue_nodes] - kernel_name = self.define_kernel(src_code, node_schedule, kernel.args) - kernel.call_kernel(kernel_name, ctb) - V.graph.removed_buffers |= kernel.removed_buffers - self.scheduler.free_buffers() - def _get_scheduled_num_args(self): return self.kernel_group.get_num_args() @@ -3708,7 +3710,7 @@ def ready_to_flush(self): def codegen_sync(self): pass - def define_kernel(self, src_code, nodes, kernel_args=None): + def define_kernel(self, src_code, nodes): wrapper = V.graph.wrapper_code fused_name = ( get_fused_kernel_name(nodes, config.cpp.descriptive_names) @@ -3724,8 +3726,7 @@ def define_kernel(self, src_code, nodes, kernel_args=None): src_code = src_code.replace("#pragma CMT", "//") compile_wrapper = IndentedBuffer() - args = self.kernel_group.args if kernel_args is None else kernel_args - _, _, arg_types = args.cpp_argdefs() 
+ _, _, arg_types = self.kernel_group.args.cpp_argdefs() if not V.graph.cpp_wrapper: compile_wrapper.writeline(f"async_compile.cpp_pybinding({arg_types!r}, '''") compile_wrapper.splice(src_code, strip=True) diff --git a/torch/_inductor/codegen/cpp_gemm_template.py b/torch/_inductor/codegen/cpp_gemm_template.py deleted file mode 100644 index c623f262b015..000000000000 --- a/torch/_inductor/codegen/cpp_gemm_template.py +++ /dev/null @@ -1,372 +0,0 @@ -from typing import cast, List, Optional - -import torch -import torch.utils -from .. import ir, lowering as L - -from ..kernel.mm_common import mm_args -from ..select_algorithm import DataProcessorTemplateWrapper -from ..utils import cache_on_self, has_free_symbols, parallel_num_threads -from ..virtualized import V -from .cpp_micro_gemm import create_micro_gemm -from .cpp_template import CppTemplate - -from .cpp_template_kernel import CppTemplateKernel -from .cpp_utils import GemmBlocking - -GEMM_TEMPLATE = r""" -{{template.header().getvalue()}} - -{{micro_gemm.codegen_define(kernel)}} - -extern "C" -{{kernel.def_kernel(inputs={"X": X, "W": W, "inp": inp}, outputs={"Y": Y})}} -{ - {{kernel.maybe_codegen_profile()}} - constexpr int64_t num_threads = {{num_threads}}; - constexpr int64_t N = {{kernel.size(Y, 1)}}; - constexpr int64_t K = {{kernel.size(X, 1)}}; - constexpr int64_t M0 = {{micro_gemm.register_blocking.block_m}}; - constexpr int64_t N0 = {{micro_gemm.register_blocking.block_n}}; - constexpr int64_t K0 = {{micro_gemm.register_blocking.block_k}}; - constexpr int64_t N0_blocks = (N + N0 - 1) / N0; - constexpr int64_t K0_blocks = (K + K0 - 1) / K0; - - static_assert(N % N0 == 0, "N dimension must be multiple of N0"); - - // TODO(jgong5): improve cache blocking with CPU info (Mc, Kc) - {%- if is_dynamic_M %} - const int64_t M = {{kernel.size(Y, 0)}}; - const int64_t M0_blocks = (M + M0 - 1) / M0; - {%- if num_threads > 1 %} - const auto [Mt_blocks, Nt_blocks, Kt_blocks] = mm_get_thread_blocking(M, N, K, M0, N0, K0, num_threads); - {%- else %} - const auto Mt_blocks = M0_blocks; - const auto Nt_blocks = N0_blocks; - const auto Kt_blocks = K0_blocks; - {%- endif %} - const int64_t Mc_blocks = Mt_blocks; - const int64_t Kc_blocks = Kt_blocks; - {%- else %} - constexpr int64_t M = {{kernel.size(Y, 0)}}; - constexpr int64_t M0_blocks = (M + M0 - 1) / M0; - constexpr int64_t Mt_blocks = {{template.thread_blocking().block_m}}; - constexpr int64_t Nt_blocks = {{template.thread_blocking().block_n}}; - constexpr int64_t Kt_blocks = {{template.thread_blocking().block_k}}; - constexpr int64_t Mc_blocks = {{template.cache_blocking().block_m}}; - constexpr int64_t Kc_blocks = {{template.cache_blocking().block_k}}; - {%- endif %} - - // TODO(jgong5): support k-slicing - {{kernel.assert_function}}(Kt_blocks == K0_blocks, "Do not support k slicing yet."); - // make sure all partitions are assigned - {{kernel.assert_function}}( - Mt_blocks * Nt_blocks * Kt_blocks * {{num_threads}} >= M0_blocks * N0_blocks * K0_blocks, - "Not all partitions are assigned." 
- ); - - {%- if num_threads > 1 %} - #pragma omp parallel num_threads({{num_threads}}) - { - int tid = omp_get_thread_num(); - int64_t m_block_start, m_block_end, n_block_start, n_block_end, k_block_start, k_block_end; - mm_get_thread_blocks( - tid, M0_blocks, N0_blocks, K0_blocks, Mt_blocks, Nt_blocks, Kt_blocks, - m_block_start, m_block_end, n_block_start, n_block_end, k_block_start, k_block_end); - {%- else %} - { - int64_t m_block_start = 0; - int64_t m_block_end = M0_blocks; - int64_t n_block_start = 0; - int64_t n_block_end = N0_blocks; - int64_t k_block_start = 0; - int64_t k_block_end = K0_blocks; - {%- endif %} - for (int64_t mc = m_block_start; mc < m_block_end; mc += Mc_blocks) { - int64_t m_start = mc * M0; - int64_t m_end = std::min((mc + Mc_blocks) * M0, M); - for (int64_t nc = n_block_start; nc < n_block_end; ++nc) { - int64_t n_start = nc * N0; - // TODO(jgong5): use float32 temporary buffer to support bfloat16/float16 gemm - {%- if inp is not none and beta != 0 %} - for (int64_t m = m_start; m < m_end; ++m) { - #pragma omp simd - for (int64_t n = n_start; n < n_start + N0; ++n) { - {{kernel.index(Y, ["m", "n"])}} = {{beta}} * {{kernel.index(inp, ["m", "n"])}}; - } - } - {%- endif %} - for (int64_t kc = k_block_start; kc < k_block_end; kc += Kc_blocks) { - int64_t k_start = kc * K0; - int64_t k_end = std::min((kc + Kc_blocks) * K0, K); - {%- set tile_X = kernel.slice_nd(X, [("m_start", "m_end"), ("k_start", "k_end")]) %} - {%- set tile_W_3d = kernel.slice_nd(W, [("nc", "nc + 1"), ("k_start", "k_end"), ()]) %} - {%- set tile_W = kernel.view(tile_W_3d, ["k_end - k_start", micro_gemm.register_blocking.block_n]) %} - {%- set tile_Y = kernel.slice_nd(Y, [("m_start", "m_end"), ("n_start", "n_start + N0")]) %} - {%- if inp is not none and beta != 0 %} - {{ micro_gemm.codegen_call(kernel, tile_X, tile_W, tile_Y, accum=True)|indent(20, false) }} - {%- else %} - if (kc == k_block_start) { - {{ micro_gemm.codegen_call(kernel, tile_X, tile_W, tile_Y, accum=False)|indent(24, false) }} - } else { - {{ micro_gemm.codegen_call(kernel, tile_X, tile_W, tile_Y, accum=True)|indent(24, false) }} - } - {%- endif %} - } - } - } - } -} -""" - - -class CppPackedGemmTemplate(CppTemplate): - def __init__( - self, - input_nodes, - layout: ir.Layout, - num_threads: int, - register_blocking: GemmBlocking, - beta=1, - alpha=1, - ): - super().__init__("packed_gemm", input_nodes, layout) - self.beta = beta - self.alpha = alpha - self.num_threads = num_threads - self.register_blocking = register_blocking - m, n = layout.size - _, k = input_nodes[0].get_size() - self.m, self.n, self.k = m, n, k - self.is_dynamic_M = has_free_symbols((m,)) - - @cache_on_self - def thread_blocking(self) -> GemmBlocking: - # TODO(jgong5): allow tuning various blocking options - def get_factors(number): - factors = [] - # priorize more evenly divided factors - for i in range(int(number**0.5), 0, -1): - if number % i == 0: - factors.append(number // i) - factors.append(i) - return factors - - def get_blocking(num_threads, factor, m_blocks, n_blocks, k_blocks): - thread_block_n = (n_blocks + factor - 1) // factor - cofactor = num_threads // factor - thread_block_m = (m_blocks + cofactor - 1) // cofactor - return GemmBlocking(thread_block_m, thread_block_n, k_blocks) - - assert ( - not self.is_dynamic_M - ), "Unable to determine thread blocking for dynamic M." 
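The parallel region above hands out work via `mm_get_thread_blocks`, whose C++ definition is removed from cpp_prefix.h further down in this diff. A rough Python equivalent of that partitioning, for reference only:

```python
# Decompose a linear thread id into (m, n, k) block ranges, with K
# varying fastest, then N, then M. Mirrors the deleted C++ helper.
def mm_get_thread_blocks(tid, M_blocks, N_blocks, K_blocks,
                         Mt_blocks, Nt_blocks, Kt_blocks):
    num_Kt = (K_blocks + Kt_blocks - 1) // Kt_blocks
    k_start = (tid % num_Kt) * Kt_blocks
    k_end = min(k_start + Kt_blocks, K_blocks)
    tid //= num_Kt
    num_Nt = (N_blocks + Nt_blocks - 1) // Nt_blocks
    n_start = (tid % num_Nt) * Nt_blocks
    n_end = min(n_start + Nt_blocks, N_blocks)
    tid //= num_Nt
    m_start = min(tid * Mt_blocks, M_blocks)
    m_end = min(m_start + Mt_blocks, M_blocks)
    return (m_start, m_end), (n_start, n_end), (k_start, k_end)

# Four threads over an 8x4x2 block grid with per-thread tiles 4x2x2:
# every thread sees the full K range since Kt_blocks == K_blocks
# (matching the "no k slicing yet" assertion in the template).
assert mm_get_thread_blocks(3, 8, 4, 2, 4, 2, 2) == ((4, 8), (2, 4), (0, 2))
```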
- register_blocking = self.register_blocking - m_blocks = (self.m + register_blocking.block_m - 1) // register_blocking.block_m - n_blocks = (self.n + register_blocking.block_n - 1) // register_blocking.block_n - k_blocks = (self.k + register_blocking.block_k - 1) // register_blocking.block_k - factors = get_factors(self.num_threads) - assert len(factors) > 0 - for factor in factors: - if n_blocks % factor == 0 and m_blocks % (self.num_threads // factor) == 0: - return get_blocking( - self.num_threads, factor, m_blocks, n_blocks, k_blocks - ) - for factor in factors: - if n_blocks % factor == 0: - return get_blocking( - self.num_threads, factor, m_blocks, n_blocks, k_blocks - ) - cofactor = self.num_threads // factor - if m_blocks % cofactor == 0: - return get_blocking( - self.num_threads, factor, m_blocks, n_blocks, k_blocks - ) - raise AssertionError("Should not reach here.") - - @cache_on_self - def cache_blocking(self) -> GemmBlocking: - # TODO(jgong5): improve cache blocking with CPU info - assert ( - not self.is_dynamic_M - ), "Unable to determine cache blocking for dynamic M." - thread_blocking = self.thread_blocking() - return GemmBlocking(thread_blocking.block_m, 1, thread_blocking.block_k) - - @staticmethod - def add_choices( - choices, layout, input_nodes, beta=1, alpha=1, trans_w=False, input_indices=None - ): - if input_indices is None: - input_indices = list(range(len(input_nodes))) - - def reorder_and_filter(inputs, layout_or_out): - if len(input_indices) == 2: - x_idx = input_indices[0] - w_idx = input_indices[1] - return [inputs[x_idx], inputs[w_idx]], layout_or_out - else: - assert ( - len(input_indices) == 3 - ), "Cpp Packed GEMM template requires 2 or 3 input nodes." - # assume the input order is [inp, x, w] and we reorder it to [x, w, inp] - inp_idx = input_indices[0] - x_idx = input_indices[1] - w_idx = input_indices[2] - return [inputs[x_idx], inputs[w_idx], inputs[inp_idx]], layout_or_out - - def transpose_weight(inputs, layout_or_out): - if not trans_w: - return inputs, layout_or_out - - new_inputs = list(inputs) - W = inputs[1] - if isinstance(W, ir.IRNode): - if not isinstance(W, ir.TensorBox): - W = ir.TensorBox(W) - new_inputs[1] = L.permute(W, [1, 0]) - return new_inputs, layout_or_out - else: - assert isinstance(W, torch.Tensor) - new_inputs[1] = W.transpose(0, 1) - return new_inputs, layout_or_out - - # TODO(jgong5): decide proper number of threads per problem size - num_threads = parallel_num_threads() - new_inputs, _ = transpose_weight(*reorder_and_filter(input_nodes, layout)) - m, n, k, *_ = mm_args(new_inputs[0], new_inputs[1]) - micro_gemm = create_micro_gemm( - "micro_gemm", m, n, k, layout.dtype, alpha=alpha, num_threads=num_threads - ) - assert micro_gemm is not None - _, block_n, _ = micro_gemm.register_blocking - - def pack_weight(inputs, layout_or_out): - W = inputs[1] - new_inputs = list(inputs) - if isinstance(W, ir.IRNode): - if not isinstance(W, ir.TensorBox): - W = ir.TensorBox(W) - k, n = W.get_size() - assert ( - n % block_n == 0 - ), f"The last dimension of W must be a multiple of {block_n}." 
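For readers skimming the deleted `thread_blocking` above: it enumerates factor pairs of `num_threads` and prefers a split where both the N and M block counts divide evenly, so no thread receives a ragged tile. A condensed, illustrative model (not the removed code itself):

```python
# Condensed model of CppPackedGemmTemplate.thread_blocking (deleted above).
def get_factors(number):
    factors = []
    for i in range(int(number ** 0.5), 0, -1):  # near-square splits first
        if number % i == 0:
            factors += [number // i, i]
    return factors

def thread_blocking(num_threads, m_blocks, n_blocks, k_blocks):
    def blocking(factor):
        cofactor = num_threads // factor
        return ((m_blocks + cofactor - 1) // cofactor,  # Mt_blocks
                (n_blocks + factor - 1) // factor,      # Nt_blocks
                k_blocks)                               # Kt_blocks: no k-slicing yet
    factors = get_factors(num_threads)
    for f in factors:  # best case: both dimensions divide evenly
        if n_blocks % f == 0 and m_blocks % (num_threads // f) == 0:
            return blocking(f)
    for f in factors:  # fall back to one dimension dividing evenly
        if n_blocks % f == 0 or m_blocks % (num_threads // f) == 0:
            return blocking(f)
    raise AssertionError("unreachable: factor == num_threads always matches")

assert thread_blocking(8, m_blocks=16, n_blocks=4, k_blocks=3) == (8, 1, 3)
```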
- blocked_w = L.permute( - L.view(W, (k, n // block_n, block_n)), - [1, 0, 2], - ) - blocked_w = ir.ExternKernel.realize_input(blocked_w) - blocked_w = ir.ExternKernel.require_contiguous(blocked_w) - if isinstance(blocked_w, ir.ReinterpretView): - # normalize stride to be "contiguous_strides" per size - # this avoids the problems in L.view during template codegen - assert isinstance(blocked_w.layout, ir.FixedLayout) - blocked_w.layout = ir.FixedLayout( - blocked_w.layout.device, - blocked_w.layout.dtype, - blocked_w.layout.size, - ir.FlexibleLayout.contiguous_strides(blocked_w.layout.size), - blocked_w.layout.offset, - ) - else: - k, n = list(W.shape) - blocked_w = ( - W.reshape(k, n // block_n, block_n).transpose(0, 1).contiguous() - ) - # normalize stride to be "contiguous_strides" per size - # this avoids the problems in L.view during template codegen - new_stride = [1] - for sz in reversed(blocked_w.shape[1:]): - new_stride.insert(0, new_stride[0] * sz) - blocked_w = blocked_w.as_strided(blocked_w.shape, new_stride) - new_inputs[1] = blocked_w - return new_inputs, layout_or_out - - def preprocessor(inputs, layout): - return pack_weight(*transpose_weight(*reorder_and_filter(inputs, layout))) - - def postprocessor(output): - if isinstance(output, ir.TensorBox): - # prepack the weight as input to the template buffer - # TODO(jgong5): prune the unused constants in V.graph - # Should we implement it with constant folding in the scheduler instead? - template_buffer = ir.InputsKernel.unwrap_storage_for_input(output) - assert isinstance(template_buffer, ir.CppTemplateBuffer) - new_input_nodes, _ = reorder_and_filter(input_nodes, layout) - W_node = new_input_nodes[1] - assert W_node.get_name() in V.graph.constants - W = V.graph.constants[W_node.get_name()] - new_input_nodes[1] = W - new_input_nodes, _ = pack_weight( - *transpose_weight(new_input_nodes, layout) - ) - W_packed = new_input_nodes[1] - W_packed_constant = V.graph.add_tensor_constant(W_packed) - template_buffer.inputs[1] = ir.InputsKernel.unwrap_storage_for_input( - W_packed_constant - ) - return output - - template = DataProcessorTemplateWrapper( - CppPackedGemmTemplate, - preprocessor, - postprocessor, - input_nodes=input_nodes, - layout=layout, - num_threads=num_threads, - register_blocking=micro_gemm.register_blocking, - beta=beta, - alpha=alpha, - ) - template.maybe_append_choice(choices) - return template - - def render( # type: ignore[override] - self, - kernel: CppTemplateKernel, - template_buffer_node: Optional[ir.CppTemplateBuffer] = None, - epilogue_nodes: Optional[List[ir.IRNode]] = None, - **kwargs, - ) -> str: - assert not epilogue_nodes, "Epilogue nodes are not supported for GEMM template." 
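The `pack_weight` step above is easiest to see on a concrete tensor; this minimal sketch mirrors its `torch.Tensor` branch:

```python
# Prepack [K, N] weights into per-N-block panels [N // block_n, K, block_n]
# so the panel a micro-kernel consumes for one N block is contiguous.
import torch

def pack_weight(W: torch.Tensor, block_n: int) -> torch.Tensor:
    k, n = W.shape
    assert n % block_n == 0, f"last dim must be a multiple of {block_n}"
    return W.reshape(k, n // block_n, block_n).transpose(0, 1).contiguous()

W = torch.randn(64, 96)
Wp = pack_weight(W, block_n=16)  # shape [6, 64, 16]
panel = Wp[2]                    # all of K for N block 2, contiguous
assert torch.equal(Wp.transpose(0, 1).reshape(64, 96), W)  # packing is lossless
```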
- assert len(self.input_nodes) >= 2 - - X, W = self.input_nodes[0], self.input_nodes[1] - inp = self.input_nodes[2] if len(self.input_nodes) > 2 else None - Y = self.output_node - - if template_buffer_node is not None: - # Use the updated prepacked weight buffer - W = template_buffer_node.inputs[1] - Y = template_buffer_node - if epilogue_nodes is not None and len(epilogue_nodes) > 0: - Y = cast(ir.Buffer, epilogue_nodes[-1]) - assert self.output_node is not None - - micro_gemm = create_micro_gemm( - f"{kernel.kernel_name}_micro_gemm", - self.m, - self.n, - self.k, - self.layout.dtype, - alpha=self.alpha, - num_threads=self.num_threads, - ) - assert micro_gemm is not None - assert self.register_blocking == micro_gemm.register_blocking - - options = dict( - X=X, - W=W, - inp=inp, - Y=Y, - beta=self.beta, - alpha=self.alpha, - num_threads=self.num_threads, - micro_gemm=micro_gemm, - is_dynamic_M=self.is_dynamic_M, - template=self, - kernel=kernel, - epilogues=epilogue_nodes, - ) - return self._template_from_string(GEMM_TEMPLATE).render(**options) diff --git a/torch/_inductor/codegen/cpp_micro_gemm.py b/torch/_inductor/codegen/cpp_micro_gemm.py deleted file mode 100644 index 7d54bd8605ec..000000000000 --- a/torch/_inductor/codegen/cpp_micro_gemm.py +++ /dev/null @@ -1,401 +0,0 @@ -from collections import namedtuple -from typing import Dict, List, Optional, Type - -import sympy - -import torch - -from .. import ir -from ..codecache import pick_vec_isa, VecAVX2, VecAVX512 -from ..utils import IndentedBuffer, parallel_num_threads -from ..virtualized import V -from .common import KernelTemplate -from .cpp_template_kernel import CppTemplateKernel -from .cpp_utils import DTYPE_TO_CPP, GemmBlocking, value_to_cpp - - -class CppMicroGemm: - """ - A class that codegens a kernel that computes small-sized matrix multiplication. - - A micro GEMM kernel is responsible for register blocking, instruction selection, - and other CPU architecture-specific optimizations. - - The subclasses need to override `codegen_define` to define the kernel function - that is called by the code generated by `codegen_call`. - """ - - # TODO(jgong5): support constant shapes and lds as template args. - DECLARE_KERNEL = r""" -template -inline void {{kernel_name}}( - const {{input_t}}* __restrict__ A, - const {{input_t}}* __restrict__ B, - {{output_t}}* __restrict__ C, - int64_t M, - int64_t N, - int64_t K, - int64_t lda, - int64_t ldb, - int64_t ldc -) -""" - - def __init__( - self, - name, - input_dtype, - output_dtype, - compute_dtype, - register_blocking, - alpha=1, - ): - self.name = name - self.input_dtype = input_dtype - self.output_dtype = output_dtype - self.compute_dtype = compute_dtype - self.register_blocking = register_blocking - self.alpha = alpha - - def get_common_options(self): - return { - "kernel_name": self.name, - "input_t": DTYPE_TO_CPP[self.input_dtype], - "output_t": DTYPE_TO_CPP[self.output_dtype], - "compute_t": DTYPE_TO_CPP[self.compute_dtype], - "alpha": self.alpha, - } - - def get_kernel_declaration(self): - options = self.get_common_options() - return KernelTemplate._template_from_string(self.DECLARE_KERNEL).render(options) - - def codegen_define(self, kernel: CppTemplateKernel) -> str: - raise NotImplementedError - - def codegen_call( - self, - kernel: CppTemplateKernel, - A: ir.Buffer, - B: ir.Buffer, - C: ir.Buffer, - accum: bool, - ) -> str: - """ - Generate the code for calling the templated kernel that computes - `C += alpha * A @ B` if `accum` is True, or `C = alpha * A @ B` otherwise. 
- """ - A_ptr = f"&({kernel.index(A, [0, 0])})" - B_ptr = f"&({kernel.index(B, [0, 0])})" - C_ptr = f"&({kernel.index(C, [0, 0])})" - M = kernel.size(C, 0) - N = kernel.size(C, 1) - K = kernel.size(A, 1) - lda = kernel.stride(A, 0) - ldb = kernel.stride(B, 0) - ldc = kernel.stride(C, 0) - res = IndentedBuffer() - res.writeline(f"{self.name}<{value_to_cpp(accum, 'bool')}>(") - with res.indent(): - res.writeline(f"{A_ptr},") - res.writeline(f"{B_ptr},") - res.writeline(f"{C_ptr},") - res.writeline(f"{M},") - res.writeline(f"{N},") - res.writeline(f"{K},") - res.writeline(f"{lda},") - res.writeline(f"{ldb},") - res.writeline(f"{ldc}") - res.writeline(");") - return res.getvalue() - - -CppMicroGemmConfig = namedtuple( - "CppMicroGemmConfig", - [ - "input_dtype", - "output_dtype", - "compute_dtype", - "vec_isa_cls", - "register_blocking", - ], -) - -micro_gemm_configs: Dict[Type[CppMicroGemm], List[CppMicroGemmConfig]] = {} - - -def register_micro_gemm(*configs): - def inner(cls): - assert ( - cls not in micro_gemm_configs - ), f"Duplicate micro_gemm registration for {cls}" - assert len(configs) > 0, f"No micro_gemm configs provided for {cls}" - micro_gemm_configs[cls] = list(configs) - return cls - - return inner - - -class CppMicroGemmRef(CppMicroGemm): - """ - A reference implementation of the CppMicroGemm class with naive C++ code. - It is used for correctness debugging. - """ - - TEMPLATE_ENTRY = r""" -{{declare_kernel}} { - for (int64_t m = 0; m < M; ++m) { - for (int64_t n = 0; n < N; ++n) { - {{compute_t}} result = accum ? C[m * ldc + n] : 0; - for (int64_t k = 0; k < K; ++k) { - result += ({{compute_t}})A[m * lda + k] * ({{compute_t}})B[k * ldb + n] * {{alpha}}; - } - C[m * ldc + n] = result; - } - } -} -""" - - def __init__(self, name, input_dtype, output_dtype, compute_dtype, alpha): - super().__init__( - name, input_dtype, output_dtype, compute_dtype, GemmBlocking(1, 1, 1), alpha - ) - - def codegen_define(self, kernel: CppTemplateKernel) -> str: - options = { - "declare_kernel": self.get_kernel_declaration(), - **self.get_common_options(), - } - return KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render(options) - - -@register_micro_gemm( - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX512, GemmBlocking(8, 48, 1) - ), - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX512, GemmBlocking(8, 32, 1) - ), - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX512, GemmBlocking(16, 16, 1) - ), - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX2, GemmBlocking(4, 24, 1) - ), - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX2, GemmBlocking(4, 16, 1) - ), - CppMicroGemmConfig( - torch.float32, torch.float32, torch.float32, VecAVX2, GemmBlocking(8, 8, 1) - ), -) -class CppMicroGemmFP32Vec(CppMicroGemm): - """ - This class generates the code for fp32 micro gemm using vec instructions. 
- """ - - TEMPLATE_ENTRY = r""" -{{declare_kernel}} { - TORCH_CHECK(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}"); - TORCH_CHECK(K % {{block_k}} == 0, "K dimension must be multiple of {{block_k}}"); - // TODO(jgong5): loop unroll for M and N - for (int64_t m = 0; m < M; m += {{block_m}}) { - int64_t block_m = std::min(M - m, {{block_m}}); - for (int64_t n = 0; n < N; n += {{block_n}}) { - if (block_m == {{block_m}}) { - {{kernel_name}}_kernel<{{block_m}}, {{block_n}}, accum>( - A + m * lda, - B + n, - C + m * ldc + n, - K, - lda, - ldb, - ldc - ); - } else { - switch (block_m) { - {%- for b in range(block_m - 1, 0, -1) %} - case {{b}}: - {{kernel_name}}_kernel<{{b}}, {{block_n}}, accum>( - A + m * lda, - B + n, - C + m * ldc + n, - K, - lda, - ldb, - ldc - ); - break; - {%- endfor %} - default: - {{kernel.assert_function}}(false, "Unsupported block_m: ", block_m); - } - } - } - } -} -""" - - TEMPLATE_KERNEL = r""" -template -inline void {{kernel_name}}_kernel( - const float* __restrict__ A, - const float* __restrict__ B, - float* __restrict__ C, - int64_t K, - int64_t lda, - int64_t ldb, - int64_t ldc -) { - using Vectorized = at::vec::Vectorized; - constexpr auto VLEN = Vectorized::size(); - constexpr auto ROWS = BLOCK_M; - constexpr auto COLS = BLOCK_N / VLEN; - - Vectorized va; - at::vec::VectorizedN vb; - at::vec::VectorizedN vc; - - auto loadc = [&](auto i) { - if constexpr (accum) { - constexpr int row = i / COLS; - constexpr int col = i % COLS; - vc[i] = Vectorized::loadu(C + row * ldc + col * VLEN); - } else { - vc[i] = Vectorized(0.0f); - } - }; - c10::ForcedUnroll{}(loadc); - - auto compute = [&, COLS](auto i, int k) { - constexpr int row = i / COLS; - constexpr int col = i % COLS; - - if constexpr (col == 0) { - {%- if alpha != 1 %} - va = Vectorized(A[row * lda + k] * {{alpha}}); - {%- else %} - va = Vectorized(A[row * lda + k]); - {%- endif %} - } - - if constexpr (row == 0) { - vb[col] = Vectorized::loadu(B + k * ldb + col * VLEN); - } - - constexpr int idx = row * COLS + col; - vc[idx] = at::vec::fmadd(va, vb[col], vc[idx]); - }; - - {{kernel.unroll_pragma(4)}} - for (int k = 0; k < K; ++k) { - c10::ForcedUnroll{}(compute, k); - } - - // store to C - auto storec = [&](auto i) { - constexpr int row = i / COLS; - constexpr int col = i % COLS; - vc[i].store(C + row * ldc + col * VLEN); - }; - c10::ForcedUnroll{}(storec); -} -""" - - def codegen_define(self, kernel: CppTemplateKernel) -> str: - options = { - "declare_kernel": self.get_kernel_declaration(), - "kernel": kernel, - "block_m": self.register_blocking.block_m, - "block_n": self.register_blocking.block_n, - "block_k": self.register_blocking.block_k, - **self.get_common_options(), - } - result = KernelTemplate._template_from_string(self.TEMPLATE_KERNEL).render( - options - ) - result += KernelTemplate._template_from_string(self.TEMPLATE_ENTRY).render( - options - ) - return result - - -def create_micro_gemm( - name, - m, - n, - k, - input_dtype, - output_dtype=None, - compute_dtype=None, - alpha=1, - num_threads=-1, - use_ref=False, -) -> Optional[CppMicroGemm]: - def create_from_config(cls, config: CppMicroGemmConfig): - return cls( - name, - config.input_dtype, - config.output_dtype, - config.compute_dtype, - config.register_blocking, - alpha, - ) - - assert isinstance(n, int) or n.is_number, n - assert isinstance(k, int) or k.is_number, k - m = V.graph.sizevars.size_hint(m) if isinstance(m, sympy.Expr) else m - assert isinstance(m, int), m - if output_dtype is None: - output_dtype = input_dtype 
- if compute_dtype is None: - compute_dtype = input_dtype - if num_threads < 0: - num_threads = parallel_num_threads() - vec_isa = pick_vec_isa() - matched_configs = [] - for cls, configs in micro_gemm_configs.items(): - for config in configs: - if not isinstance(vec_isa, config.vec_isa_cls): - continue - if ( - config.input_dtype == input_dtype - and config.output_dtype == output_dtype - and config.compute_dtype == compute_dtype - ): - block_m, block_n, block_k = config.register_blocking - # TODO(jgong5): support n % n_block_size != 0 - if n % block_n != 0: - continue - # Criteria on the ranking of configurations - # 1. Dividable by block sizes (block_m, block_k) - # 2. Number of mxn blocks is large enough to occupy all the threads - # 3. Register blocks are larger - dividable_score = 0 - if k % block_k == 0: - dividable_score += 1 - if m % block_m == 0: - dividable_score += 1 - occupancy_score = 0 - n_blocks = n // block_n - total_mxn_blocks = n // block_n * ((m + block_m - 1) // block_m) - if n_blocks >= num_threads: - occupancy_score += 1 - if total_mxn_blocks >= num_threads: - occupancy_score += 1 - matched_configs.append( - ( - (dividable_score, occupancy_score, block_m * block_n * block_k), - cls, - config, - ) - ) - if len(matched_configs) == 0: - if use_ref: - return CppMicroGemmRef( - name, input_dtype, output_dtype, compute_dtype, alpha - ) - else: - return None - # TODO(jgong5): allow autotuning on choices of configs - return create_from_config(*max(matched_configs, key=lambda x: x[0])[1:]) diff --git a/torch/_inductor/codegen/cpp_prefix.h b/torch/_inductor/codegen/cpp_prefix.h index 45f874fc4d26..7e3483ca9994 100644 --- a/torch/_inductor/codegen/cpp_prefix.h +++ b/torch/_inductor/codegen/cpp_prefix.h @@ -5,7 +5,6 @@ #include #include #include -#include #include // WARNING: be extra careful when including more ATen/c10 header files here! 
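When several register-blocking configs matched the requested dtype and ISA, the deleted `create_micro_gemm` ranked them by the tuple sketched below. This is a condensed illustration, not the removed code itself:

```python
# Condensed model of create_micro_gemm's ranking: (1) block sizes that
# divide m and k evenly, (2) enough m*n blocks to occupy every thread,
# (3) larger register blocks win ties.
def score(blocking, m, n, k, num_threads):
    bm, bn, bk = blocking
    if n % bn != 0:  # n must tile exactly (current limitation in the code)
        return None
    dividable = int(k % bk == 0) + int(m % bm == 0)
    n_blocks = n // bn
    total_blocks = n_blocks * ((m + bm - 1) // bm)
    occupancy = int(n_blocks >= num_threads) + int(total_blocks >= num_threads)
    return (dividable, occupancy, bm * bn * bk)

avx512_fp32 = [(8, 48, 1), (8, 32, 1), (16, 16, 1)]  # registered above
scored = [(score(b, 64, 96, 256, num_threads=8), b) for b in avx512_fp32]
best = max((s, b) for s, b in scored if s is not None)[1]
assert best == (8, 48, 1)  # widest register tile among equally-scored configs
```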
@@ -46,7 +45,7 @@ template struct Welford { T mean = T(0); T m2 = T(0); - T weight = T(0); + int64_t index = 0; }; @@ -59,41 +58,57 @@ struct IsVecType>: std::true_type {}; #endif template -Welford welford_combine(const Welford &a, const Welford &b) { - if constexpr (!IsVecType::value) { - if (a.weight == 0) { - return b; - } - if (b.weight == 0) { - return a; +struct WeightRecp { + using scalar_t = typename T::value_type; + int64_t N; + std::vector weight_recps; + WeightRecp(int64_t N) : N(N) { + weight_recps.reserve(N); + for (const auto i : c10::irange(N)) { + weight_recps.push_back( + scalar_t(static_cast(1) / static_cast(i + 1))); } } - auto delta = b.mean - a.mean; - auto new_weight = a.weight + b.weight; - auto wb_over_w = b.weight / new_weight; - if constexpr (IsVecType::value) { - // Guard against division by zero - wb_over_w = T::blendv(wb_over_w, T(0), new_weight == T(0)); +}; + +template +Welford welford_combine(const Welford &a, const Welford &b) { + if (a.index == 0) { + return b; } + if (b.index == 0) { + return a; + } + auto delta = b.mean - a.mean; + auto new_index = a.index + b.index; + auto wb_over_w = T(b.index) / T(new_index); auto result = Welford{ a.mean + delta * wb_over_w, - a.m2 + b.m2 + delta * delta * a.weight * wb_over_w, - new_weight + a.m2 + b.m2 + delta * delta * T(a.index) * wb_over_w, + new_index, }; return result; } template -Welford welford_combine(const Welford &acc, T data) { +Welford welford_combine(const Welford &acc, T data, const WeightRecp* w=nullptr) { // Add a single data point + int64_t index = acc.index + 1; auto delta = data - acc.mean; - auto new_weight = acc.weight + T(1); - auto new_mean = acc.mean + delta / new_weight; + T new_mean; + if constexpr (!IsVecType::value) { + new_mean = acc.mean + delta / T(index); + } else { + new_mean = acc.mean + + ((w == nullptr || acc.index >= w->weight_recps.size()) + ? 
delta / T(index) + : delta * T(w->weight_recps[acc.index])); + } auto new_delta = data - new_mean; auto result = Welford{ new_mean, acc.m2 + delta * new_delta, - new_weight + index }; return result; } @@ -178,10 +193,11 @@ template Welford welford_vec_reduce_all(Welford> acc) { using Vec = at::vec::Vectorized; for (size_t n = 1; n < Vec::size(); n *= 2) { + auto index = acc.index; auto shuffled = Welford{ vec_shuffle_down(acc.mean, n), vec_shuffle_down(acc.m2, n), - vec_shuffle_down(acc.weight, n) + index, }; acc = welford_combine(acc, shuffled); } @@ -194,8 +210,7 @@ Welford welford_vec_reduce_all(Welford> acc.m2.store(array); result.m2 = array[0]; - acc.weight.store(array); - result.weight = array[0]; + result.index = acc.index; return result; } @@ -294,100 +309,3 @@ atomic_add(volatile T *addr, T offset) { std::atomic *atomic_addr = (std::atomic *)addr; atomic_addr->fetch_add(offset, std::memory_order_relaxed); } - -std::tuple mm_get_thread_blocking( - int64_t M, - int64_t N, - int64_t K, - int64_t M0, - int64_t N0, - int64_t K0, - int num_threads) { - auto get_factors = [](int64_t number) { - int count = 0; - for (int64_t i = std::sqrt(number); i > 0; --i) { - if (number % i == 0) { - count += 2; - } - } - auto factors = std::make_unique(count); - int index = 0; - for (int64_t i = std::sqrt(number); i > 0; --i) { - if (number % i == 0) { - factors[index++] = number / i; - factors[index++] = i; - } - } - return std::make_tuple(std::move(factors), count); - }; - - auto get_blocking = [](int64_t num_threads, - int64_t factor, - int64_t m_blocks, - int64_t n_blocks, - int64_t k_blocks) { - int64_t thread_block_n = (n_blocks + factor - 1) / factor; - int64_t cofactor = num_threads / factor; - int64_t thread_block_m = (m_blocks + cofactor - 1) / cofactor; - return std::make_tuple(thread_block_m, thread_block_n, k_blocks); - }; - - int64_t m_blocks = (M + M0 - 1) / M0; - int64_t n_blocks = (N + N0 - 1) / N0; - int64_t k_blocks = (K + K0 - 1) / K0; - - auto [factors, count] = get_factors(num_threads); - assert(count > 0); - - for (int i = 0; i < count; ++i) { - int64_t factor = factors[i]; - if (n_blocks % factor == 0 && - m_blocks % (num_threads / factor) == 0) { - return get_blocking( - num_threads, factor, m_blocks, n_blocks, k_blocks); - } - } - - for (int i = 0; i < count; ++i) { - int64_t factor = factors[i]; - if (n_blocks % factor == 0) { - return get_blocking( - num_threads, factor, m_blocks, n_blocks, k_blocks); - } - int64_t cofactor = num_threads / factor; - if (m_blocks % cofactor == 0) { - return get_blocking( - num_threads, factor, m_blocks, n_blocks, k_blocks); - } - } - - assert(false && "Should not reach here."); - // Dummy return to avoid compiler warning - return std::make_tuple(0, 0, 0); -} - -inline void mm_get_thread_blocks( - int thread_id, - int64_t M_blocks, - int64_t N_blocks, - int64_t K_blocks, - int64_t Mt_blocks, - int64_t Nt_blocks, - int64_t Kt_blocks, - int64_t& m_block_start, - int64_t& m_block_end, - int64_t& n_block_start, - int64_t& n_block_end, - int64_t& k_block_start, - int64_t& k_block_end) { - int64_t num_Kt = (K_blocks + Kt_blocks - 1) / Kt_blocks; - k_block_start = (thread_id % num_Kt) * Kt_blocks; - k_block_end = std::min(k_block_start + Kt_blocks, K_blocks); - thread_id /= num_Kt; - int64_t num_Nt = (N_blocks + Nt_blocks - 1) / Nt_blocks; - n_block_start = (thread_id % num_Nt) * Nt_blocks; - n_block_end = std::min(n_block_start + Nt_blocks, N_blocks); - thread_id /= num_Nt; - m_block_start = std::min(thread_id * Mt_blocks, M_blocks); - m_block_end 
= std::min(m_block_start + Mt_blocks, M_blocks); -} diff --git a/torch/_inductor/codegen/cpp_template.py b/torch/_inductor/codegen/cpp_template.py deleted file mode 100644 index 3d15010a8838..000000000000 --- a/torch/_inductor/codegen/cpp_template.py +++ /dev/null @@ -1,116 +0,0 @@ -import functools -import itertools -import logging - -import sys -from typing import List, Optional -from unittest.mock import patch - -import sympy - -from .. import codecache, config, ir -from ..autotune_process import CppBenchmarkRequest, TensorMeta -from ..utils import IndentedBuffer, Placeholder, unique -from ..virtualized import V -from .common import KernelTemplate -from .cpp_template_kernel import CppTemplateCaller, CppTemplateKernel - -log = logging.getLogger(__name__) - - -class CppTemplate(KernelTemplate): - index_counter = itertools.count() - - def __init__( - self, - name: str, - input_nodes, - layout: ir.Layout, - ): - super().__init__(name) - self.input_nodes = input_nodes - self.output_node: ir.Buffer = ir.Buffer("buf_out", layout) - self.layout = layout - - def generate(self, **kwargs): - kernel_name = f"cpp_{self.name}" - with patch.object( - V.graph, "get_dtype", self._fake_get_dtype(self.output_node) - ), CppTemplateKernel( - kernel_name=kernel_name, - ) as kernel: - code = self.render(kernel=kernel, **kwargs) - _, call_args, _ = kernel.args.python_argdefs() - log.debug("Generated Code:\n%s", code) - log.debug( - "Args: cpp_argdefs: %s, python_argdefs: %s", - kernel.args.cpp_argdefs(), - kernel.args.python_argdefs(), - ) - - expected_args = list( - unique(input_node.get_name() for input_node in self.input_nodes) - ) - expected_args.extend([self.output_node.get_name()]) - assert list(call_args)[: len(expected_args)] == expected_args, ( - call_args, - expected_args, - ) - extra_args = V.graph.sizevars.size_hints( - map(sympy.expand, call_args[len(expected_args) :]) - ) - - kernel_hash_name = f"cpp_{self.name}_{next(self.index_counter)}" - - # Create the BenchmarkRequest for CPP - bmreq = CppBenchmarkRequest( - kernel_name=kernel_name, - input_tensor_meta=TensorMeta.from_irnodes(self.input_nodes), - output_tensor_meta=TensorMeta.from_irnodes(self.output_node), - extra_args=extra_args, - source_code=code, - ) - - def make_kernel_render( - template_node: ir.CppTemplateBuffer, - epilogue_nodes: Optional[List[ir.IRNode]] = None, - ): - kernel = CppTemplateKernel( - kernel_name=str(Placeholder.KERNEL_NAME), - ) - render = functools.partial( - self.render, - kernel=kernel, - template_buffer_node=template_node, - epilogue_nodes=epilogue_nodes, - **kwargs, - ) - return kernel, render - - return CppTemplateCaller( - kernel_hash_name, - self.name, - self.input_nodes, - self.output_node.get_layout(), - make_kernel_render, - bmreq, - self, - ) - - def header(self) -> IndentedBuffer: - res = IndentedBuffer() - res.writeline(codecache.cpp_prefix()) - res.splice( - """ - #include "c10/util/Unroll.h" - """ - ) - enable_kernel_profile = ( - config.cpp.enable_kernel_profile and sys.platform == "linux" - ) - if enable_kernel_profile: - res.writelines(["#include "]) - return res - - def render(self, **kwargs) -> str: - raise NotImplementedError diff --git a/torch/_inductor/codegen/cpp_template_kernel.py b/torch/_inductor/codegen/cpp_template_kernel.py deleted file mode 100644 index 6a978c45fa28..000000000000 --- a/torch/_inductor/codegen/cpp_template_kernel.py +++ /dev/null @@ -1,200 +0,0 @@ -import itertools -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import sympy -from 
sympy.parsing.sympy_parser import parse_expr - -import torch - -from torch._inductor.autotune_process import CppBenchmarkRequest -from torch._inductor.utils import sympy_index_symbol -from .. import codecache, config, ir, lowering as L -from ..virtualized import V -from .common import Kernel, OpOverrides -from .cpp_utils import cexpr_index, DTYPE_TO_CPP - - -def parse_expr_with_index_symbols(expr_str: str) -> sympy.Expr: - expr = parse_expr(expr_str) - int_symbols = {sym: sympy_index_symbol(sym.name) for sym in expr.free_symbols} - return expr.subs(int_symbols) - - -def wrap_with_tensorbox(node) -> ir.TensorBox: - return ( - ir.TensorBox.create(node) if isinstance(node, ir.Buffer) else ir.TensorBox(node) - ) - - -class CppTemplateKernel(Kernel): - overrides = OpOverrides - - def __init__(self, kernel_name): - super().__init__() - self.kernel_name = kernel_name - - def def_kernel( - self, - inputs: Dict[str, ir.Buffer], - outputs: Dict[str, ir.Buffer], - ) -> str: - for name, inp in inputs.items(): - if inp is not None: - self.args.input_buffers[inp.get_name()] = name - for name, out in outputs.items(): - self.args.output_buffers[out.get_name()] = name - unique_sizevars = { - s - for input in inputs.values() - if input is not None - for sym in itertools.chain(input.get_size(), input.get_stride()) - if isinstance(sym, sympy.Expr) - for s in sym.free_symbols - } - unique_sizevars |= { - s - for output in outputs.values() - for sym in itertools.chain(output.get_size(), output.get_stride()) - if isinstance(sym, sympy.Expr) - for s in sym.free_symbols - } - sizevars = sorted(unique_sizevars, key=str) - for sizevar in sizevars: - self.args.sizevars[sizevar] = f"k{sizevar}" - cpp_argdefs, _, _ = self.args.cpp_argdefs() - return f"void {self.kernel_name}({', '.join(cpp_argdefs)})" - - def call_kernel(self, name: str, node: ir.CppTemplateBuffer): - wrapper = V.graph.wrapper_code - _, call_args, arg_types = self.args.cpp_argdefs() - wrapper.generate_kernel_call(name, call_args, cuda=False, arg_types=arg_types) - - def dtype(self, node: ir.Buffer) -> str: - return DTYPE_TO_CPP[node.get_dtype()] - - def acc_dtype(self, node: ir.Buffer) -> str: - if node.get_dtype() in [torch.float32, torch.bfloat16, torch.half]: - return "float" - else: - raise NotImplementedError(f"Unsupported dtype: {node.get_dtype()}") - - def size(self, node: ir.Buffer, dim: int) -> str: - return cexpr_index(self.rename_indexing(node.get_size()[dim])) - - def stride(self, node: ir.Buffer, dim: int) -> str: - return cexpr_index(self.rename_indexing(node.get_stride()[dim])) - - def index(self, node: ir.Buffer, indices: List[Any]) -> str: - indexer = node.make_indexer() - index = indexer([parse_expr_with_index_symbols(str(idx)) for idx in indices]) - index = self.rename_indexing(index) - return f"{self.args.input(node.get_name())}[{cexpr_index(index)}]" - - def slice_nd(self, node, ranges: List[Tuple[Any]]) -> ir.ReinterpretView: - """ - Slice the given node with a list of ranges (start and end) corresponding to its dims. - The dim is not sliced if the corresponding range is empty. 
- """ - assert len(ranges) == len(node.get_size()) - sliced = wrap_with_tensorbox(node) - for dim, _range in enumerate(ranges): - if len(_range) == 0: - continue - assert len(_range) == 2 - start, end = (parse_expr_with_index_symbols(str(r)) for r in _range) - sliced = L.slice_(sliced, dim, start, end, clamp=False) - assert isinstance(sliced.data, ir.ReinterpretView) - return sliced.data - - def view(self, node, sizes: List[Any]) -> ir.View: - node = wrap_with_tensorbox(node) - sizes = [parse_expr_with_index_symbols(str(s)) for s in sizes] - return L.view(node, sizes).data - - @property - def assert_function(self) -> str: - if V.graph.aot_mode: - return "AOTI_TORCH_CHECK" - else: - return "TORCH_CHECK" - - def maybe_codegen_profile(self) -> str: - if config.cpp.enable_kernel_profile: - graph_id = V.graph.graph_id - prefix = "graph_" + str(graph_id) + "_" if graph_id is not None else "" - return f'RECORD_FUNCTION("{prefix}{self.kernel_name}", c10::ArrayRef({{}}));' - else: - return "" - - def unroll_pragma(self, unroll): - if codecache.is_gcc(): - return f"#pragma GCC unroll {unroll}" - else: - return f"#pragma unroll {unroll}" - - -class CppTemplateCaller(ir.ChoiceCaller): - """ - CppTemplateCaller - - This class represents a caller for CPP template kernels. It is a subclass of ir.ChoiceCaller. - Attributes: - name (str): The name of the caller. - category (str): The category of the caller. - bmreq (CppBenchmarkRequest): The benchmark request for the caller. - template_buffer (ir.CppTemplateBuffer): The template buffer for the caller. - """ - - def __init__( - self, - name: str, - category: str, - input_nodes: List[ir.Buffer], - layout: ir.Layout, - make_kernel_render: Callable[ - [ir.CppTemplateBuffer, Optional[List[ir.IRNode]]], str - ], - bmreq: CppBenchmarkRequest, - template: "CppTemplate", # type: ignore[name-defined] # noqa: F821 - info_kwargs: Optional[ - Dict[str, Union[ir.PrimitiveInfoType, List[ir.PrimitiveInfoType]]] - ] = None, - ): - super().__init__(name, input_nodes, layout) - self.category = category - self.make_kernel_render = make_kernel_render - self.bmreq = bmreq - self.template = template - self.info_kwargs = info_kwargs - - def precompile(self) -> None: - assert self.bmreq is not None - self.bmreq.precompile() - - def benchmark(self, *args, out) -> float: - assert self.bmreq is not None - return self.bmreq.benchmark(*args, output_tensor=out) - - def hash_key(self) -> str: - return "-".join( - [ - self.category, - self.bmreq.hash_key, - ] - ) - - def info_dict( - self, - ) -> Dict[str, Union[ir.PrimitiveInfoType, List[ir.PrimitiveInfoType]]]: - return {"backend": "CPP", "op_type": "unknown"} - - def output_node(self) -> ir.TensorBox: - return ir.TensorBox.create( - ir.CppTemplateBuffer( - layout=self.layout, - inputs=self.input_nodes, - make_kernel_render=self.make_kernel_render, - template=self.template, - choice=self, - ) - ) diff --git a/torch/_inductor/codegen/cpp_utils.py b/torch/_inductor/codegen/cpp_utils.py index a3b4fd3206b6..7e6f06b9e507 100644 --- a/torch/_inductor/codegen/cpp_utils.py +++ b/torch/_inductor/codegen/cpp_utils.py @@ -1,7 +1,5 @@ import math -from collections import namedtuple - import torch from .common import ExprPrinter @@ -57,8 +55,6 @@ INDEX_TYPE = "long" -GemmBlocking = namedtuple("GemmBlocking", ["block_m", "block_n", "block_k"]) - class CppPrinter(ExprPrinter): def _print_Integer(self, expr): diff --git a/torch/_inductor/codegen/cpp_wrapper_cpu.py b/torch/_inductor/codegen/cpp_wrapper_cpu.py index 6ce230714632..9595f1da6f95 100644 --- 
a/torch/_inductor/codegen/cpp_wrapper_cpu.py +++ b/torch/_inductor/codegen/cpp_wrapper_cpu.py @@ -552,10 +552,8 @@ def write_wrapper_decl(self): ), "Fails to get the dtype of the sympy.Expr" cpp_dtype = DTYPE_TO_CPP[dtype] if config.abi_compatible: - self.prefix.writeline(f"{cpp_dtype} {input_key};") - dtype_str = str(dtype).split(".")[-1] - self.prefix.writeline( - f"aoti_torch_item_{dtype_str}(inputs[{idx}], &{input_key});" + self.codegen_tensor_item( + dtype, f"inputs[{idx}]", input_key, self.prefix ) else: self.prefix.writeline( @@ -890,6 +888,19 @@ def codegen_scalar_to_tensor(self, output: str): ) return name + def codegen_tensor_item( + self, dtype: torch.dtype, tensor: str, scalar: str, indented_buffer=None + ): + assert ( + config.abi_compatible + ), "codegen_tensor_item is only used for the ABI-compatible mode" + dtype_str = str(dtype).split(".")[-1] + writer = indented_buffer or self + writer.writeline(f"{DTYPE_TO_CPP[dtype]} {scalar};") + writer.writeline( + f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_{dtype_str}({tensor}, &{scalar}));" + ) + @cache_on_self def get_output_refs(self): return [ @@ -1120,7 +1131,7 @@ def g(args): ) def get_c_shim_func_name(self, kernel): - if not config.abi_compatible: + if not config.abi_compatible or kernel.startswith("aoti_torch_"): return kernel assert "::" in kernel, "Cpp kernel name: " + kernel + " does not contain '::'" @@ -1376,10 +1387,9 @@ def codegen_shape_tuple(self, shape: Tuple[Expr, ...]) -> str: def codegen_dynamic_scalar(self, node): (data,) = (t.codegen_reference() for t in node.inputs) if config.abi_compatible: - dtype = node.inputs[0].get_dtype() - dtype_str = str(dtype).split(".")[-1] - self.writeline(f"{DTYPE_TO_CPP[dtype]} {node.sym}_raw;") - self.writeline(f"aoti_torch_item_{dtype_str}({data}, &{node.sym}_raw);") + self.codegen_tensor_item( + node.inputs[0].get_dtype(), data, f"{node.sym}_raw" + ) else: convert_type = DTYPE_TO_ATEN[node.inputs[0].get_dtype()].replace( "at::k", "to" @@ -1763,12 +1773,13 @@ def codegen_conditional(self, conditional): outer_outputs.append(out.get_name()) if not isinstance(conditional.predicate, ir.ShapeAsConstantBuffer): - predicate = f"{conditional.predicate.get_name()}_scalar" - self.writeline(f"bool {predicate};") # in ABI-compatible mode, we need to use the ABI shim function # to extract a C++ bool from the unrelying scalar bool Tensor - self.writeline( - f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_bool({conditional.predicate.codegen_reference()}, &{predicate}));" + predicate = f"{conditional.predicate.get_name()}_scalar" + self.codegen_tensor_item( + torch.bool, + conditional.predicate.codegen_reference(), + predicate, ) else: # the predicate is not a Tensor: SymBool or Python bool @@ -1847,12 +1858,7 @@ def codegen_while_loop(self, while_loop): if config.abi_compatible: cond_result = f"{cond_result_name}_scalar" - self.writeline(f"bool {cond_result};") - # in ABI-compatible mode, we need to use the ABI shim function - # to extract a C++ bool from the unrelying scalar bool Tensor - self.writeline( - f"AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_item_bool({cond_result_name}, &{cond_result}));" - ) + self.codegen_tensor_item(torch.bool, cond_result_name, cond_result) else: cond_result = f"{cond_result_name}.item()" self.writeline(f"if (!{cond_result}) break;") diff --git a/torch/_inductor/codegen/simd.py b/torch/_inductor/codegen/simd.py new file mode 100644 index 000000000000..2a002fa3677f --- /dev/null +++ b/torch/_inductor/codegen/simd.py @@ -0,0 +1,1917 @@ +from __future__ import 
annotations + +import collections +import contextlib +import dataclasses +import functools +import itertools +import logging +import math +import operator +from typing import ( + Any, + Callable, + Counter, + DefaultDict, + Dict, + Iterable, + List, + Optional, + Set, + Tuple, + TYPE_CHECKING, + Union, +) + +import sympy + +import torch +import torch._logging + +from torch.utils._sympy.functions import FloorDiv, ModularIndexing +from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT +from ..._dynamo.utils import counters +from .. import config, ir, scheduler +from ..codecache import code_hash + +from ..dependencies import Dep, MemoryDep, StarDep, WeakDep +from ..ir import TritonTemplateBuffer +from ..optimize_indexing import indexing_dtype_strength_reduction +from ..runtime.hints import ReductionHint, TRITON_MAX_BLOCK +from ..runtime.runtime_utils import get_max_y_grid, green_text, yellow_text +from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse +from ..utils import ( + get_dtype_size, + IndentedBuffer, + Placeholder, + sympy_dot, + sympy_index_symbol, + sympy_product, + sympy_subs, + unique, +) +from ..virtualized import V +from .common import CSEVariable, index_prevent_reordering, Kernel, PythonPrinter +from .multi_kernel import MultiKernel + +if TYPE_CHECKING: + pass + +log = logging.getLogger(__name__) +perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") +schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") +fusion_log = torch._logging.getArtifactLogger(__name__, "fusion") + + +pexpr = PythonPrinter().doprint + + +@dataclasses.dataclass +class IndexingOptions: + index_str: str + mask_vars: Set[sympy.Symbol] + mask_str: str + expand_str: Optional[str] + _has_rindex: bool + index: sympy.Expr + + def has_mask(self): + return bool(self.mask_vars) + + def has_rindex(self): + return self._has_rindex + + def has_tmpmask(self): + return "tmp" in self.mask_str + + def has_rmask(self): + return "rmask" in self.mask_str + + +@dataclasses.dataclass +class IterationRanges: + """ + Each range tree represents multiple sets of iteration indexing + in a single tiled dimension in the output kernel. + + If you have two loop ranges, one (4, 3, 2) and another (4, 6), + then the range tree will be: + 4 (i0) + 3 (i1) 6 (i3) + 2 (i2) + Where i0 is shared between both loops, but then splits into + different indexing vars. All loop ranges must iterate over + the same number of elements.
+ """ + + def __init__( + self, + name: str, + var_list: List[sympy.Symbol], + var_ranges: Dict[sympy.Symbol, sympy.Expr], + numel: sympy.Expr, + prefix: str, + *, + kernel: SIMDKernel, + divisor=sympy.Integer(1), + length=sympy.Integer(1), + root: IterationRangesRoot, + ): + super().__init__() + self.name = name + self.var_list = var_list + self.var_ranges = var_ranges + self.numel = numel + self.prefix = prefix + self.divisor = divisor + self.length = length + self.kernel = kernel + self.root = root + + def symbol(self): + return sympy_index_symbol(self.name) + + +class IterationRangesRoot(IterationRanges): + def __init__( + self, + name: str, + numel: sympy.Expr, + # TODO: this is probably SymTy.INDEX and SymTy.RINDEX + prefix: str, + index: int, + kernel: SIMDKernel, + pid_cache=None, + *, + is_loop: bool, + tensor_dim: Optional[int], + grid_dim: Optional[int], + has_zdim: bool, + ): + if pid_cache is None: + pid_cache = {} + super().__init__( + name=name, + var_list=[], + var_ranges={}, + numel=numel, + prefix=prefix, + kernel=kernel, + root=self, + ) + self.index = index + # Store all the nodes in one flat list + self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {} + # This is for re-ordering program ID in triton mm template + # pid_cache["tl.program_id(0)"] = pid_m + self.pid_cache: Dict[str, str] = pid_cache + + # True if the dimension is implemented as a single program looping over + # the full dimension (currently only used for non-persistent reduction) + assert not is_loop or (prefix == "r" and grid_dim is None) + self.is_loop = is_loop + # Index of corresponding dimension on triton tensors + self.tensor_dim = tensor_dim + # Index of corresponding dimension in the triton grid + self.grid_dim = grid_dim + self.has_zdim = has_zdim + + def __repr__(self): + return f"IterationRangesRoot({self.name!r}, {self.numel}, ...)" + + def cache_clear(self): + for node in self.nodes.values(): + node.cache_clear() + + def lookup(self, divisor, length): + """ + Lookup a given RangeTreeEntry, creating it if needed + """ + if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): + expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) + else: + expr = ModularIndexing( + sympy_index_symbol(f"{self.prefix}index"), divisor, length + ) + + if expr not in self.nodes: + node = IterationRangesEntry( + f"{self.prefix}{next(V.kernel.iter_vars_count)}", + divisor, + length, + expr, + self, + ) + V.kernel.range_tree_nodes[node.symbol()] = node + self.var_list.append(node.symbol()) + self.var_ranges[node.symbol()] = length + self.nodes[expr] = node + return self.nodes[expr] + + def construct_entries(self, lengths: List[sympy.Expr]): + divisor = sympy.Integer(1) + itervars = [] + for length in reversed(lengths): + itervars.append(self.lookup(divisor, length)) + divisor = divisor * length + return list(reversed(itervars)) + + def construct(self, lengths: List[sympy.Expr]): + return [e.symbol() for e in self.construct_entries(lengths)] + + def vars_and_sizes(self, index: sympy.Expr): + """Figure out vars from this tree used in index""" + nodes = [V.kernel.range_tree_nodes.get(s) for s in index.free_symbols] + nodes = [n for n in nodes if n and n.prefix == self.prefix] + nodes.sort(key=lambda x: V.graph.sizevars.size_hint(x.divisor)) + divisor = sympy.Integer(1) + index_vars = [] + sizes = [] + + def add(node): + nonlocal divisor + index_vars.append(node.symbol()) + sizes.append(node.length) + divisor = divisor * node.length + + for node in nodes: + if not 
V.graph.sizevars.statically_known_equals(node.divisor, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(node.divisor, divisor))) + divisor = node.divisor + add(node) + if not V.graph.sizevars.statically_known_equals(self.numel, divisor): + # fill in unused index var + add(self.lookup(divisor, FloorDiv(self.numel, divisor))) + + return list(reversed(index_vars)), list(reversed(sizes)) + + def ranges_code(self): + assert self.tensor_dim is not None + size = self.kernel.indexing_size_str(self.tensor_dim) + index_dtype = self.kernel.index_dtype + convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" + return f"tl.arange(0, {self.prefix.upper()}BLOCK){size}{convert}" + + def scalar_code(self, value): + index_dtype = self.kernel.index_dtype + ndim = self.kernel.triton_tensor_ndim() + size = [1] * ndim + return f"tl.full({size}, {value}, {index_dtype})" + + def get_pid(self): + assert self.grid_dim is not None + key = f"tl.program_id({self.grid_dim})" + # y_grid has a limit, so express it in terms of y and z in case of overflow. + # z grid is only exercised when max_tiles == 3 (off by default). + if ( + self.grid_dim == 1 + and not self.has_zdim + and not (isinstance(self.numel, int) and self.numel <= get_max_y_grid()) + ): + key = f"{key} * (tl.program_id({self.grid_dim + 1}) + 1)" + pid = self.pid_cache.get(key, key) + if self.kernel.index_dtype != "tl.int32": + return f"{pid}.to({self.kernel.index_dtype})" + return pid + + def codegen_header(self, code): + x = self.prefix + if self.is_loop: + code.writeline(f"{self.name} = {x}offset + {x}base") + elif self.grid_dim is None: + # no need to "{x}offset = " + code.writeline(f"{self.name} = {self.ranges_code()}") + code.writeline(f"{x}offset = 0") + else: + if self.tensor_dim is not None: + line = f"{x}offset + {self.ranges_code()}" + else: + line = self.scalar_code(f"{x}offset") + code.writelines( + [ + f"{x}offset = {self.get_pid()} * {x.upper()}BLOCK", + f"{self.name} = {line}", + ] + ) + code.writeline(f"{x}mask = {self.name} < {x}numel") + + +class IterationRangesEntry(IterationRanges): + def __init__( + self, + name: str, + divisor: sympy.Expr, + length: sympy.Expr, + expr: sympy.Expr, + parent: IterationRanges, + ): + super().__init__( + name=name, + numel=parent.numel / length, + var_list=parent.var_list, + var_ranges=parent.var_ranges, + prefix=parent.prefix, + divisor=divisor, + length=length, + kernel=parent.kernel, + root=parent.root, + ) + self.parent = parent + self.codegen = functools.lru_cache(None)(self._codegen) + self.expr = expr + + def __repr__(self): + return f"IterationRangesEntry({self.name}, {self.divisor}, {self.length}, {self.expr}, {self.var_ranges})" + + def set_name(self, name): + self.codegen = lambda: name # type: ignore[assignment] + self.codegen.cache_clear = lambda: None # type: ignore[method-assign] + self.name = name + + def cache_clear(self): + self.codegen.cache_clear() + + def _codegen(self): + V.kernel.codegen_iteration_ranges_entry(self) + return self.name + + def precomputed_args(self): + # for dynamic shapes, find parts of indexing expressions that have to be precomputed + precomputed_args: List[sympy.Expr] = [] + if isinstance(self.expr, sympy.Symbol): + return precomputed_args + assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) + for arg in self.expr.args[1:]: + if not isinstance(arg, (sympy.Integer, sympy.Symbol)): + symbols = arg.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, SymT.SIZE) for s in symbols + ): + 
precomputed_args.append(arg) + return precomputed_args + + def __hash__(self): + return hash(self.name) + + def __eq__(self, other): + return self.name == other.name + + +def triton_constant(value): + if value == float("inf"): + return 'float("inf")' + elif value == float("-inf"): + return 'float("-inf")' + elif math.isnan(value): + return 'float("nan")' + return repr(value) + + +class SIMDKernel(Kernel): + """ + Common base class for Triton/Halide codegen which both use flattened indexing rather than loop nests. + """ + + sexpr = pexpr + kexpr: Callable[[sympy.Expr], str] + allow_block_ptr = False + + def __init__( + self, + *groups, + index_dtype: str, + mutations: Optional[Set[str]] = None, + pid_cache=None, + reduction_hint=ReductionHint.DEFAULT, + disable_persistent_reduction=False, + ): + if pid_cache is None: + pid_cache = {} + super().__init__() + self.body = IndentedBuffer() + self.indexing_code = IndentedBuffer() + self.numels = [V.graph.sizevars.simplify(s) for s in groups] + self.mutations: Set[str] = mutations if mutations is not None else set() + self.range_trees: List[IterationRangesRoot] = [] + self.range_tree_nodes: Dict[sympy.Symbol, IterationRangesEntry] = {} + self.iter_vars_count = itertools.count() + self.inside_reduction = self.numels[-1] != 1 + self.reduction_hint = reduction_hint + self.index_dtype: str = index_dtype + self.last_usage: Set[str] = set() + self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) + self.persistent_reduction: bool = ( + not disable_persistent_reduction + ) and self.should_use_persistent_reduction() + self.no_x_dim = self.want_no_x_dim() + self.code_hash = None + + # define this in a closure to make cache local to object + @functools.lru_cache(None) + def simplify_indexing(index: sympy.Expr): + index = V.graph.sizevars.simplify_with_ranges(index, self.var_ranges()) + for tree in self.range_trees: + index = self.combine_contiguous_dims(index, tree) + return index + + self.simplify_indexing = simplify_indexing + self.initialize_range_tree(pid_cache) + + def want_no_x_dim(self): + return False + + def initialize_range_tree(self, pid_cache): + no_r_dim = not self.inside_reduction or self.numels[-1] == 1 + + prefixes = "zyxr" + active_prefixes = prefixes[-len(self.numels) :] + + grid_dims = "xyz" + if self.no_x_dim: + tensor_dims = "r" + elif no_r_dim: + tensor_dims = "xyz" + else: + tensor_dims = "xyzr" + + tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) + + for i, prefix in enumerate(active_prefixes): + is_reduction = prefix == "r" + tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None + grid_dim = None if is_reduction else grid_dims.find(prefix) + index = i if grid_dim is None else grid_dim + self.range_trees.append( + IterationRangesRoot( + f"{prefix}index", + self.numels[i], + prefix, + index, + self, + pid_cache=pid_cache, + is_loop=is_reduction and not self.persistent_reduction, + tensor_dim=tensor_dim, + grid_dim=grid_dim, + has_zdim="z" in active_prefixes, + ) + ) + + def store_reduction(self, name: str, index: sympy.Expr, value: CSEVariable): + prior = self.inside_reduction + self.inside_reduction = False + try: + return self.store(name, index, value) + finally: + self.inside_reduction = prior + + def should_use_persistent_reduction(self) -> bool: + return False # defined in subclass + + def var_ranges(self): + return dict( + itertools.chain.from_iterable( + tree.var_ranges.items() for tree in self.range_trees + ) + ) + + def triton_tensor_ndim(self): + return 
sum(int(tree.tensor_dim is not None) for tree in self.range_trees) + + def indexing_size_str(self, i): + sizes = ["None"] * self.triton_tensor_ndim() + sizes[i] = ":" + return f"[{', '.join(sizes)}]" + + def dense_size_list(self) -> List[str]: + sizes = ["1"] * self.triton_tensor_ndim() + for tree in self.range_trees: + if tree.tensor_dim is None: + continue + + if tree.prefix != "r" or self.inside_reduction: + sizes[tree.tensor_dim] = f"{tree.prefix.upper()}BLOCK" + return sizes + + def dense_size_str(self): + sizes = self.dense_size_list() + return f"[{', '.join(sizes)}]" + + def combine_contiguous_dims(self, index: sympy.Expr, tree: IterationRangesRoot): + """ + More aggressive simplification to merge contiguous dims + """ + if isinstance(index, (sympy.Integer, sympy.Symbol)): + return index + index_vars, sizes = tree.vars_and_sizes(index) + if len(sizes) <= 1: + return index + new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( + index_vars, sizes, index_prevent_reordering([index], index_vars, sizes) + ) + if new_sizes == sizes: + return index + new_index_vars = tree.construct(new_sizes) + new_index = sympy_subs(index, dict(zip(index_vars, reindex(new_index_vars)))) + return new_index + + def set_last_usage(self, nodes): + if not self.inside_reduction or self.persistent_reduction: + return + self.last_usage = set( + itertools.chain.from_iterable( + n.last_usage for n in nodes if n is not EnableReduction + ) + ) + + def disable_reduction(self): + should_flush = self.range_trees[-1].is_loop + + @contextlib.contextmanager + def ctx(): + if self.numels[-1] == 1: + assert not self.inside_reduction + yield + return + if should_flush: + # calling codegen_body() will flush all the pending buffers + # and write out a reduction loop + self.codegen_body() + self.inside_reduction = False + try: + yield + if should_flush: + # flush out any code before opening the next loop + self.codegen_body() + finally: + self.inside_reduction = True + + return ctx() + + def set_ranges(self, *lengths): + assert len(lengths) == len(self.range_trees) + return [ + ranges.construct(length) + for length, ranges in zip(lengths, self.range_trees) + ] + + @staticmethod + def _split_iteration_ranges( + groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + sv = V.graph.sizevars + new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] + remaining = [sv.simplify(g) for g in groups] + var_count = itertools.count() + + def add_range(i, expr): + expr = sv.simplify(expr) + if not sv.statically_known_multiple_of(remaining[i], expr): + raise CantSplit + # guard on the last item out + remaining[i] = FloorDiv(remaining[i], expr) + new_ranges[i].append(expr) + return next(var_count) + + def make_combined(size, idx1, idx2): + def getter(flat_vars): + return size * flat_vars[idx1] + flat_vars[idx2] + + return getter + + return_getters_groups = [] + current_group = 0 + for length_group in lengths: + return_getters = [] + for size in length_group: + if sv.statically_known_equals(size, 1): # type: ignore[arg-type] + return_getters.append(lambda _: sympy.Integer(0)) + continue + + while current_group < len(remaining) and sv.statically_known_equals( + remaining[current_group], 1 # type: ignore[arg-type] + ): + # scroll to next group with remaining elements + current_group += 1 + + if current_group + 1 < len(remaining) and sv.statically_known_gt( + size, remaining[current_group] + ): + # need to break size in two + if not sv.statically_known_multiple_of( + size, remaining[current_group] + ): + raise CantSplit + 
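Note: the `make_combined` helper above stitches two freshly split ranges back into the flat index they came from. A minimal standalone sketch (sympy assumed; the symbols `i1`, `i2`, `s1` are hypothetical stand-ins for the tiled variables and the inner size):

    import sympy

    def make_combined(size, idx1, idx2):
        # same shape as the helper above: rebuild the flat index from two tiles
        def getter(flat_vars):
            return size * flat_vars[idx1] + flat_vars[idx2]
        return getter

    i1, i2 = sympy.symbols("i1 i2")
    s1 = sympy.Symbol("s1", positive=True, integer=True)
    getter = make_combined(s1, 0, 1)
    assert getter([i1, i2]) == i1 * s1 + i2  # i0 = i1*s1 + i2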
size1 = remaining[current_group] + size2 = FloorDiv(size, remaining[current_group]) + return_getters.append( + make_combined( + size2, + add_range(current_group, size1), + add_range(current_group + 1, size2), + ) + ) + else: + return_getters.append( + operator.itemgetter(add_range(current_group, size)) + ) + return_getters_groups.append(return_getters) + + assert all( + V.graph.sizevars.size_hint(s) == 1 for s in remaining + ), f"failed to set ranges {remaining} {lengths}" + + return new_ranges, return_getters_groups + + @classmethod + def is_compatible( + cls, groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] + ): + try: + cls._split_iteration_ranges(groups, lengths) + return True + except CantSplit: + return False + + def split_and_set_ranges(self, lengths: List[List[sympy.Expr]]): + """ + We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1). + + To do this we need to split up the iteration space of i0 into something like: + for i1 in s0: + for i2 in s1: + i0 = i1*s1 + i2 + .... + + This function matches and resplits lengths to the groups of + this kernel to enable tiled + non-tiled fusions. + """ + groups = [rt.numel for rt in self.range_trees] + if not self.inside_reduction: + groups[-1] = sympy.Integer(1) + + if len(lengths) == len(self.range_trees) and all( + V.graph.sizevars.simplify(sympy_product(x) - g) == 0 + for x, g in zip(lengths, groups) + ): + return self.set_ranges(*lengths) + + new_ranges, return_getters_groups = self._split_iteration_ranges( + groups, lengths + ) + itervars = list(itertools.chain.from_iterable(self.set_ranges(*new_ranges))) + return [[fn(itervars) for fn in fns] for fns in return_getters_groups] + + def is_indirect_indexing(self, index: sympy.Expr): + # tmpX means indirect indexing + return free_symbol_is_type(index, SymT.TMP) + + def is_broadcasted(self, index: sympy.Expr): + # Note. This may not be correct when there is indirect indexing + if self.is_indirect_indexing(index): + return False + + index_numels = [1] * len(self.numels) + for symbol in index.free_symbols: + if symbol not in self.range_tree_nodes: + # Non-iterated variables, e.g. strides + continue + entry = self.range_tree_nodes[symbol] # type: ignore[index] + assert isinstance(entry.parent, IterationRangesRoot) + index_numels[entry.parent.index] *= entry.length + + # If the index variables only iterate over a subset of the kernel + # numels, then it must be broadcasted. + simplify = V.graph.sizevars.simplify + return any( + simplify(idx_range) != simplify(iter_range) # type: ignore[arg-type] + for idx_range, iter_range in zip(index_numels, self.numels) + ) + + def index_to_str(self, index: sympy.Expr) -> str: + """ + Convert an index expr to a string that can be used in triton code. + e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel. + + Index expressions often need to be passed in as arguments to the triton kernel. + Rename_indexing and codegen_indexing keep track of the needed indices and add + new parameters to the function signature. 
+ """ + if isinstance(index, list): + return f"[{', '.join(map(self.index_to_str, index))}]" + return self.kexpr( # type: ignore[call-arg] + self.rename_indexing(self.codegen_indexing(index)) + ) + + def indexing( + self, + index: sympy.Expr, + *, + copy_shape=None, + dense_indexing=False, + override_mask=None, + block_ptr=False, + ): + """ + Compute the index and mask to pass to tl.load() or tl.store() + """ + index = self.simplify_indexing(index) + index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) + # if simple replacements didn't get rid of floor/ceil, try full subs + if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): + index = index.subs(V.graph.sizevars.precomputed_replacements) + # last resort, if no range vars are in the expr, hoist it + # TODO instead of trying to blindly find complicated exprs, we should hoist the + # inputs/outputs sizes and strides, but at the time indexing is generated + # kernel inputs and outputs are not set yet, we'd need a deeper refactor + # to do it this way + + if len(index.atoms(sympy.ceiling)): + for a in index.atoms(sympy.ceiling): + # for nested exprs, atoms yields top level first (?) + # so if everything goes fine, lower level replacements will come up empty + symbols = a.free_symbols + if len(symbols) > 0 and all( + symbol_is_type(s, (SymT.SIZE, SymT.PRECOMPUTED_SIZE)) + for s in symbols + ): + replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} + index = sympy_subs(index, replacements) + + index = self.simplify_indexing(index) + index_vars = index.free_symbols + has_rindex = False + + mask_vars: Set[str] = set() + for var in index_vars: + assert isinstance(var, sympy.Symbol) + has_rindex = has_rindex or symbol_is_type(var, SymT.RINDEX) + if override_mask: + pass + elif symbol_is_type(var, SymT.TMP): + # indirect indexing + cse_var = self.cse.varname_map[var.name] + mask_vars.update(cse_var.mask_vars) + elif symbol_is_type( + var, + ( + SymT.UNBACKED_INT, + SymT.SIZE, + SymT.PRECOMPUTED_SIZE, + SymT.INDEX, + SymT.FLOAT, + SymT.UNBACKED_FLOAT, + ), + ): + pass + else: + # var is one of xN, yN or rN + assert symbol_is_type( + var, (SymT.RINDEX, SymT.XBLOCK, SymT.YBLOCK) + ), var.name + mask_vars.add(f"{var.name[0]}mask") + + need_dense = ( + config.triton.dense_indexing + or dense_indexing + or self._load_mask is not None + ) and index != 0 + + have_dense = True + have_loop_vars = False + dense_mask_vars = set() + + for tree in self.active_range_trees(): + if index_vars.intersection(tree.var_list): + have_loop_vars = True + else: + have_dense = False + dense_mask_vars.add(f"{tree.prefix}mask") + + if ( + block_ptr + and self.allow_block_ptr + and config.triton.use_block_ptr + and not override_mask + and not self._load_mask + and len(mask_vars - dense_mask_vars) == 0 + and not self.is_indirect_indexing(index) + and have_loop_vars + # workaround https://github.com/openai/triton/issues/2821 + and self.index_dtype == "tl.int32" + ): + index_relative_to_xyr_index = sympy_subs( + index, {v: t.expr for v, t in self.range_tree_nodes.items()} + ) + range_trees = self.active_range_trees(reorder=True) + symbols = [t.symbol() for t in range_trees] + strides = [sympy.Wild(f"stride_{s}", exclude=symbols) for s in symbols] + offset = sympy.Wild("_offset", exclude=symbols) + m = index_relative_to_xyr_index.match(sympy_dot(symbols, strides) + offset) + # TODO(jansel): it is sometimes possible to do higher dimensional block_ptrs with + # a tl.reshape the correct block. We will miss these cases today. 
+ if m: + self.filter_masks(mask_vars) + from .triton import BlockPtrOptions + + return BlockPtrOptions.create( + [m[s] for s in strides], + m[offset], + range_trees, + mask_vars, # type: ignore[arg-type] + ) + + expand_str = None + index_str = self.index_to_str(index) + if isinstance(index, sympy.Integer): + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" + return IndexingOptions( + index_str, set(), "None", expand_str, has_rindex, index + ) + + if need_dense and not have_dense: + expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() + index_str = f"tl.broadcast_to({index_str}, {expand_str})" + mask_vars = dense_mask_vars + elif not have_loop_vars and copy_shape: + index_str = f"tl.broadcast_to({index_str}, {copy_shape}.shape)" + mask_vars = dense_mask_vars + + if override_mask: + mask_vars = {override_mask} + + if self._load_mask: + mask_vars.add(self._load_mask) + + self.filter_masks(mask_vars) + + mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" + return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex, index) # type: ignore[arg-type] + + def active_range_trees(self, reorder=False): + trees = [ + t for t in self.range_trees if t.prefix != "r" or self.inside_reduction + ] + if reorder and len(trees) > 1: + count = sum(t.prefix in "xyz" for t in trees) + assert "".join(t.prefix for t in trees[:count]) == "zyx"[-count:], [ + t.prefix for t in trees[:count] + ] + trees[:count] = reversed(trees[:count]) + return trees + + def filter_masks(self, mask_vars): + for tree in self.range_trees: + # Masks are superfluous if we only have one element + if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + continue + # Masks are superfluous if numel is a multiple of BLOCK + # (We use the fact that BLOCK is required by triton to be a power of 2) + if tree.prefix.upper() not in TRITON_MAX_BLOCK: + continue + max_block = TRITON_MAX_BLOCK[tree.prefix.upper()] + # Optional optimization: if block divides numel exactly, we will + # never need to do a masked load to handle stragglers at the end. + # It's faster to avoid masking at all. But it is sound to always + # mask. + if V.graph.sizevars.statically_known_multiple_of(tree.numel, max_block): # type: ignore[arg-type] + mask_vars.discard(f"{tree.prefix}mask") + + def codegen_indexing(self, expr: sympy.Expr): + expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) + for sym in sorted(expr.free_symbols, key=str): + if sym in self.range_tree_nodes: + # if indexing expression is complicated, we precompute it on the host side + # and send the result as a kernel argument + replacements = {} + for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] + replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) + if len(replacements) > 0: + self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] + self.range_tree_nodes[sym].expr, replacements # type: ignore[index] + ) + self.range_tree_nodes[sym].codegen() # type: ignore[index] + return expr + + @contextlib.contextmanager + def mask_loads(self, mask): + """Context manager to add an additional mask to tl.load/store""" + prior = self._load_mask + if prior: + mask = self.cse.generate(self.compute, f"{mask} & {prior}") + + self._load_mask = mask + try: + # TODO(jansel): do we need a reshape here? 
+ yield mask + finally: + self._load_mask = prior + + def load_mask(self, var): + mask = "" + mask_vars = set(var.mask_vars) + if self._load_mask: + mask_vars.add(self._load_mask) + + if mask_vars: + mask = ( + f"{next(iter(mask_vars))}" + if len(mask_vars) == 1 + # sorted for deterministic order + else f"({' & '.join(sorted(map(str, mask_vars)))})" + ) + return mask + + def get_strides_of_load(self, index: sympy.Expr): + """ + This gets the stride of the index for each of the tiling variables + (technically, it does it at index 0) + + For example, if + xindex = x0 + 512*x1 + 1024*r0 + x0 = (xindex//512) + x1 = (xindex % 512) + r0 = rindex + + this function would return + {xindex: 512, rindex: 1024} + """ + index_to_tile_indexes = {k: v.expr for k, v in self.range_tree_nodes.items()} + index_in_tile_vars = sympy_subs(index, index_to_tile_indexes) # type: ignore[arg-type] + strides = {} + for range_tree in self.range_trees: + s = sympy_index_symbol(range_tree.name) + strides[s] = sympy_subs(index_in_tile_vars, {s: 1}) - sympy_subs( + index_in_tile_vars, {s: 0} + ) + return strides + + @staticmethod + def _map_tuple_or_scalar(fn, value): + if isinstance(value, tuple): + return tuple(map(fn, value)) + return fn(value) + + def estimate_kernel_num_bytes(self): + """ + Try our best to estimate the total size (in bytes) of the + kernel's inputs and outputs, which is used for estimating the memory + throughput of this kernel. This information is used for checking how + far we are from the peak memory bandwidth. It's important that + we avoid overestimating the sizes of the inputs and outputs, + because that can wrongfully give us a very large memory traffic value, + which may be even larger than the theoretical bandwidth and thus + become very misleading. This is particularly problematic for cases + where we slice some inputs. In those cases, we should only count + the size of the "slices" instead of the original inputs, because + only the slices contribute to the real memory traffic. + """ + nbytes = [] + ninplace_args = len(unique(self.args.inplace_buffers.values())) + _, call_args, _ = self.args.python_argdefs() + + # For pointwise and reduction kernels, this is the upper-bound numels + # for the output buffer. + # FIXME: This is not exactly right for cases like below: + # def foo(tensor0, tensor1): + # x0 = narrow(tensor0) + # return cat(x0, tensor1) + # For this example, we will end up overestimating the size for the + # slice s0. Potentially, we could have precise input information + # if we maintained the original inputs of the Pointwise kernel created + # for the "cat". However, it might be a bit overwhelming to + # add such complexity only to handle some particular cases for + # benchmarking. + out_numel = V.graph.sizevars.size_hint(sympy_product(self.numels)) + for i, arg in enumerate(call_args): + # "buf" may be narrowed. In this case, the number of memory accesses + # should be estimated based on the reinterpreted layout. + # On the other hand, buf may be broadcasted. In this case, + # counting the size of the underlying storage would give us + # a better estimation in terms of memory accesses. + if arg not in self.buf_accesses: + nbytes.append(0) + continue + arg_numel = V.graph.get_numel(arg) + buf_size = V.graph.sizevars.size_hint(arg_numel) + if buf_size > out_numel: + # This arg points to a buf that has been sliced. + # We need to count each individual slice to have + # a better estimation.
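Note: the stride probe in `get_strides_of_load` above measures each range variable's coefficient as f(var=1) - f(var=0), which cancels every other term. A runnable sketch of just that arithmetic (sympy assumed; the index expression is hypothetical):

    import sympy

    xindex, rindex = sympy.symbols("xindex rindex")
    index = 3 * xindex + 1024 * rindex + 42  # hypothetical flattened load index
    strides = {s: index.subs(s, 1) - index.subs(s, 0) for s in (xindex, rindex)}
    assert strides == {xindex: 3, rindex: 1024}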
+ indices: Set[Any] = set() + no_index_dep_count = 0 + for dep in self.buf_accesses[arg]: + if isinstance(dep, (StarDep, WeakDep)): + indices.add(f"no_index_dep_{no_index_dep_count}") + no_index_dep_count += 1 + else: + indices.add(dep.index) + numel = len(indices) * out_numel + else: + numel = buf_size + dtype = V.graph.get_dtype(arg) + dtype_size = get_dtype_size(dtype) + nbytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) + return sum(nbytes) + + def warn_mix_layout(self, kernel_name): + """ + Print a message if the kernel has mixed layout inputs. + Only 4D tensors are considered for now. + """ + if ( + len(self.args.input_buffers) == 1 + and len(self.args.output_buffers) == 1 + and len(self.args.inplace_buffers) == 0 + ): + # even if input buffer and output buffer have different layout, + # this can be a layout conversion kernel. No need to warn for + # the mixed layouts. + return + + argdefs, call_args, signature = self.args.python_argdefs() + uniform_stride_order = None + for arg_name in call_args: + buf = V.graph.get_buffer(arg_name) + if buf and len(buf.layout.size) == 4: + # ignore the tensor if only 1 dimension is non-trivial + if len([x for x in buf.layout.size if x == 1]) == 3: + continue + stride_order = ir.get_stride_order(buf.layout.stride) + if uniform_stride_order is None: + uniform_stride_order = stride_order + elif uniform_stride_order != stride_order: + msg = yellow_text( + f"Expected stride order {uniform_stride_order}, but found stride order" + + f" {stride_order} for kernel {kernel_name}" + ) + log.warning(msg) + + stride_order_list = [ + ir.get_stride_order(V.graph.get_buffer(name).layout.stride) + if V.graph.get_buffer(name) + else None + for name in call_args + ] + size_list = [ + V.graph.get_buffer(name).layout.size + if V.graph.get_buffer(name) + else None + for name in call_args + ] + source_list = [ + "GraphInput" + if name in V.graph.graph_inputs + else "IntermediateBuffer" + if name in V.graph.name_to_buffer + else None + for name in call_args + ] + + msg = yellow_text( + f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}" + + f"\n sizes {size_list}\n sources {source_list}\n" + ) + log.warning(msg) + return + msg = green_text( + f"All the inputs for the triton kernel {kernel_name} have uniform layout" + ) + log.warning(msg) + + def codegen_kernel(self): + raise NotImplementedError + + def codegen_body(self): + pass + + def codegen_iteration_ranges_entry(self, entry: IterationRangesEntry): + raise NotImplementedError + + +class SIMDScheduling(BaseScheduling): + kernel_type = SIMDKernel # override in subclass + int32_type = "torch.int32" + int64_type = "torch.int64" + + def __init__(self, scheduler): + self.scheduler = scheduler + + def group_fn(self, sizes): + return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) + + def can_fuse(self, node1, node2): + """ + Hook called by Scheduler to determine if the Triton backend + can fuse node1 and node2. These nodes might already be + FusedSchedulerNodes.
+ """ + if isinstance(node1, scheduler.ForeachKernelSchedulerNode) or isinstance( + node2, scheduler.ForeachKernelSchedulerNode + ): + return scheduler.ForeachKernelSchedulerNode.can_fuse(node1, node2) + + _, (numel1, rnumel1) = node1.group + _, (numel2, rnumel2) = node2.group + why = WhyNoFuse(node1, node2) + + if node1.is_split_scan() and not node2.is_split_scan(): + if node2.is_reduction(): + why("Split scan cannot fuse with reductions") + elif node2.is_split_scan() and not node1.is_split_scan(): + if node1.is_reduction(): + why("Split scan cannot fuse with reductions") + + if node1.is_reduction() and node2.is_reduction(): + reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2 + if not reduction_can_fuse: + why( + "numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return reduction_can_fuse + + if not node1.is_reduction() and not node2.is_reduction(): + if not (numel1 == numel2 and rnumel1 == rnumel2): + why( + "numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)", + numel1, + numel2, + rnumel1, + rnumel2, + ) + return False + + if node1.is_template(): + # Only allow fusion for TritonTemplates for now. + # Fusion for CUDATemplates are not supported. + is_triton_template = isinstance(node1.node, TritonTemplateBuffer) + if not is_triton_template: + why("node1 is not TritonTemplateBuffer") + return is_triton_template + + # check for a bad combined tiling + tiling1 = self.select_tiling(node1.get_nodes(), numel1, rnumel1) + tiling2 = self.select_tiling(node2.get_nodes(), numel1, rnumel1) + tiling3 = self.select_tiling( + node1.get_nodes() + node2.get_nodes(), numel1, rnumel1 + ) + if config.triton.tiling_prevents_pointwise_fusion: + cond = True + if len(tiling1) > 2: + if len(tiling2) > 2: + cond = tiling1 == tiling2 == tiling3 + else: + cond = tiling1 == tiling3 + elif len(tiling2) > 2: + cond = tiling2 == tiling3 + if not cond: + why( + "tiling mismatch (%s, %s, %s)", + tiling1, + tiling2, + tiling3, + ) + return False + + return True + + if not node1.is_reduction() and node2.is_reduction(): + assert rnumel1 == 1 and rnumel2 != 1 + if numel1 == numel2 * rnumel2: + if not all( + SIMDKernel.is_compatible((numel2, rnumel2), n.get_ranges()) + for n in node1.get_nodes() + ): + why("nodes numel/rnumel incompatibility") + return False + if ( + config.triton.tiling_prevents_reduction_fusion + and not node1.is_template() + ): + is_reduction_tiling_valid = self.select_tiling( + node1.get_nodes(), numel1 + ) in ( + (numel1, 1), + (numel2, rnumel2, 1), + ) + if not is_reduction_tiling_valid: + why("invalid tiling for reduction") + return is_reduction_tiling_valid + return True + + if numel1 != numel2: + why("nodes numel incompatibility") + return numel1 == numel2 + + assert node1.is_reduction() and not node2.is_reduction() + # swap args to hit the case above + return self.can_fuse_horizontal(node2, node1) + + can_fuse_vertical = can_fuse + can_fuse_horizontal = can_fuse + + def generate_node_schedule(self, nodes, numel, rnumel): + node_schedule: List[Any] = [] + current_loop_writes: Set[str] = set() + + # Writes with a reduced shape, meaning they are only present once the + # reduction loop has ended + current_loop_reduced_writes = set() + current_loop_has_writes = False + done = set() + + def fits_in_main_body(n): + _, (node_numel, node_rnumel) = n.group + return (node_numel == numel and node_rnumel == rnumel) or ( + node_numel == numel * rnumel and node_rnumel == 1 + ) + + def fits_outside_reduction(n): + _, (node_numel, node_rnumel) = 
n.group + return node_numel == numel and node_rnumel == 1 and rnumel != 1 + + def schedule_node_in_loop(n): + nonlocal current_loop_has_writes + done.add(n) + node_schedule.append(n) + current_loop_has_writes = True + # A scan is modelled as a reduction in the scheduler but has a + # full sized output that can be used inside the loop body + if ( + n.is_reduction() + and isinstance(n, scheduler.SchedulerNode) + and isinstance(n.node, ir.ComputedBuffer) + and not isinstance(n.node.data, ir.Scan) + ): + current_loop_reduced_writes.add(n.get_name()) + + @contextlib.contextmanager + def end_current_reduction_loop(): + nonlocal current_loop_has_writes + if current_loop_has_writes: + # flush out any other runnable nodes to reduce number of loops + for other_node in nodes[index + 1 :]: + if ( + other_node not in done + and fits_in_main_body(other_node) + and not (current_loop_reduced_writes & other_node.ancestors) + ): + schedule_node_in_loop(other_node) + + if node_schedule and node_schedule[-1] is EnableReduction: + node_schedule.pop() + else: + node_schedule.append(DisableReduction) + yield + node_schedule.append(EnableReduction) + current_loop_reduced_writes.clear() + current_loop_has_writes = False + + for index, node in enumerate(nodes): + if node in done: + continue + done.add(node) + + def requires_closing_previous_reduction(node, node_schedule): + if rnumel == 1: + return False + if not current_loop_reduced_writes & node.ancestors: + return False + assert node_schedule and not isinstance( + node_schedule[-1], (EnableReduction, DisableReduction) + ) + return bool(current_loop_reduced_writes) + + if fits_in_main_body(node): + if requires_closing_previous_reduction(node, node_schedule): + with end_current_reduction_loop(): + pass # need to start a new reduction loop + + schedule_node_in_loop(node) + elif fits_outside_reduction(node): + with end_current_reduction_loop(): + node_schedule.append(node) + else: + raise NotImplementedError( + f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}" + ) + + return node_schedule + + def codegen_node( + self, node: Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode] + ): + """ + Given a set of pre-fused nodes, generate a Triton kernel. 
+ """ + + nodes: List[scheduler.SchedulerNode] = node.get_nodes() # type: ignore[assignment] + + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + buf_accesses = collections.defaultdict(list) + for node in nodes: + for access in node.read_writes.reads | node.read_writes.writes: + buf_accesses[access.name].append(access) + + schedule_log.debug("Schedule:\n %s", node_schedule) + + return self.codegen_node_schedule(node_schedule, buf_accesses, numel, rnumel) + + @staticmethod + def reduction_hint(node): + assert node.is_reduction() + if all( + dep.is_contiguous() + for dep in itertools.chain(node.read_writes.reads, node.read_writes.writes) + ): + return ReductionHint.INNER + else: + return node.node.data.reduction_hint + + @staticmethod + def can_use_32bit_indexing( + numel: sympy.Expr, buffers: Iterable[Union[ir.Buffer, ir.TensorBox]] + ) -> bool: + int_max = torch.iinfo(torch.int32).max + size_hint = V.graph.sizevars.size_hint + has_hint = V.graph.sizevars.shape_env.has_hint + + def within_32bit(e): + # Allow for unhinted e as long as we can still statically prove + # (e.g., via ValueRanges) that it is still in bounds + if V.graph.sizevars.is_expr_static_and_true(e <= int_max): + return True + # Otherwise, the hint MUST exist and be in range + return has_hint(e) and size_hint(e) <= int_max + + if not within_32bit(numel): + return False + + # Any use of a MultiOutputLayout will create a buffer with a + # Layout whose sizes are accounted for + buf_sizes = [ + buf.get_layout().storage_size() + for buf in buffers + if not isinstance(buf.get_layout(), ir.MultiOutputLayout) + ] + + if not all(within_32bit(size) for size in buf_sizes): + return False + + # Only install guards for 32-bit indexing as there is no correctness + # issue with using 64-bit for everything + V.graph.sizevars.guard_leq(numel, int_max) # type: ignore[arg-type] + for size in buf_sizes: + V.graph.sizevars.guard_leq(size, int_max) # type: ignore[arg-type] + return True + + @classmethod + def select_index_dtype(cls, node_schedule, numel, reduction_numel): + # Gather all used buffer names + buffer_names = set() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + + buffer_names.update(node.get_names()) + buffer_names.update(node.used_buffer_names()) + + # Get buffers objects + + def _get_buffer(name: str) -> Union[ir.Buffer, ir.TensorBox]: + buf = V.graph.get_buffer(name) + if buf is None: + raise RuntimeError(f"Failed to find buffer matching name {name}") + return buf + + buffers = [V.graph.get_buffer(name) for name in buffer_names] + + # In theory we can separately check xnumel and rnumel are <= int_max + # but some indexers do use the full linear index so we need to be + # conservative here. + total_numel = numel * reduction_numel + + if SIMDScheduling.can_use_32bit_indexing(total_numel, buffers): + return cls.int32_type + return cls.int64_type + + def has_non_contiguous_pw_in_reduction_kernel(self, node_schedule, numel, rnumel): + pointwise_nodes = list( + filter( + lambda n: n not in (EnableReduction, DisableReduction) + and not n.is_reduction() + and n.group[1][0] == numel * rnumel, + node_schedule, + ) + ) + for node in pointwise_nodes: + # An index can be an integer when loading a random seed. 
+ if not all( + not isinstance(dep, MemoryDep) + or dep.is_contiguous() + or isinstance(dep.index, (sympy.Integer, int)) + or dep.stride1_for_last_dim() + for dep in itertools.chain( + node.read_writes.reads, node.read_writes.writes + ) + ): + return True + return False + + def get_kernel_args(self, node_schedule, numel, reduction_numel): + reductions = list( + filter( + lambda n: n not in (EnableReduction, DisableReduction) + and n.is_reduction(), + node_schedule, + ) + ) + if len(reductions) > 0: + hints = [self.reduction_hint(n) for n in reductions] + if hints.count(hints[0]) == len(hints): + reduction_hint_val = hints[0] + else: + reduction_hint_val = ReductionHint.DEFAULT + + if ( + reduction_hint_val == ReductionHint.INNER + and self.has_non_contiguous_pw_in_reduction_kernel( + node_schedule, numel, reduction_numel + ) + ): + reduction_hint_val = ReductionHint.DEFAULT + else: + reduction_hint_val = ReductionHint.DEFAULT + + mutations = set() + for node in node_schedule: + if hasattr(node, "get_mutations"): + mutations.update(node.get_mutations()) + + index_dtype = self.select_index_dtype(node_schedule, numel, reduction_numel) + + return reduction_hint_val, mutations, index_dtype + + def codegen_node_schedule( + self, node_schedule, buf_accesses, numel, reduction_numel + ): + from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel + + tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) + ( + reduction_hint_val, + mutations, + index_dtype, + ) = self.get_kernel_args(node_schedule, numel, reduction_numel) + + is_split_scan = any( + isinstance(node, BaseSchedulerNode) and node.is_split_scan() + for node in node_schedule + ) + kernel_type = TritonSplitScanKernel if is_split_scan else self.kernel_type + kernel_args = tiled_groups + kernel_kwargs = { + "reduction_hint": reduction_hint_val, + "mutations": mutations, + "index_dtype": index_dtype, + } + kernel = kernel_type( + *kernel_args, + **kernel_kwargs, + ) + kernel.buf_accesses = buf_accesses + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + + with V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + + kernel_name = self.define_kernel(src_code, node_schedule, kernel) + log.debug("Generating kernel code with kernel_name: %s", kernel_name) + kernel.kernel_name = kernel_name + kernel.code_hash = code_hash(src_code) + + if kernel.persistent_reduction and config.triton.multi_kernel: + kernel2 = self.kernel_type( + *kernel_args, + **kernel_kwargs, + disable_persistent_reduction=True, + ) + self.codegen_node_schedule_with_kernel(node_schedule, kernel2) + with V.set_kernel_handler(kernel2): + src_code2 = kernel2.codegen_kernel() + kernel_name2 = self.define_kernel(src_code2, node_schedule, kernel) + kernel2.kernel_name = kernel_name2 + kernel2.code_hash = code_hash(src_code2) + + final_kernel = MultiKernel([kernel, kernel2]) + else: + final_kernel = kernel # type: ignore[assignment] + + with V.set_kernel_handler(final_kernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + + self.codegen_comment(node_schedule) + final_kernel.call_kernel(final_kernel.kernel_name) + if config.nan_asserts: + final_kernel.codegen_nan_check() + if config.warn_mix_layout: + final_kernel.warn_mix_layout(kernel_name) + + V.graph.removed_buffers |= final_kernel.removed_buffers + V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove + + if ( + V.graph.wrapper_code.supports_intermediate_hooks + and config.generate_intermediate_hooks + 
): + # Not every node in the schedule will actually be live on output; + # we can't check dead buffers. + live_outs = kernel.args.live_output_buffers() + for node in node_schedule: + if not isinstance(node, scheduler.BaseSchedulerNode): + continue + name = node.get_name() + if name not in live_outs: + continue + origin_node = node.node.get_origin_node() + if origin_node is not None: + counters["inductor"]["intermediate_hooks"] += 1 + V.graph.wrapper_code.writeline( + f"run_intermediate_hooks({origin_node.name!r}, {name})" + ) + + self.scheduler.free_buffers() + + def codegen_node_schedule_with_kernel(self, node_schedule, kernel): + def current_reduction_nodes(nodes): + return itertools.takewhile(lambda n: n is not DisableReduction, nodes) + + with kernel: + stack = contextlib.ExitStack() + kernel.set_last_usage(current_reduction_nodes(node_schedule)) + + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.decide_inplace_update() + for i, node in enumerate(node_schedule): + if node is DisableReduction: + stack.enter_context(kernel.disable_reduction()) + elif node is EnableReduction: + stack.close() + kernel.set_last_usage(current_reduction_nodes(node_schedule[i:])) + else: + # TODO - use split ranges ? + indexing_dtype_strength_reduction(node._body) + index_vars = kernel.split_and_set_ranges(node.get_ranges()) + node.codegen(index_vars) + + def codegen_template( + self, template_node, epilogue_nodes, only_gen_src_code=False + ) -> Optional[str]: + """ + Codegen a triton template + + If `only_gen_src_code` is True, the src code is returned instead of being codegen'd into the wrapper + """ + _, (numel, rnumel) = template_node.group + assert rnumel == 1 + kernel, render = template_node.node.make_kernel_render(template_node.node) + with kernel: + if not only_gen_src_code: + for node in [template_node, *epilogue_nodes]: + node.mark_run() + partial_code = render() + for node in epilogue_nodes: + node.codegen(kernel.split_and_set_ranges(node.get_ranges())) + + # finalize must be called after adding epilogue above + with V.set_kernel_handler(kernel): + # TODO: Maybe unify CUDATemplateKernel to also use PartialRender for flexible epilogue fusion. 
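Note: `current_reduction_nodes` above leans on `itertools.takewhile`: it yields schedule entries up to, but not including, the next `DisableReduction` marker, so `set_last_usage` only sees the nodes of the current reduction loop. A toy demonstration (the schedule contents are hypothetical; the marker is the class object itself, as in the scheduler):

    import itertools

    class DisableReduction:  # stand-in for the scheduling marker class
        pass

    schedule = ["node_a", "node_b", DisableReduction, "node_c"]
    prefix = list(itertools.takewhile(lambda n: n is not DisableReduction, schedule))
    assert prefix == ["node_a", "node_b"]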
+ src_code = ( + partial_code + if isinstance(partial_code, str) + else partial_code.finalize() + ) + node_schedule = [template_node, *epilogue_nodes] + + if config.benchmark_kernel: + num_gb = kernel.estimate_kernel_num_bytes() / 1e9 + grid_args = V.graph.sizevars.size_hints(kernel.call_sizes) + assert kernel.meta is not None, "meta is None" + grid = kernel.grid_fn(*grid_args, kernel.meta) + src_code = ( + f"{kernel.imports_for_benchmark_kernel()}\n" + f"{src_code}\n" + f"{kernel.codegen_kernel_benchmark(num_gb, grid).getvalue()}" + ) + + if only_gen_src_code: + return src_code + + kernel_name = self.define_kernel(src_code, node_schedule, kernel) + + self.codegen_comment(node_schedule) + kernel.call_kernel(kernel_name, template_node.node) + V.graph.removed_buffers |= kernel.removed_buffers + V.graph.inplaced_to_remove |= kernel.inplaced_to_remove + self.scheduler.free_buffers() + return None + + def codegen_sync(self): + V.graph.wrapper_code.writeline(V.graph.device_ops.synchronize()) + + def codegen_foreach(self, foreach_node): + from .triton_foreach import ForeachKernel + + for partitions_with_metadata in ForeachKernel.horizontal_partition( + foreach_node.get_subkernel_nodes(), self + ): + kernel = ForeachKernel() + for nodes, tiled_groups, numel, rnumel in partitions_with_metadata: + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + ( + reduction_hint_val, + mutations, + index_dtype, + ) = self.get_kernel_args(node_schedule, numel, rnumel) + + subkernel = kernel.create_sub_kernel( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel( + node_schedule, + subkernel, + ) + + with V.set_kernel_handler(subkernel): + for node in node_schedule: + if node not in (EnableReduction, DisableReduction): + node.mark_run() + V.graph.removed_buffers |= subkernel.removed_buffers + V.graph.inplaced_to_remove |= subkernel.inplaced_to_remove + + src_code = kernel.codegen_kernel() + kernel_name = self.define_kernel(src_code, [foreach_node], kernel) + self.codegen_comment([foreach_node]) + kernel.call_kernel(V.graph.wrapper_code, kernel_name) + + self.scheduler.free_buffers() + + @staticmethod + @functools.lru_cache(32) + def candidate_tilings(node): + ranges, reduction_ranges = node.get_ranges() + if len(ranges) <= 1: + return () + + rw = node.pointwise_read_writes() + assert len(rw.range_vars) == len(ranges) + + # isinstance(dep, MemoryDep): this filters out StarDeps. 
StarDeps refer to reads + # that need to access the entire tensor; they don't contribute read indexing + # information (and practically, they don't have dep.index so they can't be used + # for stride_hints below) + dep_sources = [rw.reads, rw.writes] + assert all( + isinstance(dep, (MemoryDep, StarDep)) + for dep in itertools.chain.from_iterable(dep_sources) + ) + deps = [ + dep + for dep in itertools.chain.from_iterable(dep_sources) + if dep.name not in V.graph.removed_buffers and isinstance(dep, MemoryDep) + ] + write_names = {dep.name for dep in rw.writes} + + tilings: List[CandidateTiling] = [] + + for dep in deps: + strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) + assert len(strides) == len(ranges) + try: + split = strides.index(1) + 1 + if split == len(ranges): + continue + if all(s == 0 for s in strides[split:]): + # if this is a broadcasted tensor and all dimensions after split are broadcast, + # this is not a real split + continue + + except ValueError: + continue + tiled_groups = ( + V.graph.sizevars.simplify(sympy_product(ranges[:split])), + V.graph.sizevars.simplify(sympy_product(ranges[split:])), + ) + # score by number of elements + score = V.graph.sizevars.size_hint( + sympy_product( + size for size, stride in zip(ranges, strides) if stride != 0 + ) + ) + if dep.name in write_names: + # ngimel said contiguous writes are more important than reads + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[0]): + score *= 2 + if CandidateTiling.is_good_size(tiled_groups[1]): + score *= 2 + + if ( + V.graph.sizevars.size_hint( + score - sympy_product(itertools.chain(ranges, reduction_ranges)) + ) + >= 0 + ): + tilings.append(CandidateTiling(tiled_groups, score, dep.name)) + return tilings + + @classmethod + def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)): + """ + Heuristics to decide how to tile kernels. + Currently, we tile based on stride-1 dimensions. + + Returns: + `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel` + + """ + if reduction_numel != 1 or config.triton.max_tiles <= 1: + # TODO(jansel): should we tile reductions? + # do perf hint here if stride-1 dim is not being reduced + if perf_hint_log.level <= logging.WARNING: + for node in EnableReduction.filter(node_schedule): + if len(cls.candidate_tilings(node)) > 0: + perf_hint_log.info("reduction over non-contiguous dims") + break + return (numel, reduction_numel) + + seen_names = set() + candidate_tiles: Counter[Any] = collections.Counter() + for node in EnableReduction.filter(node_schedule): + for tiling in cls.candidate_tilings(node): + if tiling.name in seen_names: + continue + seen_names.add(tiling.name) + candidate_tiles[tiling.tiling] += tiling.score + + ranked_tilings = [tiling for tiling, score in candidate_tiles.most_common()] + + if config.triton.max_tiles >= 3: + # Consider adding a third dimension of tiling, but only + # when a1 is a multiple of b1; otherwise, you have a lot + # of stragglers which is annoying to generate code for. + # + # NB: More than three max tiles is not enabled by default. 
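Note: a worked instance of the 3D-tiling rewrite below, with hypothetical sizes for a numel of 262144: if the top-ranked tiling is (a0, a1) = (64, 4096) and the runner-up is (b0, b1) = (1024, 256), then a1 is a multiple of b1 and the combined candidate becomes (a0, a1 // b1, b1):

    a0, a1 = 64, 4096   # highest-scoring 2D tiling, a0 * a1 == numel
    b0, b1 = 1024, 256  # runner-up, b0 * b1 == numel
    assert a1 > b1 and a1 % b1 == 0
    tiling3d = (a0, a1 // b1, b1)  # mirrors (a0, FloorDiv(a1, b1), b1)
    assert tiling3d == (64, 16, 256)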
+ + # Add one 3D tiling choice + for i in range(1, len(ranked_tilings)): + a0, a1 = ranked_tilings[0] + b0, b1 = ranked_tilings[i] + if V.graph.sizevars.size_hint(a1 - b1) == 0: + continue + if V.graph.sizevars.size_hint(a1 - b1) < 0: + # swap so a0 is bigger + a0, a1 = ranked_tilings[i] + b0, b1 = ranked_tilings[0] + assert V.graph.sizevars.size_hint(a1 - b1) > 0 + if V.graph.sizevars.statically_known_multiple_of(a1, b1): + tiling = (a0, FloorDiv(a1, b1), b1) + ranked_tilings = [tiling] + ranked_tilings + break # only 1 choice for now + + if len(ranked_tilings) > 1: + perf_hint_log.info("possibly bad tiling: %s", ranked_tilings) + + for tiled_groups in ranked_tilings: + new_groups = (*tiled_groups, reduction_numel) + if all( + SIMDKernel.is_compatible(new_groups, node.get_ranges()) + for node in node_schedule + if isinstance(node, scheduler.SchedulerNode) + ): + return new_groups + + return (numel, reduction_numel) + + def flush(self): + pass + + def ready_to_flush(self) -> bool: + return False + + def generate_kernel_code_from_nodes(self, nodes, benchmark_kernel=False): + @dataclasses.dataclass + class LastUsageHolder: + n: Any + last_usage: Any + + def __del__(self): + self.n.last_usage = self.last_usage + + last_usage_holders = [LastUsageHolder(n, n.last_usage) for n in nodes] + + # empty last_usage. May cause more aggressive 'evict_last'. Should be fine. + for n in nodes: + n.last_usage = set() + + if not nodes[0].is_template(): + _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group + node_schedule = self.generate_node_schedule(nodes, numel, rnumel) + + tiled_groups = self.select_tiling(node_schedule, numel, rnumel) + reduction_hint_val, mutations, index_dtype = self.get_kernel_args( + node_schedule, numel, rnumel + ) + + kernel = self.kernel_type( + *tiled_groups, + reduction_hint=reduction_hint_val, + mutations=mutations, + index_dtype=index_dtype, + ) + + self.codegen_node_schedule_with_kernel(node_schedule, kernel) + with config.patch( + "benchmark_kernel", benchmark_kernel + ), V.set_kernel_handler(kernel): + src_code = kernel.codegen_kernel() + else: + template_node = nodes[0] + epilogue_nodes = nodes[1:] + + with config.patch("benchmark_kernel", benchmark_kernel): + src_code = self.codegen_template( + template_node, epilogue_nodes, only_gen_src_code=True + ) + + src_code = src_code.replace(str(Placeholder.KERNEL_NAME), "triton_") + return src_code + + def codegen_comment(self, node_schedule): + pass + + def define_kernel(self, src_code, node_schedule, kernel): + raise NotImplementedError + + +@dataclasses.dataclass +class CandidateTiling: + tiling: Tuple[sympy.Expr, sympy.Expr] + score: int # higher is better + name: Optional[str] = None + + @staticmethod + def is_good_size(s): + """Somewhat arbitrary heuristic used to boost scores for some sizes""" + s = V.graph.sizevars.size_hint(s) + return s >= 32 and (s % 32 == 0) + + +class DisableReduction: + """ + Marker to invoke `kernel.disable_reduction()`. This closes a + reduction loop and allows for pointwise ops to occur on the output + of a reduction. + """ + + +class EnableReduction: + """ + Marker to end a DisableReduction block. + """ + + @staticmethod + def filter(node_schedule): + """ + Get the nodes from node_schedule skipping those in a + DisableReduction block. 
+ """ + disabled = False + for node in node_schedule: + if node in (EnableReduction, DisableReduction): + # Don't tile stuff outside the main reduction loop + disabled = node is DisableReduction + elif disabled: + pass + else: + yield node + + +class CantSplit(Exception): + pass diff --git a/torch/_inductor/codegen/triton.py b/torch/_inductor/codegen/triton.py index ac6699675af1..183d28605b87 100644 --- a/torch/_inductor/codegen/triton.py +++ b/torch/_inductor/codegen/triton.py @@ -1,30 +1,13 @@ from __future__ import annotations -import collections -import contextlib import dataclasses import functools import itertools import logging -import math -import operator import os import textwrap from functools import lru_cache -from typing import ( - Any, - Callable, - cast, - Counter, - DefaultDict, - Dict, - Iterable, - List, - Optional, - Set, - Tuple, - Union, -) +from typing import Any, Callable, cast, Dict, List, Optional, Set, Tuple, Union import sympy @@ -33,42 +16,24 @@ import torch.utils._pytree as pytree from torch._dynamo.utils import preserve_rng_state -from torch._inductor.metrics import is_metric_table_enabled, log_kernel_metadata from torch._inductor.runtime.hints import AutotuneHint, DeviceProperties from torch._prims_common import is_integer_dtype -from torch.utils._sympy.functions import FloorDiv, ModularIndexing -from torch.utils._sympy.symbol import free_symbol_is_type, symbol_is_type, SymT -from torch.utils._sympy.value_ranges import ValueRanges from torch.utils._triton import has_triton_package +from ...utils._sympy.value_ranges import ValueRanges -from ..._dynamo.utils import counters -from .. import config, ir, scheduler +from .. import config, ir from ..codecache import code_hash, get_path, PyCodeCache -from ..dependencies import Dep, MemoryDep, StarDep, WeakDep -from ..ir import IRNode, TritonTemplateBuffer -from ..optimize_indexing import indexing_dtype_strength_reduction +from ..ir import IRNode +from ..metrics import is_metric_table_enabled, log_kernel_metadata from ..runtime.hints import ReductionHint, TRITON_MAX_BLOCK -from ..runtime.runtime_utils import ( - do_bench_gpu, - get_max_y_grid, - green_text, - next_power_of_2, - yellow_text, -) -from ..scheduler import BaseSchedulerNode, BaseScheduling, WhyNoFuse +from ..runtime.runtime_utils import do_bench_gpu, next_power_of_2 from ..utils import ( cache_on_self, get_bounds_index_expr, - get_dtype_size, get_fused_kernel_name, get_kernel_metadata, is_welford_reduction, Placeholder, - sympy_dot, - sympy_index_symbol, - sympy_product, - sympy_subs, - unique, ) from ..virtualized import _ops as ops, OpsHandler, ReductionType, StoreMode, V from ..wrapper_benchmark import get_kernel_category_by_source_code @@ -77,17 +42,21 @@ CSEVariable, DeferredLine, IndentedBuffer, - index_prevent_reordering, - Kernel, OpOverrides, PythonPrinter, SizeArg, TensorArg, ) -from .multi_kernel import MultiKernel +from .simd import ( + IndexingOptions, + IterationRangesEntry, + pexpr, + SIMDKernel, + SIMDScheduling, + triton_constant, +) from .triton_utils import config_of, signature_of, signature_to_meta - log = logging.getLogger(__name__) perf_hint_log = torch._logging.getArtifactLogger(__name__, "perf_hints") schedule_log = torch._logging.getArtifactLogger(__name__, "schedule") @@ -133,27 +102,6 @@ def gen_common_triton_imports(): return imports.getvalue() -@dataclasses.dataclass -class IndexingOptions: - index_str: str - mask_vars: Set[sympy.Symbol] - mask_str: str - expand_str: Optional[str] - _has_rindex: bool - - def has_mask(self): - 
return bool(self.mask_vars) - - def has_rindex(self): - return self._has_rindex - - def has_tmpmask(self): - return "tmp" in self.mask_str - - def has_rmask(self): - return "rmask" in self.mask_str - - @dataclasses.dataclass class BlockPtrOptions: constant_offset: sympy.Expr @@ -416,7 +364,6 @@ def _print_RoundDecimal(self, expr): texpr = TritonPrinter().doprint -pexpr = PythonPrinter().doprint def triton_compute_type(dtype): @@ -455,16 +402,6 @@ def triton_acc_type(dtype): return triton_compute_type(dtype) -def triton_constant(value): - if value == float("inf"): - return 'float("inf")' - elif value == float("-inf"): - return 'float("-inf")' - elif math.isnan(value): - return 'float("nan")' - return repr(value) - - class TritonCSEVariable(CSEVariable): def __init__(self, name, bounds: ValueRanges[Any]): super().__init__(name, bounds) @@ -487,9 +424,6 @@ def update_on_args(self, name, args, kwargs): # those reads should subsequently be masked, self.mask_vars.update({f"{arg.name[0]}mask"}) - def __repr__(self): - return f"TritonCSEVariable(name={self.name})" - class TritonOverrides(OpOverrides): """Map element-wise ops to Triton""" @@ -965,283 +899,6 @@ def _typecheck_TritonKernelOverrides(h: TritonKernelOverrides) -> OpsHandler[str return h -@dataclasses.dataclass -class IterationRanges: - """ - Each range tree represents multiple sets of iteration indexing - in a single tiled dimension in the output kernel. - - If you have two loops ranges one (4, 3, 2) and another (4, 6), - then the range tree will be: - 4 (i0) - 3 (i1) 6 (i3) - 2 (i2) - Where i0 is shared between both loops, but then the split into - different indexing vars. All loop ranges must iterate over - the same number of elements. - """ - - def __init__( - self, - name: str, - var_list: List[sympy.Symbol], - var_ranges: Dict[sympy.Symbol, sympy.Expr], - numel: sympy.Expr, - prefix: str, - *, - kernel: TritonKernel, - divisor=sympy.Integer(1), - length=sympy.Integer(1), - root: IterationRangesRoot, - ): - super().__init__() - self.name = name - self.var_list = var_list - self.var_ranges = var_ranges - self.numel = numel - self.prefix = prefix - self.divisor = divisor - self.length = length - self.kernel = kernel - self.root = root - - def symbol(self): - return sympy_index_symbol(self.name) - - -class IterationRangesRoot(IterationRanges): - def __init__( - self, - name: str, - numel: sympy.Expr, - # TODO: this is probably SymTy.INDEX and SymTy.RINDEX - prefix: str, - index: int, - kernel: TritonKernel, - pid_cache=None, - *, - is_loop: bool, - tensor_dim: Optional[int], - grid_dim: Optional[int], - has_zdim: bool, - ): - if pid_cache is None: - pid_cache = {} - super().__init__( - name=name, - var_list=[], - var_ranges={}, - numel=numel, - prefix=prefix, - kernel=kernel, - root=self, - ) - self.index = index - # Store all the nodes in one flat list - self.nodes: Dict[sympy.Expr, IterationRangesEntry] = {} - # This is for re-ordering program ID in triton mm template - # pid_cache["tl.program_id(0)"] = pid_m - self.pid_cache: Dict[str, str] = pid_cache - - # True if the dimension is implemented as a single program looping over - # the full dimension (currently only used for non-persistent reduction) - assert not is_loop or (prefix == "r" and grid_dim is None) - self.is_loop = is_loop - # Index of corresponding dimension on triton tensors - self.tensor_dim = tensor_dim - # Index of corresponding dimension in the triton grid - self.grid_dim = grid_dim - self.has_zdim = has_zdim - - def __repr__(self): - return 
f"IterationRangesRoot({self.name!r}, {self.numel}, ...)" - - def cache_clear(self): - for node in self.nodes.values(): - node.cache_clear() - - def lookup(self, divisor, length): - """ - Lookup a given RangeTreeEntry, creating it if needed - """ - if V.graph.sizevars.statically_known_equals(divisor * length, self.numel): - expr = FloorDiv(sympy_index_symbol(f"{self.prefix}index"), divisor) - else: - expr = ModularIndexing( - sympy_index_symbol(f"{self.prefix}index"), divisor, length - ) - - if expr not in self.nodes: - node = IterationRangesEntry( - f"{self.prefix}{next(V.kernel.iter_vars_count)}", - divisor, - length, - expr, - self, - ) - V.kernel.range_tree_nodes[node.symbol()] = node - self.var_list.append(node.symbol()) - self.var_ranges[node.symbol()] = length - self.nodes[expr] = node - return self.nodes[expr] - - def construct_entries(self, lengths: List[sympy.Expr]): - divisor = sympy.Integer(1) - itervars = [] - for length in reversed(lengths): - itervars.append(self.lookup(divisor, length)) - divisor = divisor * length - return list(reversed(itervars)) - - def construct(self, lengths: List[sympy.Expr]): - return [e.symbol() for e in self.construct_entries(lengths)] - - def vars_and_sizes(self, index: sympy.Expr): - """Figure out vars from this tree used in index""" - nodes = [V.kernel.range_tree_nodes.get(s) for s in index.free_symbols] - nodes = [n for n in nodes if n and n.prefix == self.prefix] - nodes.sort(key=lambda x: V.graph.sizevars.size_hint(x.divisor)) - divisor = sympy.Integer(1) - index_vars = [] - sizes = [] - - def add(node): - nonlocal divisor - index_vars.append(node.symbol()) - sizes.append(node.length) - divisor = divisor * node.length - - for node in nodes: - if not V.graph.sizevars.statically_known_equals(node.divisor, divisor): - # fill in unused index var - add(self.lookup(divisor, FloorDiv(node.divisor, divisor))) - divisor = node.divisor - add(node) - if not V.graph.sizevars.statically_known_equals(self.numel, divisor): - # fill in unused index var - add(self.lookup(divisor, FloorDiv(self.numel, divisor))) - - return list(reversed(index_vars)), list(reversed(sizes)) - - def ranges_code(self): - assert self.tensor_dim is not None - size = self.kernel.indexing_size_str(self.tensor_dim) - index_dtype = self.kernel.index_dtype - convert = f".to({index_dtype})" if index_dtype != "tl.int32" else "" - return f"tl.arange(0, {self.prefix.upper()}BLOCK){size}{convert}" - - def scalar_code(self, value): - index_dtype = self.kernel.index_dtype - ndim = self.kernel.triton_tensor_ndim() - size = [1] * ndim - return f"tl.full({size}, {value}, {index_dtype})" - - def get_pid(self): - assert self.grid_dim is not None - key = f"tl.program_id({self.grid_dim})" - # y_grid has a limit, so express it in terms of y and z in case of overflow. - # z grid is only exercised when max_tiles == 3 (off by default). 
- if ( - self.grid_dim == 1 - and not self.has_zdim - and not (isinstance(self.numel, int) and self.numel <= get_max_y_grid()) - ): - key = f"{key} * (tl.program_id({self.grid_dim + 1}) + 1)" - pid = self.pid_cache.get(key, key) - if self.kernel.index_dtype != "tl.int32": - return f"{pid}.to({self.kernel.index_dtype})" - return pid - - def codegen_header(self, code): - x = self.prefix - if self.is_loop: - code.writeline(f"{self.name} = {x}offset + {x}base") - elif self.grid_dim is None: - # no need to "{x}offset = " - code.writeline(f"{self.name} = {self.ranges_code()}") - code.writeline(f"{x}offset = 0") - else: - if self.tensor_dim is not None: - line = f"{x}offset + {self.ranges_code()}" - else: - line = self.scalar_code(f"{x}offset") - code.writelines( - [ - f"{x}offset = {self.get_pid()} * {x.upper()}BLOCK", - f"{self.name} = {line}", - ] - ) - code.writeline(f"{x}mask = {self.name} < {x}numel") - - -class IterationRangesEntry(IterationRanges): - def __init__( - self, - name: str, - divisor: sympy.Expr, - length: sympy.Expr, - expr: sympy.Expr, - parent: IterationRanges, - ): - super().__init__( - name=name, - numel=parent.numel / length, - var_list=parent.var_list, - var_ranges=parent.var_ranges, - prefix=parent.prefix, - divisor=divisor, - length=length, - kernel=parent.kernel, - root=parent.root, - ) - self.parent = parent - self.codegen = functools.lru_cache(None)(self._codegen) - self.expr = expr - - def __repr__(self): - return f"IterationRangesEntry({self.name}, {self.divisor}, {self.length}, {self.expr}, {self.var_ranges})" - - def set_name(self, name): - self.codegen = lambda: name # type: ignore[assignment] - self.codegen.cache_clear = lambda: None # type: ignore[method-assign] - self.name = name - - def cache_clear(self): - self.codegen.cache_clear() - - def writeline(self, line): - if self.root.is_loop: - V.kernel.indexing_code.writeline(line) - else: - # lift non-reduction stores outside loop - V.kernel.body.writeline(line) - - def _codegen(self): - self.writeline(f"{self.name} = " + texpr(V.kernel.rename_indexing(self.expr))) - return self.name - - def precomputed_args(self): - # for dynamic shapes, find parts of indexing expressions that have to be precomputed - precomputed_args: List[sympy.Expr] = [] - if isinstance(self.expr, sympy.Symbol): - return precomputed_args - assert isinstance(self.expr, (FloorDiv, ModularIndexing)), type(self.expr) - for arg in self.expr.args[1:]: - if not isinstance(arg, (sympy.Integer, sympy.Symbol)): - symbols = arg.free_symbols - if len(symbols) > 0 and all( - symbol_is_type(s, SymT.SIZE) for s in symbols - ): - precomputed_args.append(arg) - return precomputed_args - - def __hash__(self): - return hash(self.name) - - def __eq__(self, other): - return self.name == other.name - - class HelperFunctions: """An ordered set of helper functions.""" @@ -1281,11 +938,11 @@ def __getitem__(self, idx): return self.finalized_helpers[idx] -class TritonKernel(Kernel): +class TritonKernel(SIMDKernel): overrides = TritonKernelOverrides # type: ignore[assignment] - sexpr = pexpr - helper_functions: HelperFunctions + kexpr: Callable[[sympy.Expr], str] = texpr + allow_block_ptr = True def __init__( self, @@ -1297,54 +954,35 @@ def __init__( min_elem_per_thread=0, disable_persistent_reduction=False, ): - if pid_cache is None: - pid_cache = {} - super().__init__() - self.numels = [V.graph.sizevars.simplify(s) for s in groups] - self.mutations: Set[str] = mutations if mutations is not None else set() - self.range_trees: List[IterationRangesRoot] = [] - 
self.range_tree_nodes: Dict[sympy.Symbol, IterationRangesEntry] = {} - self.iter_vars_count = itertools.count() - self.inside_reduction = self.numels[-1] != 1 - self.body = IndentedBuffer() - self.indexing_code = IndentedBuffer() + super().__init__( + *groups, + index_dtype=index_dtype, + mutations=mutations, + reduction_hint=reduction_hint, + pid_cache=pid_cache, + disable_persistent_reduction=disable_persistent_reduction, + ) self.suffix: IndentedBuffer = IndentedBuffer() # type: ignore[assignment] self.outside_loop_vars: Set[Any] = set() - self.reduction_hint = reduction_hint - self.index_dtype: str = index_dtype self.min_elem_per_thread = min_elem_per_thread - self.last_usage: Set[str] = set() self.block_ptr_id = itertools.count() - # buffer accesses in the kernel - self.buf_accesses: DefaultDict[str, List[Dep]] = collections.defaultdict(list) - - self.persistent_reduction: bool = ( - not disable_persistent_reduction - ) and self.should_use_persistent_reduction() - self.no_x_dim = ( - self.reduction_hint == ReductionHint.INNER - and self.persistent_reduction - and len(self.numels) == 2 - and self.numels[-1] >= 256 - ) - self.initialize_range_tree(pid_cache) - self.helper_functions = HelperFunctions() # A set of autotuning hints to pass as part of triton_meta self.autotune_hints: Set[AutotuneHint] = set() + self.triton_meta: Optional[Dict[str, object]] = None - # define this in a closure to make cache local to object - @functools.lru_cache(None) - def simplify_indexing(index: sympy.Expr): - index = V.graph.sizevars.simplify_with_ranges(index, self.var_ranges()) - for tree in self.range_trees: - index = self.combine_contiguous_dims(index, tree) - return index + self.codegen_range_tree() - self.simplify_indexing = simplify_indexing - self.code_hash = None - self.triton_meta: Optional[Dict[str, object]] = None + def codegen_range_tree(self): + for tree in self.range_trees: + # reduction indexing goes inside a loop + if not tree.is_loop: + tree.codegen_header(self.body) + if self.inside_reduction and self.range_trees[-1].is_loop: + # workaround for this issue: + # https://gist.github.com/jansel/6527126f781559095c5531f98a4235a7 + self.body.writeline(f"rbase = {self.range_trees[-1].ranges_code()}") def need_numel_args(self): r""" @@ -1384,507 +1022,21 @@ def should_use_persistent_reduction(self) -> bool: V.graph.sizevars.guard_leq(self.numels[-1], next_power_of_2(hint)) # type: ignore[arg-type] return True - def set_last_usage(self, nodes): - if not self.inside_reduction or self.persistent_reduction: - return - self.last_usage = set( - itertools.chain.from_iterable( - n.last_usage for n in nodes if n is not EnableReduction - ) - ) - - def initialize_range_tree(self, pid_cache): - no_r_dim = not self.inside_reduction or self.numels[-1] == 1 - - prefixes = "zyxr" - active_prefixes = prefixes[-len(self.numels) :] - - grid_dims = "xyz" - if self.no_x_dim: - tensor_dims = "r" - elif no_r_dim: - tensor_dims = "xyz" - else: - tensor_dims = "xyzr" - - tensor_dims = "".join(p for p in tensor_dims if p in active_prefixes) - - for i, prefix in enumerate(active_prefixes): - is_reduction = prefix == "r" - tensor_dim = tensor_dims.find(prefix) if prefix in tensor_dims else None - grid_dim = None if is_reduction else grid_dims.find(prefix) - index = i if grid_dim is None else grid_dim - self.range_trees.append( - IterationRangesRoot( - f"{prefix}index", - self.numels[i], - prefix, - index, - self, - pid_cache=pid_cache, - is_loop=is_reduction and not self.persistent_reduction, - tensor_dim=tensor_dim, - 
grid_dim=grid_dim, - has_zdim="z" in active_prefixes, - ) - ) - for tree in self.range_trees: - # reduction indexing goes inside a loop - if not tree.is_loop: - tree.codegen_header(self.body) - if self.inside_reduction and self.range_trees[-1].is_loop: - # workaround for this issue: - # https://gist.github.com/jansel/6527126f781559095c5531f98a4235a7 - self.body.writeline(f"rbase = {self.range_trees[-1].ranges_code()}") - - def disable_reduction(self): - should_flush = self.range_trees[-1].is_loop - - @contextlib.contextmanager - def ctx(): - if self.numels[-1] == 1: - assert not self.inside_reduction - yield - return - if should_flush: - # calling codegen_body() will flush all the pending buffers - # and write out a reduction loop - self.codegen_body() - self.inside_reduction = False - try: - yield - if should_flush: - # flush out any code before opening the next loop - self.codegen_body() - finally: - self.inside_reduction = True - - return ctx() - - def set_ranges(self, *lengths): - assert len(lengths) == len(self.range_trees) - return [ - ranges.construct(length) - for length, ranges in zip(lengths, self.range_trees) - ] - - @staticmethod - def _split_iteration_ranges( - groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] - ): - sv = V.graph.sizevars - new_ranges: List[List[sympy.Expr]] = [[] for _ in groups] - remaining = [sv.simplify(g) for g in groups] - var_count = itertools.count() - - def add_range(i, expr): - expr = sv.simplify(expr) - if not sv.statically_known_multiple_of(remaining[i], expr): - raise CantSplit - # guard on the last item out - remaining[i] = FloorDiv(remaining[i], expr) - new_ranges[i].append(expr) - return next(var_count) - - def make_combined(size, idx1, idx2): - def getter(flat_vars): - return size * flat_vars[idx1] + flat_vars[idx2] - - return getter - - return_getters_groups = [] - current_group = 0 - for length_group in lengths: - return_getters = [] - for size in length_group: - if sv.statically_known_equals(size, 1): # type: ignore[arg-type] - return_getters.append(lambda _: sympy.Integer(0)) - continue - - while ( - current_group < len(remaining) - and sv.size_hint(remaining[current_group]) == 1 - ): - # scroll to next group with remaining elements - current_group += 1 - - if sv.size_hint(size) > sv.size_hint(remaining[current_group]): - # need to break size in two - if not sv.statically_known_multiple_of( - size, remaining[current_group] - ): - raise CantSplit - size1 = remaining[current_group] - size2 = FloorDiv(size, remaining[current_group]) - return_getters.append( - make_combined( - size2, - add_range(current_group, size1), - add_range(current_group + 1, size2), - ) - ) - else: - return_getters.append( - operator.itemgetter(add_range(current_group, size)) - ) - return_getters_groups.append(return_getters) - - assert all( - V.graph.sizevars.size_hint(s) == 1 for s in remaining - ), f"failed to set ranges {remaining} {lengths}" - - return new_ranges, return_getters_groups - - @classmethod - def is_compatible( - cls, groups: Iterable[sympy.Expr], lengths: List[List[sympy.Expr]] - ): - try: - cls._split_iteration_ranges(groups, lengths) - return True - except CantSplit: - return False - - def split_and_set_ranges(self, lengths: List[List[sympy.Expr]]): - """ - We may want to fuse `for i0 in s0*s1` into a tiled kernel with groups (s0, s1). - - To do this we need to split up the iteration space of i0 into something like: - for i1 in s0: - for i2 in s1: - i0 = i1*s1 + i2 - .... 
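Concretely, the resplit identity in the docstring above can be checked in a few lines of plain Python; constant sizes stand in for the sympy expressions the real helper manipulates, and the recombination line mirrors the getter built by make_combined below:

# Tile the flat space range(s0 * s1) as (s0, s1) and recombine indices.
s0, s1 = 4, 8
flat = []
for i1 in range(s0):
    for i2 in range(s1):
        i0 = i1 * s1 + i2  # the recombination make_combined's getter performs
        assert (i0 // s1, i0 % s1) == (i1, i2)  # FloorDiv / ModularIndexing inverse
        flat.append(i0)
assert flat == list(range(s0 * s1))  # the tiling covers the flat space exactly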
- - This function matches and resplits lengths to the groups of - this kernel to enable tiled + non-tiled fusions. - """ - groups = [rt.numel for rt in self.range_trees] - if not self.inside_reduction: - groups[-1] = sympy.Integer(1) - - if len(lengths) == len(self.range_trees) and all( - V.graph.sizevars.simplify(sympy_product(x) - g) == 0 - for x, g in zip(lengths, groups) - ): - return self.set_ranges(*lengths) - - new_ranges, return_getters_groups = self._split_iteration_ranges( - groups, lengths - ) - itervars = list(itertools.chain.from_iterable(self.set_ranges(*new_ranges))) - return [[fn(itervars) for fn in fns] for fns in return_getters_groups] - - def is_indirect_indexing(self, index: sympy.Expr): - # tmpX means indirect indexing - return free_symbol_is_type(index, SymT.TMP) - - def is_broadcasted(self, index: sympy.Expr): - # Note. This may not be correct when there is indirect indexing - if self.is_indirect_indexing(index): - return False - - index_numels = [1] * len(self.numels) - for symbol in index.free_symbols: - if symbol not in self.range_tree_nodes: - # Non-iterated variables, e.g. strides - continue - entry = self.range_tree_nodes[symbol] # type: ignore[index] - assert isinstance(entry.parent, IterationRangesRoot) - index_numels[entry.parent.index] *= entry.length - - # If the index variables only iterate over a subset of the kernel - # numels, then it must be broadcasted. - simplify = V.graph.sizevars.simplify - return any( - simplify(idx_range) != simplify(iter_range) # type: ignore[arg-type] - for idx_range, iter_range in zip(index_numels, self.numels) - ) - - def combine_contiguous_dims(self, index: sympy.Expr, tree: IterationRangesRoot): - """ - More aggressive simplification to merge contiguous dims - """ - if isinstance(index, (sympy.Integer, sympy.Symbol)): - return index - index_vars, sizes = tree.vars_and_sizes(index) - if len(sizes) <= 1: - return index - new_sizes, reindex, prune = V.graph.sizevars._simplify_loops( - index_vars, sizes, index_prevent_reordering([index], index_vars, sizes) - ) - if new_sizes == sizes: - return index - new_index_vars = tree.construct(new_sizes) - new_index = sympy_subs(index, dict(zip(index_vars, reindex(new_index_vars)))) - return new_index - - def index_to_str(self, index: sympy.Expr) -> str: - """ - Convert an index expr to a string that can be used in triton code. - e.g. a sympy expression "s2" may actually appear as "ks1" in the triton kernel. - - Index expressions often need to be passed in as arguments to the triton kernel. - Rename_indexing and codegen_indexing keep track of the needed indices and add - new parameters to the function signature. 
- """ - if isinstance(index, list): - return f"[{', '.join(map(self.index_to_str, index))}]" - return texpr(self.rename_indexing(self.codegen_indexing(index))) - - def indexing( - self, - index: sympy.Expr, - *, - copy_shape=None, - dense_indexing=False, - override_mask=None, - block_ptr=False, - ) -> Union[IndexingOptions, BlockPtrOptions]: - """ - Compute the index and mask to pass to tl.load() or tl.store() - """ - index = self.simplify_indexing(index) - index = sympy_subs(index, V.graph.sizevars.precomputed_replacements) - # if simple replacements didn't get rid of floor/ceil, try full subs - if len(index.atoms(sympy.floor)) or len(index.atoms(sympy.ceiling)): - index = index.subs(V.graph.sizevars.precomputed_replacements) - # last resort, if no range vars are in the expr, hoist it - # TODO instead of trying to blindly find complicated exprs, we should hoist the - # inputs/outputs sizes and strides, but at the time indexing is generated - # kernel inputs and outputs are not set yet, we'd need a deeper refactor - # to do it this way - - if len(index.atoms(sympy.ceiling)): - for a in index.atoms(sympy.ceiling): - # for nested exprs, atoms yields top level first (?) - # so if everything goes fine, lower level replacements will come up empty - symbols = a.free_symbols - if len(symbols) > 0 and all( - symbol_is_type(s, (SymT.SIZE, SymT.PRECOMPUTED_SIZE)) - for s in symbols - ): - replacements = {a: V.graph.sizevars.lookup_precomputed_size(a)} - index = sympy_subs(index, replacements) - - index = self.simplify_indexing(index) - index_vars = index.free_symbols - has_rindex = False - - mask_vars: Set[str] = set() - for var in index_vars: - assert isinstance(var, sympy.Symbol) - has_rindex = has_rindex or symbol_is_type(var, SymT.RINDEX) - if override_mask: - pass - elif symbol_is_type(var, SymT.TMP): - # indirect indexing - cse_var = self.cse.varname_map[var.name] - mask_vars.update(cse_var.mask_vars) - elif symbol_is_type( - var, - ( - SymT.UNBACKED_INT, - SymT.SIZE, - SymT.PRECOMPUTED_SIZE, - SymT.INDEX, - SymT.FLOAT, - SymT.UNBACKED_FLOAT, - ), - ): - pass - else: - # var is one of xN, yN or rN - assert symbol_is_type( - var, (SymT.RINDEX, SymT.XBLOCK, SymT.YBLOCK) - ), var.name - mask_vars.add(f"{var.name[0]}mask") - - need_dense = ( - config.triton.dense_indexing - or dense_indexing - or self._load_mask is not None - ) and index != 0 - - have_dense = True - have_loop_vars = False - dense_mask_vars = set() - - for tree in self.active_range_trees(): - if index_vars.intersection(tree.var_list): - have_loop_vars = True - else: - have_dense = False - dense_mask_vars.add(f"{tree.prefix}mask") - - if ( - block_ptr - and config.triton.use_block_ptr - and not override_mask - and not self._load_mask - and len(mask_vars - dense_mask_vars) == 0 - and not self.is_indirect_indexing(index) - and have_loop_vars - # workaround https://github.com/openai/triton/issues/2821 - and self.index_dtype == "tl.int32" - ): - index_relative_to_xyr_index = sympy_subs( - index, {v: t.expr for v, t in self.range_tree_nodes.items()} - ) - range_trees = self.active_range_trees(reorder=True) - symbols = [t.symbol() for t in range_trees] - strides = [sympy.Wild(f"stride_{s}", exclude=symbols) for s in symbols] - offset = sympy.Wild("_offset", exclude=symbols) - m = index_relative_to_xyr_index.match(sympy_dot(symbols, strides) + offset) - # TODO(jansel): it is sometimes possible to do higher dimensional block_ptrs with - # a tl.reshape the correct block. We will miss these cases today. 
- if m: - self.filter_masks(mask_vars) - return BlockPtrOptions.create( - [m[s] for s in strides], - m[offset], - range_trees, - mask_vars, # type: ignore[arg-type] - ) - - expand_str = None - index_str = self.index_to_str(index) - if isinstance(index, sympy.Integer): - expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() - index_str = f"tl.full({expand_str}, {index_str}, tl.int32)" - return IndexingOptions(index_str, set(), "None", expand_str, has_rindex) - - if need_dense and not have_dense: - expand_str = f"{copy_shape}.shape" if copy_shape else self.dense_size_str() - index_str = f"tl.broadcast_to({index_str}, {expand_str})" - mask_vars = dense_mask_vars - elif not have_loop_vars and copy_shape: - index_str = f"tl.broadcast_to({index_str}, {copy_shape}.shape)" - mask_vars = dense_mask_vars - - if override_mask: - mask_vars = {override_mask} - - if self._load_mask: - mask_vars.add(self._load_mask) - - self.filter_masks(mask_vars) - - mask_str = " & ".join(sorted(map(str, mask_vars))) if mask_vars else "None" - return IndexingOptions(index_str, mask_vars, mask_str, expand_str, has_rindex) # type: ignore[arg-type] - - def active_range_trees(self, reorder=False): - trees = [ - t for t in self.range_trees if t.prefix != "r" or self.inside_reduction - ] - if reorder and len(trees) > 1: - count = sum(t.prefix in "xyz" for t in trees) - assert "".join(t.prefix for t in trees[:count]) == "zyx"[-count:], [ - t.prefix for t in trees[:count] - ] - trees[:count] = reversed(trees[:count]) - return trees - - def filter_masks(self, mask_vars): - for tree in self.range_trees: - # Masks are superfluous if we only have one element - if V.graph.sizevars.statically_known_equals(tree.numel, 1): # type: ignore[arg-type] - mask_vars.discard(f"{tree.prefix}mask") - continue - # Masks are superfluous if numel is a multiple of BLOCK - # (We use the fact that BLOCK is required by triton to be a power of 2) - if tree.prefix.upper() not in TRITON_MAX_BLOCK: - continue - max_block = TRITON_MAX_BLOCK[tree.prefix.upper()] - # Optional optimization: if block divides numel exactly, we will - # never need to do a masked load to handle stragglers at the end. - # It's faster to avoid masking at all. But it is sound to always - # mask. 
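The power-of-two argument in the comment above is worth spelling out: every legal BLOCK is a power of two no larger than TRITON_MAX_BLOCK, hence divides it, so if numel is a statically-known multiple of the max block there can never be a partially-filled block. A tiny sketch of the rule (illustrative names, not the diff's code):

def mask_needed(numel: int, max_block: int) -> bool:
    """True if an `index < numel` mask is required for some legal BLOCK size."""
    # If max_block divides numel, so does every power-of-two BLOCK <= max_block,
    # and all blocks are exactly full.
    return numel % max_block != 0

assert mask_needed(1000, 1024)       # last block has stragglers
assert not mask_needed(4096, 1024)   # 4096 % 1024 == 0: masking is superfluous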
- if V.graph.sizevars.statically_known_multiple_of(tree.numel, max_block): # type: ignore[arg-type] - mask_vars.discard(f"{tree.prefix}mask") - - def var_ranges(self): - return dict( - itertools.chain.from_iterable( - tree.var_ranges.items() for tree in self.range_trees - ) + def want_no_x_dim(self): + return ( + self.reduction_hint == ReductionHint.INNER + and self.persistent_reduction + and len(self.numels) == 2 + and self.numels[-1] >= 256 ) - def codegen_indexing(self, expr: sympy.Expr): - expr = V.graph.sizevars.simplify_with_ranges(expr, self.var_ranges()) - for sym in sorted(expr.free_symbols, key=str): - if sym in self.range_tree_nodes: - # if indexing expression is complicated, we precompute it on the host side - # and send the result as a kernel argument - replacements = {} - for ps in self.range_tree_nodes[sym].precomputed_args(): # type: ignore[index] - replacements[ps] = V.graph.sizevars.lookup_precomputed_size(ps) - if len(replacements) > 0: - self.range_tree_nodes[sym].expr = sympy_subs( # type: ignore[index] - self.range_tree_nodes[sym].expr, replacements # type: ignore[index] - ) - self.range_tree_nodes[sym].codegen() # type: ignore[index] - return expr - - @contextlib.contextmanager - def mask_loads(self, mask): - """Context manager to add an additional mask to tl.load/store""" - prior = self._load_mask - if prior: - mask = self.cse.generate(self.compute, f"{mask} & {prior}") - - self._load_mask = mask - try: - # TODO(jansel): do we need a reshape here? - yield mask - finally: - self._load_mask = prior - def generate_assert(self, check): return torch.version.hip is None and super().generate_assert(check) - def load_mask(self, var): - mask = "" - mask_vars = set(var.mask_vars) - if self._load_mask: - mask_vars.add(self._load_mask) - - if mask_vars: - mask = ( - f"{next(iter(mask_vars))}" - if len(mask_vars) == 1 - # sorted for deterministic order - else f"({' & '.join(sorted(map(str, mask_vars)))})" - ) - return mask - @property def assert_function(self) -> str: return "tl.device_assert" - def get_strides_of_load(self, index: sympy.Expr): - """ - This gets the stride of the index for each of the tiling variables - (technically, it does it at index 0) - - For example, if - xindex = x0 + 512*x1 + 1024*r0 - x0 = (xindex//512) - x1 = (xindex % 512) - r0 = rindex // 1024 - - this function would return - {xindex: 512, rindex: 1024} - """ - index_to_tile_indexes = {k: v.expr for k, v in self.range_tree_nodes.items()} - index_in_tile_vars = sympy_subs(index, index_to_tile_indexes) # type: ignore[arg-type] - strides = {} - for range_tree in self.range_trees: - s = sympy_index_symbol(range_tree.name) - strides[s] = sympy_subs(index_in_tile_vars, {s: 1}) - sympy_subs( - index_in_tile_vars, {s: 0} - ) - return strides - def codegen_block_ptr( self, name: str, var: str, indexing: BlockPtrOptions, other="" ) -> Tuple[str, Optional[DeferredLine], str]: @@ -2125,12 +1277,6 @@ def reduction_resize(self, value): sizes[-1] = "None" return f"{value}[{', '.join(sizes)}]" - @staticmethod - def _map_tuple_or_scalar(fn, value): - if isinstance(value, tuple): - return tuple(map(fn, value)) - return fn(value) - def reduction( self, dtype: torch.dtype, @@ -2662,7 +1808,7 @@ def codegen_kernel_benchmark(self, num_gb, grid=None): result.writeline("args = get_args()") result.writeline( - "ms = do_bench_gpu(lambda: call(args), rep=40, fast_flush=True)" + "ms = do_bench(lambda: call(args), rep=40, fast_flush=True)" ) result.writeline(f"num_gb = {num_gb}") result.writeline("gb_per_s = num_gb / (ms / 
1e3)") @@ -2684,68 +1830,6 @@ def imports_for_benchmark_kernel(self): ) ) - def estimate_kernel_num_bytes(self): - """ - Try the best to estimate the total size (in bytes) of the - kernel's inputs and outputs, which is used for estimating the memory - throughput of this kernel. This information is used for checking how - far we are from the peak memory bandwidth. It's important that - we want to avoid overestimating the sizes of the inputs and outputs, - because it can wrongfully give us a very large memory traffic value, - which may be even larger than the theoretical bandwidth and thus - become very misleading. This is particularly problematic for cases - where we slice some inputs. In those cases, we should only count - the size of the "slices" instead of the original inputs, because - only the slices contribute to the real memory traffic. - """ - nbytes = [] - ninplace_args = len(unique(self.args.inplace_buffers.values())) - _, call_args, _ = self.args.python_argdefs() - - # For pointwise and reduction kernels, this is the upper-bound numels - # for the output buffer. - # FIXME: This is not exactly right for cases like below: - # def foo(tensor0, tensor1): - # x0 = narrow(tensor0) - # return cat(x0, tensor1) - # For this example, we will end up overestimate the size for the - # slice s0. Potentially, we could have precise inputs information - # if we maintained the original inputs of the Pointwise kernel created - # for the "cat". However, I think it might be a bit overwhelming that - # we add such complexity only for handling some particular cases for - # benchmarking. - out_numel = V.graph.sizevars.size_hint(sympy_product(self.numels)) - for i, arg in enumerate(call_args): - # "buf" may be narrowed. In this case, the number of memory accesses - # should be estimated based on the reinterpreted layout. - # On the other hand, buf may be broadcasted. In this case, - # counting the size of the underline storage would give us - # a better estimation in terms of memory accesses. - if arg not in self.buf_accesses: - nbytes.append(0) - continue - arg_numel = V.graph.get_numel(arg) - buf_size = V.graph.sizevars.size_hint(arg_numel) - if buf_size > out_numel: - # This arg points to a buf that has been sliced. - # We need to count each individual slice to have - # a better estimation. 
- indices: Set[Any] = set() - no_index_dep_count = 0 - for dep in self.buf_accesses[arg]: - if isinstance(dep, (StarDep, WeakDep)): - indices.add(f"no_index_dep_{no_index_dep_count}") - no_index_dep_count += 1 - else: - indices.add(dep.index) - numel = len(indices) * out_numel - else: - numel = buf_size - dtype = V.graph.get_dtype(arg) - dtype_size = get_dtype_size(dtype) - nbytes.append(numel * dtype_size * (1 + int(i < ninplace_args))) - return sum(nbytes) - def _get_heuristic(self): if self.persistent_reduction: assert self.inside_reduction @@ -2991,28 +2075,6 @@ def KERNEL_NAME(in_ptr0, in_ptr1, out_ptr2, xnumel, rnumel, XBLOCK : tl.constexp if tree.prefix == "x" and self.no_x_dim: code.writeline("XBLOCK: tl.constexpr = 1") - def triton_tensor_ndim(self): - return sum(int(tree.tensor_dim is not None) for tree in self.range_trees) - - def indexing_size_str(self, i): - sizes = ["None"] * self.triton_tensor_ndim() - sizes[i] = ":" - return f"[{', '.join(sizes)}]" - - def dense_size_list(self) -> List[str]: - sizes = ["1"] * self.triton_tensor_ndim() - for tree in self.range_trees: - if tree.tensor_dim is None: - continue - - if tree.prefix != "r" or self.inside_reduction: - sizes[tree.tensor_dim] = f"{tree.prefix.upper()}BLOCK" - return sizes - - def dense_size_str(self): - sizes = self.dense_size_list() - return f"[{', '.join(sizes)}]" - def _get_grid_fn(self): return "grid" @@ -3076,439 +2138,22 @@ def codegen_nan_check(self): line = f"assert not {arg}.isinf().any().item()" wrapper.writeline(line) - def warn_mix_layout(self, kernel_name): - """ - Print message if the kernel have mixed layout inputs. - Only care about 4D tensor for now. - """ - if ( - len(self.args.input_buffers) == 1 - and len(self.args.output_buffers) == 1 - and len(self.args.inplace_buffers) == 0 - ): - # even if input buffer and output buffer have different layout, - # this can be a layout conversion kernel. No need to warn for - # the mix layouts. 
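To see what "mixed layout" means in this warning, it helps to compute a stride order by hand: rank each dimension by its stride and compare the rankings across kernel arguments. A rough stand-in for ir.get_stride_order (the helper below is hypothetical and may not match inductor's exact ordering convention):

import torch

def stride_order(t: torch.Tensor):
    dims_by_stride = sorted(range(t.dim()), key=t.stride)  # innermost stride first
    order = [0] * t.dim()
    for rank, dim in enumerate(dims_by_stride):
        order[dim] = rank
    return tuple(order)

a = torch.empty(2, 3, 4, 5)  # default contiguous layout
b = torch.empty(2, 3, 4, 5).to(memory_format=torch.channels_last)
print(stride_order(a))  # (3, 2, 1, 0)
print(stride_order(b))  # (3, 0, 2, 1): orders differ, so the warning would fire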
- return - - argdefs, call_args, signature = self.args.python_argdefs() - uniform_stride_order = None - for arg_name in call_args: - buf = V.graph.get_buffer(arg_name) - if buf and len(buf.layout.size) == 4: - # ignore the tensor if only 1 dimension is non-zero - if len([x for x in buf.layout.size if x == 1]) == 3: - continue - stride_order = ir.get_stride_order(buf.layout.stride) - if uniform_stride_order is None: - uniform_stride_order = stride_order - elif uniform_stride_order != stride_order: - msg = yellow_text( - f"Expected stride order {uniform_stride_order}, but found stride order" - + f" {stride_order} for kernel {kernel_name}" - ) - log.warning(msg) - - stride_order_list = [ - ir.get_stride_order(V.graph.get_buffer(name).layout.stride) - if V.graph.get_buffer(name) - else None - for name in call_args - ] - size_list = [ - V.graph.get_buffer(name).layout.size - if V.graph.get_buffer(name) - else None - for name in call_args - ] - source_list = [ - "GraphInput" - if name in V.graph.graph_inputs - else "IntermediateBuffer" - if name in V.graph.name_to_buffer - else None - for name in call_args - ] - - msg = yellow_text( - f" param names {argdefs}\n buf names {call_args}\n strides {stride_order_list}" - + f"\n sizes {size_list}\n sources {source_list}\n" - ) - log.warning(msg) - return - msg = green_text( - f"All the inputs for the triton kernel {kernel_name} have uniform layout" - ) - log.warning(msg) - def create_cse_var(self, *args, **kwargs): return TritonCSEVariable(*args, **kwargs) - -class TritonScheduling(BaseScheduling): - def __init__(self, scheduler): - self.scheduler = scheduler - - def group_fn(self, sizes): - return tuple(V.graph.sizevars.simplify(sympy_product(s)) for s in sizes) - - def can_fuse(self, node1, node2): - """ - Hook called by Scheduler to determine if the Triton backend - can fuse node1 and node2. These nodes might already be - FusedSchedulerNodes. - """ - if isinstance(node1, scheduler.ForeachKernelSchedulerNode) or isinstance( - node2, scheduler.ForeachKernelSchedulerNode - ): - return scheduler.ForeachKernelSchedulerNode.can_fuse(node1, node2) - - _, (numel1, rnumel1) = node1.group - _, (numel2, rnumel2) = node2.group - why = WhyNoFuse(node1, node2) - - if node1.is_split_scan() and not node2.is_split_scan(): - if node2.is_reduction(): - why("Split scan cannot fuse with reductions") - elif node2.is_split_scan() and not node1.is_split_scan(): - if node1.is_reduction(): - why("Split scan cannot fuse with reductions") - - if node1.is_reduction() and node2.is_reduction(): - reduction_can_fuse = numel1 == numel2 and rnumel1 == rnumel2 - if not reduction_can_fuse: - why( - "numel/rnumel mismatch (reduce) (%s, %s), (%s, %s)", - numel1, - numel2, - rnumel1, - rnumel2, - ) - return reduction_can_fuse - - if not node1.is_reduction() and not node2.is_reduction(): - if not (numel1 == numel2 and rnumel1 == rnumel2): - why( - "numel/rnumel mismatch (non-reduce) (%s, %s), (%s, %s)", - numel1, - numel2, - rnumel1, - rnumel2, - ) - return False - - if node1.is_template(): - # Only allow fusion for TritonTemplates for now. - # Fusion for CUDATemplates are not supported. 
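Setting the template special case aside, the size rules can_fuse applies below reduce to a small decision table over (numel, rnumel) pairs. A hedged distillation with plain tuples in place of scheduler nodes (this simplifies away the tiling and range-compatibility checks):

def sizes_fusable(group1, group2) -> bool:
    """group = (numel, rnumel); roughly mirrors the numel checks in can_fuse."""
    (numel1, rnumel1), (numel2, rnumel2) = group1, group2
    if rnumel1 == rnumel2:
        return numel1 == numel2               # matching iteration spaces
    if rnumel1 == 1 and rnumel2 != 1:
        return numel1 == numel2 * rnumel2     # pointwise absorbed into a reduction loop
    if rnumel2 == 1 and rnumel1 != 1:
        return sizes_fusable(group2, group1)  # symmetric case, as in the code
    return False

assert sizes_fusable((4096, 1), (4096, 1))       # pointwise + pointwise
assert sizes_fusable((8192, 1), (1024, 8))       # pointwise feeding a reduction
assert not sizes_fusable((4096, 16), (4096, 8))  # mismatched reduction extents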
- is_triton_template = isinstance(node1.node, TritonTemplateBuffer) - if not is_triton_template: - why("node1 is not TritonTemplateBuffer") - return is_triton_template - - # check for a bad combined tiling - tiling1 = self.select_tiling(node1.get_nodes(), numel1, rnumel1) - tiling2 = self.select_tiling(node2.get_nodes(), numel1, rnumel1) - tiling3 = self.select_tiling( - node1.get_nodes() + node2.get_nodes(), numel1, rnumel1 - ) - if config.triton.tiling_prevents_pointwise_fusion: - cond = True - if len(tiling1) > 2: - if len(tiling2) > 2: - cond = tiling1 == tiling2 == tiling3 - else: - cond = tiling1 == tiling3 - elif len(tiling2) > 2: - cond = tiling2 == tiling3 - if not cond: - why( - "tiling mismatch (%s, %s, %s)", - tiling1, - tiling2, - tiling3, - ) - return False - - return True - - if not node1.is_reduction() and node2.is_reduction(): - assert rnumel1 == 1 and rnumel2 != 1 - if numel1 == numel2 * rnumel2: - if not all( - TritonKernel.is_compatible((numel2, rnumel2), n.get_ranges()) - for n in node1.get_nodes() - ): - why("nodes numel/rnumel incompatibility") - return False - if ( - config.triton.tiling_prevents_reduction_fusion - and not node1.is_template() - ): - is_reduction_tiling_valid = self.select_tiling( - node1.get_nodes(), numel1 - ) in ( - (numel1, 1), - (numel2, rnumel2, 1), - ) - if not is_reduction_tiling_valid: - why("invalid tiling for reduction") - return is_reduction_tiling_valid - return True - - if numel1 != numel2: - why("nodes numel incompatibility") - return numel1 == numel2 - - assert node1.is_reduction() and not node2.is_reduction() - # swap args to hit the case above - return self.can_fuse_horizontal(node2, node1) - - can_fuse_vertical = can_fuse - can_fuse_horizontal = can_fuse - - def generate_node_schedule(self, nodes, numel, rnumel): - node_schedule: List[Any] = [] - current_loop_writes: Set[str] = set() - - # Writes with a reduced shape, meaning they are only present once the - # reduction loop has ended - current_loop_reduced_writes = set() - current_loop_has_writes = False - done = set() - - def fits_in_main_body(n): - _, (node_numel, node_rnumel) = n.group - return (node_numel == numel and node_rnumel == rnumel) or ( - node_numel == numel * rnumel and node_rnumel == 1 - ) - - def fits_outside_reduction(n): - _, (node_numel, node_rnumel) = n.group - return node_numel == numel and node_rnumel == 1 and rnumel != 1 - - def schedule_node_in_loop(n): - nonlocal current_loop_has_writes - done.add(n) - node_schedule.append(n) - current_loop_has_writes = True - # A scan is modelled as a reduction in the scheduler but has a - # full sized output that can be used inside the loop body - if ( - n.is_reduction() - and isinstance(n, scheduler.SchedulerNode) - and isinstance(n.node, ir.ComputedBuffer) - and not isinstance(n.node.data, ir.Scan) - ): - current_loop_reduced_writes.add(n.get_name()) - - @contextlib.contextmanager - def end_current_reduction_loop(): - nonlocal current_loop_has_writes - if current_loop_has_writes: - # flush out any other runnable nodes to reduce number of loops - for other_node in nodes[index + 1 :]: - if ( - node not in done - and fits_in_main_body(other_node) - and not (current_loop_reduced_writes & other_node.ancestors) - ): - schedule_node_in_loop(node) - - if node_schedule and node_schedule[-1] is EnableReduction: - node_schedule.pop() - else: - node_schedule.append(DisableReduction) - yield - node_schedule.append(EnableReduction) - current_loop_reduced_writes.clear() - current_loop_has_writes = False - - for index, node in 
enumerate(nodes): - if node in done: - continue - done.add(node) - - def requires_closing_previous_reduction(node, node_schedule): - if rnumel == 1: - return False - if not current_loop_reduced_writes & node.ancestors: - return False - assert node_schedule and not isinstance( - node_schedule[-1], (EnableReduction, DisableReduction) - ) - return bool(current_loop_reduced_writes) - - if fits_in_main_body(node): - if requires_closing_previous_reduction(node, node_schedule): - with end_current_reduction_loop(): - pass # need to start a new reduction loop - - schedule_node_in_loop(node) - elif fits_outside_reduction(node): - with end_current_reduction_loop(): - node_schedule.append(node) - else: - raise NotImplementedError( - f"unexpected group: ({numel}, {rnumel}) != {node.group[1]}" - ) - - return node_schedule - - def codegen_node( - self, node: Union[scheduler.FusedSchedulerNode, scheduler.SchedulerNode] - ): - """ - Given a set of pre-fused nodes, generate a Triton kernel. - """ - - nodes: List[scheduler.SchedulerNode] = node.get_nodes() # type: ignore[assignment] - - _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group - - node_schedule = self.generate_node_schedule(nodes, numel, rnumel) - buf_accesses = collections.defaultdict(list) - for node in nodes: - for access in node.read_writes.reads | node.read_writes.writes: - buf_accesses[access.name].append(access) - - schedule_log.debug("Schedule:\n %s", node_schedule) - - return self.codegen_node_schedule(node_schedule, buf_accesses, numel, rnumel) - - @staticmethod - def reduction_hint(node): - assert node.is_reduction() - if all( - dep.is_contiguous() - for dep in itertools.chain(node.read_writes.reads, node.read_writes.writes) - ): - return ReductionHint.INNER + def codegen_iteration_ranges_entry(self, entry: IterationRangesEntry): + line = f"{entry.name} = {self.kexpr(self.rename_indexing(entry.expr))}" + if entry.root.is_loop: + self.indexing_code.writeline(line) else: - return node.node.data.reduction_hint - - @staticmethod - def can_use_32bit_indexing( - numel: sympy.Expr, buffers: Iterable[Union[ir.Buffer, ir.TensorBox]] - ) -> bool: - int_max = torch.iinfo(torch.int32).max - size_hint = V.graph.sizevars.size_hint - has_hint = V.graph.sizevars.shape_env.has_hint - - def within_32bit(e): - # Allow for unhinted e as long as we can still statically prove - # (e.g., via ValueRanges) that it is still in bounds - if V.graph.sizevars.is_expr_static_and_true(e <= int_max): - return True - # Otherwise, the hint MUST exist and be in range - return has_hint(e) and size_hint(e) <= int_max - - if not within_32bit(numel): - return False - - # Any use of a MultiOutputLayout will create a buffer with a - # Layout whose sizes are accounted for - buf_sizes = [ - buf.get_layout().storage_size() - for buf in buffers - if not isinstance(buf.get_layout(), ir.MultiOutputLayout) - ] - - if not all(within_32bit(size) for size in buf_sizes): - return False - - # Only install guards for 32-bit indexing as there is no correctness - # issue with using 64-bit for everything - V.graph.sizevars.guard_leq(numel, int_max) # type: ignore[arg-type] - for size in buf_sizes: - V.graph.sizevars.guard_leq(size, int_max) # type: ignore[arg-type] - return True - - @staticmethod - def select_index_dtype(node_schedule, numel, reduction_numel): - # Gather all used buffer names - buffer_names = set() - for node in node_schedule: - if not isinstance(node, scheduler.BaseSchedulerNode): - continue - - buffer_names.update(node.get_names()) - 
buffer_names.update(node.used_buffer_names()) - - # Get buffers objects - - def _get_buffer(name: str) -> Union[ir.Buffer, ir.TensorBox]: - buf = V.graph.get_buffer(name) - if buf is None: - raise RuntimeError(f"Failed to find buffer matching name {name}") - return buf - - buffers = [V.graph.get_buffer(name) for name in buffer_names] - - # In theory we can separately check xnumel and rnumel are <= int_max - # but some indexers do use the full linear index so we need to be - # conservative here. - total_numel = numel * reduction_numel - - if TritonScheduling.can_use_32bit_indexing(total_numel, buffers): - return "tl.int32" - return "tl.int64" - - def has_non_contiguous_pw_in_reduction_kernel(self, node_schedule, numel, rnumel): - pointwise_nodes = list( - filter( - lambda n: n not in (EnableReduction, DisableReduction) - and not n.is_reduction() - and n.group[1][0] == numel * rnumel, - node_schedule, - ) - ) - for node in pointwise_nodes: - # An index can be an integer when loading a random seed. - if not all( - not isinstance(dep, MemoryDep) - or dep.is_contiguous() - or isinstance(dep.index, (sympy.Integer, int)) - or dep.stride1_for_last_dim() - for dep in itertools.chain( - node.read_writes.reads, node.read_writes.writes - ) - ): - return True - return False - - def get_kernel_args(self, node_schedule, numel, reduction_numel): - reductions = list( - filter( - lambda n: n not in (EnableReduction, DisableReduction) - and n.is_reduction(), - node_schedule, - ) - ) - if len(reductions) > 0: - hints = [self.reduction_hint(n) for n in reductions] - if hints.count(hints[0]) == len(hints): - reduction_hint_val = hints[0] - else: - reduction_hint_val = ReductionHint.DEFAULT - - if ( - reduction_hint_val == ReductionHint.INNER - and self.has_non_contiguous_pw_in_reduction_kernel( - node_schedule, numel, reduction_numel - ) - ): - reduction_hint_val = ReductionHint.DEFAULT - else: - reduction_hint_val = ReductionHint.DEFAULT - - mutations = set() - for node in node_schedule: - if hasattr(node, "get_mutations"): - mutations.update(node.get_mutations()) + # lift non-reduction stores outside loop + self.body.writeline(line) - index_dtype = self.select_index_dtype(node_schedule, numel, reduction_numel) - return reduction_hint_val, mutations, index_dtype +class TritonScheduling(SIMDScheduling): + int32_type = "tl.int32" + int64_type = "tl.int64" + kernel_type = TritonKernel def codegen_comment(self, node_schedule): wrapper = V.graph.wrapper_code @@ -3536,123 +2181,7 @@ def codegen_comment(self, node_schedule): f"{wrapper.comment} Fused node name list: {', '.join(node_names)}" ) - def codegen_node_schedule( - self, node_schedule, buf_accesses, numel, reduction_numel - ): - from torch._inductor.codegen.triton_split_scan import TritonSplitScanKernel - - tiled_groups = self.select_tiling(node_schedule, numel, reduction_numel) - ( - reduction_hint_val, - mutations, - index_dtype, - ) = self.get_kernel_args(node_schedule, numel, reduction_numel) - - is_split_scan = any( - isinstance(node, BaseSchedulerNode) and node.is_split_scan() - for node in node_schedule - ) - kernel_type = TritonSplitScanKernel if is_split_scan else TritonKernel - kernel_args = tiled_groups - kernel_kwargs = { - "reduction_hint": reduction_hint_val, - "mutations": mutations, - "index_dtype": index_dtype, - } - kernel = kernel_type( - *kernel_args, - **kernel_kwargs, - ) - kernel.buf_accesses = buf_accesses - - self.codegen_node_schedule_with_kernel(node_schedule, kernel) - - with V.set_kernel_handler(kernel): - src_code = 
kernel.codegen_kernel() - - kernel_name = self.define_kernel(src_code, node_schedule) - log.debug("Generating kernel code with kernel_name: %s", kernel_name) - kernel.kernel_name = kernel_name - kernel.code_hash = code_hash(src_code) - - if kernel.persistent_reduction and config.triton.multi_kernel: - kernel2 = TritonKernel( - *kernel_args, - **kernel_kwargs, - disable_persistent_reduction=True, - ) - self.codegen_node_schedule_with_kernel(node_schedule, kernel2) - with V.set_kernel_handler(kernel2): - src_code2 = kernel2.codegen_kernel() - kernel_name2 = self.define_kernel(src_code2, node_schedule) - kernel2.kernel_name = kernel_name2 - kernel2.code_hash = code_hash(src_code2) - - final_kernel = MultiKernel([kernel, kernel2]) - else: - final_kernel = kernel # type: ignore[assignment] - - with V.set_kernel_handler(final_kernel): - for node in node_schedule: - if node not in (EnableReduction, DisableReduction): - node.mark_run() - - self.codegen_comment(node_schedule) - final_kernel.call_kernel(final_kernel.kernel_name) - if config.nan_asserts: - final_kernel.codegen_nan_check() - if config.warn_mix_layout: - final_kernel.warn_mix_layout(kernel_name) - - V.graph.removed_buffers |= final_kernel.removed_buffers - V.graph.inplaced_to_remove |= final_kernel.inplaced_to_remove - - if ( - V.graph.wrapper_code.supports_intermediate_hooks - and config.generate_intermediate_hooks - ): - # Not every node in the schedule will actually be live on output; - # we can't check dead buffers. - live_outs = kernel.args.live_output_buffers() - for node in node_schedule: - if not isinstance(node, scheduler.BaseSchedulerNode): - continue - name = node.get_name() - if name not in live_outs: - continue - origin_node = node.node.get_origin_node() - if origin_node is not None: - counters["inductor"]["intermediate_hooks"] += 1 - V.graph.wrapper_code.writeline( - f"run_intermediate_hooks({origin_node.name!r}, {name})" - ) - - self.scheduler.free_buffers() - - def codegen_node_schedule_with_kernel(self, node_schedule, kernel): - def current_reduction_nodes(nodes): - return itertools.takewhile(lambda n: n is not DisableReduction, nodes) - - with kernel: - stack = contextlib.ExitStack() - kernel.set_last_usage(current_reduction_nodes(node_schedule)) - - for node in node_schedule: - if node not in (EnableReduction, DisableReduction): - node.decide_inplace_update() - for i, node in enumerate(node_schedule): - if node is DisableReduction: - stack.enter_context(kernel.disable_reduction()) - elif node is EnableReduction: - stack.close() - kernel.set_last_usage(current_reduction_nodes(node_schedule[i:])) - else: - # TODO - use split ranges ? 
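The marker-driven loop above is easy to miss on first read: the schedule is a flat list in which the DisableReduction and EnableReduction sentinels open and close the reduction loop through an ExitStack. A stripped-down sketch of the same control flow, with stub classes standing in for the real scheduler types:

import contextlib

class DisableReduction: ...  # sentinel: close the reduction loop for following nodes
class EnableReduction: ...   # sentinel: following nodes are inside the reduction again

def walk_schedule(node_schedule, kernel):
    with contextlib.ExitStack() as stack:
        for node in node_schedule:
            if node is DisableReduction:
                # flushes the pending reduction loop, then codegen outside it
                stack.enter_context(kernel.disable_reduction())
            elif node is EnableReduction:
                stack.close()
            else:
                node.codegen()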
- indexing_dtype_strength_reduction(node._body) - index_vars = kernel.split_and_set_ranges(node.get_ranges()) - node.codegen(index_vars) - - def define_kernel(self, src_code, node_schedule): + def define_kernel(self, src_code, node_schedule, kernel): wrapper = V.graph.wrapper_code if src_code in wrapper.src_to_kernel: kernel_name = wrapper.src_to_kernel[src_code] @@ -3704,293 +2233,6 @@ def define_kernel(self, src_code, node_schedule): return kernel_name - def codegen_template( - self, template_node, epilogue_nodes, only_gen_src_code=False - ) -> Optional[str]: - """ - Codegen a triton template - - If `only_gen_src_code` the src code will be returned instead of codegen'd into the wrapper - """ - _, (numel, rnumel) = template_node.group - assert rnumel == 1 - kernel, render = template_node.node.make_kernel_render(template_node.node) - with kernel: - if not only_gen_src_code: - for node in [template_node, *epilogue_nodes]: - node.mark_run() - partial_code = render() - for node in epilogue_nodes: - node.codegen(kernel.split_and_set_ranges(node.get_ranges())) - - # finalize must be called after adding epilogue above - with V.set_kernel_handler(kernel): - # TODO: Maybe unify CUDATemplateKernel to also use PartialRender for flexible epilogue fusion. - src_code = ( - partial_code - if isinstance(partial_code, str) - else partial_code.finalize() - ) - node_schedule = [template_node, *epilogue_nodes] - - if config.benchmark_kernel: - num_gb = kernel.estimate_kernel_num_bytes() / 1e9 - grid_args = V.graph.sizevars.size_hints(kernel.call_sizes) - assert kernel.meta is not None, "meta is None" - grid = kernel.grid_fn(*grid_args, kernel.meta) - src_code = ( - f"{kernel.imports_for_benchmark_kernel()}\n" - f"{src_code}\n" - f"{kernel.codegen_kernel_benchmark(num_gb, grid).getvalue()}" - ) - - if only_gen_src_code: - return src_code - - kernel_name = self.define_kernel(src_code, node_schedule) - - self.codegen_comment(node_schedule) - kernel.call_kernel(kernel_name, template_node.node) - V.graph.removed_buffers |= kernel.removed_buffers - V.graph.inplaced_to_remove |= kernel.inplaced_to_remove - self.scheduler.free_buffers() - return None - - def codegen_sync(self): - V.graph.wrapper_code.writeline(V.graph.device_ops.synchronize()) - - def codegen_foreach(self, foreach_node): - from .triton_foreach import ForeachKernel - - for partitions_with_metadata in ForeachKernel.horizontal_partition( - foreach_node.get_subkernel_nodes(), self - ): - kernel = ForeachKernel() - for nodes, tiled_groups, numel, rnumel in partitions_with_metadata: - node_schedule = self.generate_node_schedule(nodes, numel, rnumel) - ( - reduction_hint_val, - mutations, - index_dtype, - ) = self.get_kernel_args(node_schedule, numel, rnumel) - - subkernel = kernel.create_sub_kernel( - *tiled_groups, - reduction_hint=reduction_hint_val, - mutations=mutations, - index_dtype=index_dtype, - ) - - self.codegen_node_schedule_with_kernel( - node_schedule, - subkernel, - ) - - with V.set_kernel_handler(subkernel): - for node in node_schedule: - if node not in (EnableReduction, DisableReduction): - node.mark_run() - V.graph.removed_buffers |= subkernel.removed_buffers - V.graph.inplaced_to_remove |= subkernel.inplaced_to_remove - - src_code = kernel.codegen_kernel() - kernel_name = self.define_kernel(src_code, [foreach_node]) - self.codegen_comment([foreach_node]) - kernel.call_kernel(V.graph.wrapper_code, kernel_name) - - self.scheduler.free_buffers() - - @staticmethod - @functools.lru_cache(32) - def candidate_tilings(node): - ranges, 
reduction_ranges = node.get_ranges() - if len(ranges) <= 1: - return () - - rw = node.pointwise_read_writes() - assert len(rw.range_vars) == len(ranges) - - # isinstance(dep, MemoryDep): this filters out StarDeps. StarDeps refer to reads - # that need to access the entire tensor; they don't contribute read indexing - # information (and practically, they don't have dep.index so they can't be used - # for stride_hints below - dep_sources = [rw.reads, rw.writes] - assert all( - isinstance(dep, (MemoryDep, StarDep)) - for dep in itertools.chain.from_iterable(dep_sources) - ) - deps = [ - dep - for dep in itertools.chain.from_iterable(dep_sources) - if dep.name not in V.graph.removed_buffers and isinstance(dep, MemoryDep) - ] - write_names = {dep.name for dep in rw.writes} - - tilings: List[CandidateTiling] = [] - - for dep in deps: - strides = V.graph.sizevars.stride_hints(dep.index, rw.range_vars) - assert len(strides) == len(ranges) - try: - split = strides.index(1) + 1 - if split == len(ranges): - continue - if all(s == 0 for s in strides[split:]): - # if this is a broadcasted tensor and all dimensions after split are broadcast, - # this is not a real split - continue - - except ValueError: - continue - tiled_groups = ( - V.graph.sizevars.simplify(sympy_product(ranges[:split])), - V.graph.sizevars.simplify(sympy_product(ranges[split:])), - ) - # score by number of elements - score = V.graph.sizevars.size_hint( - sympy_product( - size for size, stride in zip(ranges, strides) if stride != 0 - ) - ) - if dep.name in write_names: - # ngimel said contiguous writes is more important than reads - score *= 2 - if CandidateTiling.is_good_size(tiled_groups[0]): - score *= 2 - if CandidateTiling.is_good_size(tiled_groups[1]): - score *= 2 - - if ( - V.graph.sizevars.size_hint( - score - sympy_product(itertools.chain(ranges, reduction_ranges)) - ) - >= 0 - ): - tilings.append(CandidateTiling(tiled_groups, score, dep.name)) - return tilings - - @classmethod - def select_tiling(cls, node_schedule, numel, reduction_numel=sympy.Integer(1)): - """ - Heuristics to decide how to tile kernels. - Currently, we tile based on stride-1 dimensions. - - Returns: - `(tile1, tile2, reduction_numel)` s.t. `tile1 * tile2 == numel` - - """ - if reduction_numel != 1 or config.triton.max_tiles <= 1: - # TODO(jansel): should we tile reductions? - # do perf hint here if stride-1 dim is not being reduced - if perf_hint_log.level <= logging.WARNING: - for node in EnableReduction.filter(node_schedule): - if len(cls.candidate_tilings(node)) > 0: - perf_hint_log.info("reduction over non-contiguous dims") - break - return (numel, reduction_numel) - - seen_names = set() - candidate_tiles: Counter[Any] = collections.Counter() - for node in EnableReduction.filter(node_schedule): - for tiling in cls.candidate_tilings(node): - if tiling.name in seen_names: - continue - seen_names.add(tiling.name) - candidate_tiles[tiling.tiling] += tiling.score - - ranked_tilings = [tiling for tiling, score in candidate_tiles.most_common()] - - if config.triton.max_tiles >= 3: - # Consider adding a third dimension of tiling, but only - # when a1 is a multiple of b1; otherwise, you have a lot - # of stragglers which is annoying to generate code for. - # - # NB: More than three max tiles is not enabled by default. 
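The split choice in candidate_tilings above comes straight from stride hints: a tile boundary is placed immediately after the innermost contiguous (stride-1) dimension of some access. A compact sketch of that rule with concrete numbers, assuming static sizes where the real code works on sympy expressions:

import math

def tiling_from_strides(ranges, strides):
    """Return (tile1, tile2) with tile1 * tile2 == prod(ranges), or None."""
    try:
        split = strides.index(1) + 1  # dim right after the stride-1 dim
    except ValueError:
        return None                   # no contiguous dimension at all
    if split == len(ranges) or all(s == 0 for s in strides[split:]):
        return None                   # already contiguous, or only broadcast dims follow
    return math.prod(ranges[:split]), math.prod(ranges[split:])

# A transpose-like access: dim 0 is contiguous, dim 1 is strided.
print(tiling_from_strides([128, 64], [1, 128]))  # (128, 64)
# Fully contiguous access: no split is proposed.
print(tiling_from_strides([128, 64], [64, 1]))   # None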
- - # Add one 3D tiling choice - for i in range(1, len(ranked_tilings)): - a0, a1 = ranked_tilings[0] - b0, b1 = ranked_tilings[i] - if V.graph.sizevars.size_hint(a1 - b1) == 0: - continue - if V.graph.sizevars.size_hint(a1 - b1) < 0: - # swap so a0 is bigger - a0, a1 = ranked_tilings[i] - b0, b1 = ranked_tilings[0] - assert V.graph.sizevars.size_hint(a1 - b1) > 0 - if V.graph.sizevars.statically_known_multiple_of(a1, b1): - tiling = (a0, FloorDiv(a1, b1), b1) - ranked_tilings = [tiling] + ranked_tilings - break # only 1 choice for now - - if len(ranked_tilings) > 1: - perf_hint_log.info("possibly bad tiling: %s", ranked_tilings) - - for tiled_groups in ranked_tilings: - new_groups = (*tiled_groups, reduction_numel) - if all( - TritonKernel.is_compatible(new_groups, node.get_ranges()) - for node in node_schedule - if isinstance(node, scheduler.SchedulerNode) - ): - return new_groups - - return (numel, reduction_numel) - - def flush(self): - pass - - def ready_to_flush(self) -> bool: - return False - - def generate_kernel_code_from_nodes(self, nodes, benchmark_kernel=False): - @dataclasses.dataclass - class LastUsageHolder: - n: Any - last_usage: Any - - def __del__(self): - self.n.last_usage = self.last_usage - - last_usage_holders = [LastUsageHolder(n, n.last_usage) for n in nodes] - - # empty last_usage. May cause more aggressive 'evict_last'. Should be fine. - for n in nodes: - n.last_usage = set() - - if not nodes[0].is_template(): - _, (numel, rnumel) = max(nodes, key=lambda x: int(x.is_reduction())).group - node_schedule = self.generate_node_schedule(nodes, numel, rnumel) - - tiled_groups = self.select_tiling(node_schedule, numel, rnumel) - reduction_hint_val, mutations, index_dtype = self.get_kernel_args( - node_schedule, numel, rnumel - ) - - kernel = TritonKernel( - *tiled_groups, - reduction_hint=reduction_hint_val, - mutations=mutations, - index_dtype=index_dtype, - ) - - self.codegen_node_schedule_with_kernel(node_schedule, kernel) - with config.patch( - "benchmark_kernel", benchmark_kernel - ), V.set_kernel_handler(kernel): - src_code = kernel.codegen_kernel() - else: - template_node = nodes[0] - epilogue_nodes = nodes[1:] - - with config.patch("benchmark_kernel", benchmark_kernel): - src_code = self.codegen_template( - template_node, epilogue_nodes, only_gen_src_code=True - ) - - src_code = src_code.replace(str(Placeholder.KERNEL_NAME), "triton_") - return src_code - @preserve_rng_state() def benchmark_fused_nodes(self, nodes): src_code = self.generate_kernel_code_from_nodes(nodes, benchmark_kernel=True) @@ -4061,50 +2303,3 @@ def store_cache(): ) store_cache() return ms, mod.__file__ - - -@dataclasses.dataclass -class CandidateTiling: - tiling: Tuple[sympy.Expr, sympy.Expr] - score: int # higher is better - name: Optional[str] = None - - @staticmethod - def is_good_size(s): - """Somewhat arbitrary heuristic used to boost scores for some sizes""" - s = V.graph.sizevars.size_hint(s) - return s >= 32 and (s % 32 == 0) - - -class DisableReduction: - """ - Marker to invoke `kernel.disable_reduction()`. This closes a - reduction loop and allows for pointwise ops to occur on the output - of a reduction. - """ - - -class EnableReduction: - """ - Marker to end a DisableReduction block. - """ - - @staticmethod - def filter(node_schedule): - """ - Get the nodes from node_schedule skipping those in a - DisableReduction block. 
- """ - disabled = False - for node in node_schedule: - if node in (EnableReduction, DisableReduction): - # Don't tile stuff outside the main reduction loop - disabled = node is DisableReduction - elif disabled: - pass - else: - yield node - - -class CantSplit(Exception): - pass diff --git a/torch/_inductor/codegen/triton_split_scan.py b/torch/_inductor/codegen/triton_split_scan.py index 8df904946e4a..2a8e0142fbd4 100644 --- a/torch/_inductor/codegen/triton_split_scan.py +++ b/torch/_inductor/codegen/triton_split_scan.py @@ -4,12 +4,9 @@ import torch._inductor.runtime.hints from torch._inductor import config +from torch._inductor.codegen.simd import IterationRangesRoot -from torch._inductor.codegen.triton import ( - IterationRangesRoot, - triton_compute_type, - TritonKernel, -) +from torch._inductor.codegen.triton import triton_compute_type, TritonKernel from torch._prims_common import prod diff --git a/torch/_inductor/config.py b/torch/_inductor/config.py index da9bb8eb5f57..db8a6d9ae3b6 100644 --- a/torch/_inductor/config.py +++ b/torch/_inductor/config.py @@ -232,14 +232,21 @@ def is_fbcode(): force_same_precision = ( True if is_fbcode() else os.environ.get("TORCHINDUCTOR_FORCE_SAME_PRECISION") == "1" ) + # Specify candidate backends for gemm autotune. -# Possible choices are combinations of: ATen, Triton, CUTLASS, CPP. +# Possible choices are combinations of: ATen, Triton, CUTLASS. # ATen: default Pytorch ATen kernels. # Triton: Triton templates defined in torch inductor. # CUTLASS: Cutlass templates and kernels. -# CPP: CPP templates and kernels for CPU. max_autotune_gemm_backends = os.environ.get( - "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON,CPP" + "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_BACKENDS", "ATEN,TRITON" +).upper() + +# Specify the size of the search space for GEMM autotuning. +# DEFAULT - balance between compile time overhead and performance +# EXHAUSTIVE - maximize performance +max_autotune_gemm_search_space = os.environ.get( + "TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE", "DEFAULT" ).upper() # the value used as a fallback for the unbacked SymInts @@ -318,15 +325,13 @@ def is_fbcode(): benchmark_fusion = os.environ.get("TORCHINDUCTOR_BENCHMARK_FUSION") == "1" enabled_metric_tables = os.environ.get("TORCHINDUCTOR_ENABLED_METRIC_TABLES", "") -benchmark_multi_templates = ( - os.environ.get( - "TORCHINDUCTOR_BENCHMARK_MULTI_TEMPLATES", "0" if is_fbcode() else "1" - ) - == "1" +# For Triton Templates, select fastest of best template + epilogue vs best template + separate epilogue kernel +benchmark_epilogue_fusion = ( + os.environ.get("TORCHINDUCTOR_BENCHMARK_EPILOGUE_FUSION", "1") == "1" ) # Take how many of the top triton kernels to benchmark epilogue -max_epilogue_benchmarked_choices = 3 +max_epilogue_benchmarked_choices = 1 # how many nodes to allow into a single fusion max_fusion_size = 64 @@ -459,6 +464,9 @@ def decide_compile_threads(): # For user visible outputs, inductor will make sure the stride matches with eager. 
bw_outputs_user_visible = True +# Whether to always use shape padding if it is enabled and possible +force_shape_pad: bool = False + # Fx-based linear/matmul/bmm + permute/transpose vertical fusion permute_fusion = os.environ.get("TORCHINDUCTOR_PERMUTE_FUSION", "0") == "1" @@ -719,6 +727,10 @@ class aot_inductor: debug_compile = os.environ.get("AOT_INDUCTOR_DEBUG_COMPILE", "0") == "1" + debug_dump_consts_bin: bool = ( + os.environ.get("AOT_INDUCTOR_DEBUG_DUMP_CONSTS_BIN", "0") == "1" + ) + # Serialized tree spec for flattening inputs serialized_in_spec = "" diff --git a/torch/_inductor/decomposition.py b/torch/_inductor/decomposition.py index a4fd1a9191c1..01803af15260 100644 --- a/torch/_inductor/decomposition.py +++ b/torch/_inductor/decomposition.py @@ -28,7 +28,11 @@ ) from . import config, inductor_prims -from .utils import needs_fallback_due_to_atomic_add_limitations, use_scatter_fallback +from .utils import ( + is_gpu, + needs_fallback_due_to_atomic_add_limitations, + use_scatter_fallback, +) log = logging.getLogger(__name__) aten = torch.ops.aten @@ -167,7 +171,7 @@ def convolution_backward( groups, output_mask, ): - if not output_mask[2] or grad_output.device.type != "cuda": + if not output_mask[2] or not is_gpu(grad_output.device.type): return NotImplemented grad_bias = aten.sum(grad_output, [0] + list(range(2, grad_output.dim()))) grad_inp, grad_weight, _ = aten.convolution_backward( @@ -593,7 +597,7 @@ def select_decomp_table(): @register_decomposition(aten.masked_scatter) def masked_scatter(self, mask, source): - if self.device.type == "cuda": + if is_gpu(self.device.type): # This two-step algorithm is the same as eager CUDA, for eager CPU we # use a 1-shot serial iteration. self, mask = aten.broadcast_tensors([self, mask]) diff --git a/torch/_inductor/dependencies.py b/torch/_inductor/dependencies.py index 52f7238f9a88..49696b0722b2 100644 --- a/torch/_inductor/dependencies.py +++ b/torch/_inductor/dependencies.py @@ -59,9 +59,10 @@ class MemoryDep(Dep): index: sympy.Expr var_names: Tuple[sympy.Symbol, ...] size: Tuple[sympy.Expr, ...] 
+ mode: Optional[str] = None def __repr__(self): - return f"MemoryDep({self.name!r}, {self.index}, {self.ranges})" + return f"MemoryDep({self.name!r}, {self.index}, {self.ranges}, {self.mode})" def get_offset(self): """ @@ -130,7 +131,11 @@ def get_numel(self) -> sympy.Expr: def rename(self, renames: Dict[str, str]) -> "MemoryDep": if self.name in renames: return MemoryDep( - renames[self.name], self.index, var_names=self.var_names, size=self.size + renames[self.name], + self.index, + var_names=self.var_names, + size=self.size, + mode=self.mode, ) return self @@ -186,6 +191,7 @@ def is_indirect(self) -> bool: @dataclasses.dataclass(frozen=True) class StarDep(Dep): name: str + mode: Optional[str] = None # depends on the entire buffer @property @@ -197,7 +203,7 @@ def get_numel(self) -> sympy.Expr: def rename(self, renames: Dict[str, str]) -> "StarDep": if self.name in renames: - return StarDep(renames[self.name]) + return StarDep(renames[self.name], self.mode) return self def numbytes_hint(self): @@ -400,7 +406,7 @@ def load_seed(self, name: str, index: int): return self.load(name, sympy.Integer(index)) def store(self, name: str, index: sympy.Expr, value: str, mode=None) -> str: - self._writes.add(MemoryDep(name, *self.canonicalize(index))) + self._writes.add(MemoryDep(name, *self.canonicalize(index), mode=mode)) return f"store({name}, {sympy_str(index)}, {value}, {mode})" def store_reduction(self, name: str, index, value) -> str: diff --git a/torch/_inductor/fx_passes/pad_mm.py b/torch/_inductor/fx_passes/pad_mm.py index e351d38d96ec..43f7e009af83 100644 --- a/torch/_inductor/fx_passes/pad_mm.py +++ b/torch/_inductor/fx_passes/pad_mm.py @@ -1,4 +1,6 @@ import functools +import itertools +import operator from typing import List, Optional, Union import torch @@ -7,7 +9,7 @@ from torch._inductor import utils from torch._subclasses.fake_tensor import FakeTensor from torch.utils._mode_utils import no_dispatch -from torch.utils._triton import has_triton +from ...utils._triton import has_triton from ..pattern_matcher import fwd_only, gen_register_replacement, joint_fwd_bwd, Match @@ -111,32 +113,10 @@ def addmm_pattern( def should_pad_addmm(match: Match) -> bool: mat1, mat2, input = fetch_fake_tensors(match, ("mat1", "mat2", "input")) return should_pad_common(mat1, mat2, input) and should_pad_bench( - mat1, mat2, torch.ops.aten.addmm, input=input + match, mat1, mat2, torch.ops.aten.addmm, input=input ) -def addmm_replace( - input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 -) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_addmm( - input, - mat1, - mat2, - m_padded_length, - k_padded_length, - n_padded_length, - beta, - alpha, - ) - - return aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) - - def pad_addmm( input: Optional[Tensor], mat1: Tensor, @@ -146,36 +126,55 @@ def pad_addmm( n_padded_length: int, beta=1.0, alpha=1.0, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ): - # addmm decomp with padding will go through pad_addmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, 
n_padded_length, 1) - elif m_padded_length != 0: - mat1 = pad_dim(mat1, m_padded_length, 0) + # for paddings, dim order is reversed for some reasons + # and for every dim, we need to specify left and right padding + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) # the add broadcasts, so we only pad if the dimension != 1 - if input is not None and k_padded_length == 0: + if input is not None: if n_padded_length != 0: if input.dim() == 2 and input.shape[1] != 1: input = pad_dim(input, n_padded_length, 1) elif input.dim() == 1 and input.shape[0] != 1: input = pad_dim(input, n_padded_length, 0) - elif m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: + if m_padded_length != 0 and input.dim() == 2 and input.shape[0] != 1: input = pad_dim(input, m_padded_length, 0) - if k_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha) - elif n_padded_length != 0: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :, :-n_padded_length - ] - else: - return addmm_replace(input, mat1, mat2, beta=beta, alpha=alpha)[ - :-m_padded_length, : - ] + res = aten.addmm(input, mat1, mat2, beta=beta, alpha=alpha) + + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def addmm_replace( + input: Optional[Tensor], mat1: Tensor, mat2: Tensor, beta=1.0, alpha=1.0 +) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + return pad_addmm( + input, + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + beta, + alpha, + ) def is_mm_compute_bound(M: int, K: int, N: int, dtype: torch.dtype) -> bool: @@ -216,16 +215,29 @@ def get_pad_cache(): return torch._inductor.codecache.LocalCache() -def get_cached_should_pad(key): +def get_cached_should_pad(key: str) -> bool: return get_pad_cache().lookup(key) -def set_cached_should_pad(key, value): +def set_cached_should_pad(key: str, value: bool): + return get_pad_cache().set_value(key, value=value) + + +def get_cached_base_mm_benchmark_time(key: str) -> float: + return get_pad_cache().lookup(key) + + +def set_cached_base_mm_benchmark_time(key: str, value: float): return get_pad_cache().set_value(key, value=value) def should_pad_bench_key( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None + match, + mat1: Tensor, + mat2: Tensor, + op, + input: Optional[Tensor] = None, + is_base_time_key=False, ) -> str: def tensor_key(t): return (t.shape, t.stride(), t.dtype) @@ -233,44 +245,80 @@ def tensor_key(t): tf32_key = ( None if mat1.dtype != torch.float32 else torch.backends.cuda.matmul.allow_tf32 ) + + def fmt_pad(name): + if is_base_time_key: + return None + return f"exclude_pad:{should_exclude_padding_time(match, name)}" + key = ( tensor_key(mat1), tensor_key(mat2), + fmt_pad("mat1"), + fmt_pad("mat2"), op, input if input is None else tensor_key(input), tf32_key, ) - return str(key) + key = str(key) + if is_base_time_key: + key = f"base mm time: {key}" + return key -def should_pad_bench( - mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None -) -> bool: - if not has_triton(): +def get_non_view_def(node): + if 
node.op == operator.getitem: + return get_non_view_def(node.args[0]) + + if ( + node.op == "call_function" + and isinstance(node.target, torch._ops.OpOverload) + and utils.is_view(node.target) + ): + return get_non_view_def(node.all_input_nodes[0]) + + return node + + +def should_exclude_padding_time(match, arg_name): + node_def = get_non_view_def(match.kwargs[arg_name]) + + # constant padding converts tensors to contiguous, so even if the input tensor + # can be memory-planned, the layout transform is not free. TODO - find a way to pad and preserve layout? + if not fetch_fake_tensors(match, (arg_name,))[0].is_contiguous(): return False + # optimistically assume we should be able to memory plan away + # all non-inputs + return node_def.op != "placeholder" + + +def should_pad_bench( + match, mat1: Tensor, mat2: Tensor, op, input: Optional[Tensor] = None +) -> bool: do_bench = functools.partial( torch._inductor.runtime.runtime_utils.do_bench_gpu, warmup=5, ) - + m_padded_length = 0 + n_padded_length = 0 + batchsize = 1 with no_dispatch(): if op is torch.ops.aten.mm or op is torch.ops.aten.addmm: m = mat1.shape[0] k = mat1.shape[1] n = mat2.shape[1] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) elif op is torch.ops.aten.bmm: + batchsize = mat1.shape[0] m = mat1.shape[1] k = mat1.shape[2] n = mat2.shape[2] - - m_padded_length = get_padded_length(m, get_alignment_size(mat1)) k_padded_length = get_padded_length(k, get_alignment_size(mat1)) + m_padded_length = get_padded_length(m, get_alignment_size(mat1)) n_padded_length = get_padded_length(n, get_alignment_size(mat2)) else: return False @@ -278,20 +326,34 @@ def should_pad_bench( if m_padded_length == k_padded_length == n_padded_length == 0: return False + def realize_symbols(ds): + return [d if isinstance(d, int) else d.node.hint for d in ds] + + if any( + dim == 0 + for dim in itertools.chain( + realize_symbols(mat1.shape), realize_symbols(mat2.shape) + ) + ): + return False + + if torch._inductor.config.force_shape_pad: + return True + + if not has_triton(): + return False + if not is_mm_compute_bound(m, k, n, mat1.dtype): return False # We don't want to look up the cache for cases that are trivially false # since it does file io - key = should_pad_bench_key(mat1, mat2, op, input) + key = should_pad_bench_key(match, mat1, mat2, op, input) cached_pad = get_cached_should_pad(key) if cached_pad is not None: return cached_pad - def realize_symbols(ds): - return [d if isinstance(d, int) else d.node.hint for d in ds] - def realize_tensor(t): if isinstance(t, FakeTensor): size_hints = realize_symbols(t.size()) @@ -306,19 +368,48 @@ def realize_tensor(t): mat1 = realize_tensor(mat1) mat2 = realize_tensor(mat2) - if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: - ori_time = do_bench( - lambda: op(mat1, mat2), - ) - else: - if input is not None: - input = realize_tensor(input) - ori_time = do_bench( - lambda: op(input, mat1, mat2), + + # since we key on whether or not the inputs can be memory planned, set cache for the + # original time which is unaffected by whether or not the input can be planned + ori_time_key = should_pad_bench_key( + match, mat1, mat2, op, input, is_base_time_key=True + ) + ori_time = get_cached_base_mm_benchmark_time(ori_time_key) + if ori_time is None: + if op is torch.ops.aten.bmm or op is torch.ops.aten.mm: + ori_time =
do_bench( + lambda: op(mat1, mat2), + ) + else: + if input is not None: + # realize bias for addmm + input = realize_tensor(input) + ori_time = do_bench( + lambda: op(input, mat1, mat2), + ) + set_cached_base_mm_benchmark_time(ori_time_key, ori_time) + + mat1_pad = mat1 + mat2_pad = mat2 + + is_bmm = op is torch.ops.aten.bmm + mat1_pre_padded = should_exclude_padding_time(match, "mat1") + if mat1_pre_padded: + mat1_pad = pad_mat1( + mat1_pad, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=is_bmm, ) - mat1_pad = torch.randn_like(mat1) - mat2_pad = torch.randn_like(mat2) + mat2_pre_padded = should_exclude_padding_time(match, "mat2") + if mat2_pre_padded: + mat2_pad = pad_mat2( + mat2_pad, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=is_bmm, + ) if op is torch.ops.aten.addmm: input_pad = None @@ -332,6 +423,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) elif op is torch.ops.aten.mm: @@ -342,6 +435,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) else: @@ -352,6 +447,8 @@ def realize_tensor(t): m_padded_length, k_padded_length, n_padded_length, + mat1_pre_padded=mat1_pre_padded, + mat2_pre_padded=mat2_pre_padded, ), ) @@ -371,16 +468,29 @@ def mm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_mm(match: Match) -> bool: mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.mm + match, mat1, mat2, torch.ops.aten.mm ) -def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) +def pad_mat1(mat1, *, m_padded_length, k_padded_length, is_bmm=False): + if k_padded_length != 0 or m_padded_length != 0: + # dim order is reversed for constant_pad_nd (last dim first); for every dim we specify left and right padding + pad_arg = [0, k_padded_length, 0, m_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat1, pad_arg) + return mat1 - return pad_mm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) + +def pad_mat2(mat2, *, k_padded_length, n_padded_length, is_bmm=False): + if k_padded_length != 0 or n_padded_length != 0: + # dim order is reversed for constant_pad_nd (last dim first); for every dim we specify left and right padding + pad_arg = [0, n_padded_length, 0, k_padded_length] + if is_bmm: + pad_arg.extend((0, 0)) + return aten.constant_pad_nd(mat2, pad_arg) + else: + return mat2 def pad_mm( mat1: Tensor, mat2: Tensor, m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # mm_replace will go through pad_mm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 1) - mat2 = pad_dim(mat2, k_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 1) - return torch.ops.aten.mm(mat1, mat2)[:, :-n_padded_length] - else: - mat1 = pad_dim(mat1, m_padded_length, 0) - return torch.ops.aten.mm(mat1, mat2)[:-m_padded_length, :] + if not mat1_pre_padded: + mat1
= pad_mat1( + mat1, m_padded_length=m_padded_length, k_padded_length=k_padded_length + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, k_padded_length=k_padded_length, n_padded_length=n_padded_length + ) + res = aten.mm(mat1, mat2) + if m_padded_length != 0: + res = res[:-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :-n_padded_length] + return res + + +def mm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + m_padded_length = get_padded_length(mat1.shape[0], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[1], get_alignment_size(mat2)) + return pad_mm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: @@ -410,40 +538,52 @@ def bmm_pattern(mat1: Tensor, mat2: Tensor) -> Tensor: def should_pad_bmm(match: Match) -> bool: mat1, mat2 = fetch_fake_tensors(match, ("mat1", "mat2")) return should_pad_common(mat1, mat2) and should_pad_bench( - mat1, mat2, torch.ops.aten.bmm + match, mat1, mat2, torch.ops.aten.bmm ) -def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: - m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) - k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) - n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) - - if m_padded_length != 0 or k_padded_length != 0 or n_padded_length != 0: - return pad_bmm(mat1, mat2, m_padded_length, k_padded_length, n_padded_length) - - return aten.bmm(mat1, mat2) - - def pad_bmm( mat1: Tensor, mat2: Tensor, m_padded_length: int, k_padded_length: int, n_padded_length: int, + mat1_pre_padded: bool = False, + mat2_pre_padded: bool = False, ) -> Tensor: - # bmm_replace will go through pad_bmm multiple times if multiple dimensions are needed to be padded - if k_padded_length != 0: - mat1 = pad_dim(mat1, k_padded_length, 2) - mat2 = pad_dim(mat2, k_padded_length, 1) - - return aten.bmm(mat1, mat2) - elif n_padded_length != 0: - mat2 = pad_dim(mat2, n_padded_length, 2) - return aten.bmm(mat1, mat2)[:, :, :-n_padded_length].contiguous() - else: - mat1 = pad_dim(mat1, m_padded_length, 1) - return aten.bmm(mat1, mat2)[:, :-m_padded_length, :].contiguous() + if not mat1_pre_padded: + mat1 = pad_mat1( + mat1, + m_padded_length=m_padded_length, + k_padded_length=k_padded_length, + is_bmm=True, + ) + if not mat2_pre_padded: + mat2 = pad_mat2( + mat2, + k_padded_length=k_padded_length, + n_padded_length=n_padded_length, + is_bmm=True, + ) + res = aten.bmm(mat1, mat2) + if m_padded_length != 0: + res = res[:, :-m_padded_length, :] + if n_padded_length != 0: + res = res[:, :, :-n_padded_length] + return res + + +def bmm_replace(mat1: Tensor, mat2: Tensor) -> Tensor: + k_padded_length = get_padded_length(mat1.shape[2], get_alignment_size(mat1)) + n_padded_length = get_padded_length(mat2.shape[2], get_alignment_size(mat2)) + m_padded_length = get_padded_length(mat1.shape[1], get_alignment_size(mat1)) + return pad_bmm( + mat1, + mat2, + m_padded_length, + k_padded_length, + n_padded_length, + ) @functools.lru_cache(None) diff --git a/torch/_inductor/fx_passes/quantization.py b/torch/_inductor/fx_passes/quantization.py index 0d4fc3b42933..4476a9ccd512 100644 --- a/torch/_inductor/fx_passes/quantization.py +++ b/torch/_inductor/fx_passes/quantization.py @@ -224,17 +224,26 @@ def generate_pattern_with_binary( binary_post_op, computation_call, extra_input_pattern, - 
int8_mixed_bf16_with_inplace_add=False, + dtype_convert=False, + swap_inputs=False, ): - binary_pattern = CallFunction( - binary_post_op, - computation_call, - extra_input_pattern, + binary_pattern = ( + CallFunction( + binary_post_op, + extra_input_pattern, + computation_call, + ) + if swap_inputs + else CallFunction( + binary_post_op, + computation_call, + extra_input_pattern, + ) ) return _may_generate_pattern_with_dtype_convert( binary_pattern, KeywordArg("convert_dtype_after_inplace_add"), - int8_mixed_bf16_with_inplace_add, + dtype_convert, ) @@ -435,10 +444,109 @@ def qlinear(match: Match, *args, **kwargs): return qlinear -def _is_valid_quantized_conv_binary_optimization_pattern(): - # Check if it's a valid Conv Binary Pattern: - # * qconv2d_pointwise should only has one users - # * Extra input of binary node comes from dequant pattern +def _register_quantized_linear_binary_lowering( + pattern, + pass_number, + computation_op, + binary_unary_attr, +): + @register_lowering_pattern( + pattern, + extra_check=_is_valid_qlinear_binary_optimization_pattern(), + pass_number=pass_number, + ) + def qlinear_binary(match: Match, *args, **kwargs): + output_dtype = _get_pattern_output_dtype(match) + # Activation QParams + x, x_scale, x_zp = ( + kwargs["x"], + kwargs["x_scale"], + kwargs["x_zp"], + ) + x2 = ( + kwargs["accum"] + if binary_unary_attr.binary_op_name == "sum" + else kwargs["other"] + ) + x2_scale = 1.0 + x2_zp = 0 + # Weight QParams + packed_weight, w_scale, w_zp = ( + kwargs["packed_weight"], + kwargs["w_scale"], + kwargs["w_zp"], + ) + # bias + b = kwargs["b"] if "b" in kwargs else None + # Output QParams + o_inv_scale = kwargs["o_inv_scale"] if output_dtype is None else 1.0 + o_zero_point = kwargs["o_zp"] if output_dtype is None else 0 + + x2.realize() + from .mkldnn_fusion import _can_be_inplace + + if binary_unary_attr.binary_op_name == "sum": + assert _can_be_inplace( + x2 + ), "QLinear Binary Inplace Fusion requires accum is not an alias or mutation." 
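
(An aside on the "sum" assert above: a minimal eager-mode sketch of the difference between the two binary post-ops this lowering handles. The tensor names here are invented for illustration; the real lowering dispatches to the fused onednn kernels, not to these eager calls.)

    import torch

    linear_out = torch.randn(4, 8)  # stands in for the qlinear output
    extra = torch.randn(4, 8)       # stands in for the binary extra input

    # "add" post-op: out-of-place, the result lands in a fresh buffer
    added = linear_out + extra

    # "sum" post-op: accumulates into the extra input in place, which is why the
    # lowering asserts that accum is not an alias or mutation and later returns
    # the mutated input (packed.inputs[-1]) as the op's output
    extra.add_(linear_out)
    assert torch.equal(added, extra)
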
+ + # if the binary post-op is sum but the output dtype is not the same as accum's, + # use accum's dtype as the output dtype + out_dtype = output_dtype + if ( + output_dtype + and binary_unary_attr.binary_op_name == "sum" + and output_dtype != x2.dtype + ): + out_dtype = x2.dtype + + computation_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + b, + o_inv_scale, + o_zero_point, + out_dtype, + x2, + x2_scale, + x2_zp, + binary_unary_attr.binary_op_name, + binary_unary_attr.alpha, + binary_unary_attr.unary_op_name, + binary_unary_attr.scalars_attr, + binary_unary_attr.algorithm_attr, + ) + counters["inductor"]["qlinear_binary_matcher_count"] += 1 + counters["inductor"]["qlinear_binary_matcher_nodes"] += len(match.nodes) + return L[computation_op](*computation_args) + + return qlinear_binary + + +def _is_valid_qconv_binary_optimization_pattern(): + return _is_valid_quantized_op_binary_optimization_pattern( + torch.ops.onednn.qconv2d_pointwise + ) + + +def _is_valid_qlinear_binary_optimization_pattern(): + return _is_valid_quantized_op_binary_optimization_pattern( + torch.ops.onednn.qlinear_pointwise, + # we don't insert q-dq for extra input due to accuracy issues + extra_input_from_dequant=False, + ) + + +def _is_valid_quantized_op_binary_optimization_pattern( + qop, extra_input_from_dequant=True +): + # Check if it's a valid Binary Pattern for qconv2d and qlinear: + # * qop_pointwise should only have one user + # * If extra_input_from_dequant is True, extra input of binary node should come from dequant pattern # * the two inputs of binary node should have attribute "meta" and should be tensors # * the two inputs of binary node should have the same shape # * All users of the extra input in this pattern should be @@ -446,8 +554,8 @@ def _is_valid_quantized_conv_binary_optimization_pattern(): # connected to the compute node.
def fn(match): output_dtype = _get_pattern_output_dtype(match) - compute_node = filter_nodes(match.nodes, torch.ops.onednn.qconv2d_pointwise)[0] - # qconv2d_pointwise should only have one user + compute_node = filter_nodes(match.nodes, qop)[0] + # qop_pointwise should only have one user if len(compute_node.users) != 1: return False binary_node_inputs = next(iter(compute_node.users)).args @@ -460,9 +568,12 @@ def fn(match): break assert extra_input_of_binary_node is not None # Extra input of binary node comes from dequant pattern - if (not isinstance(extra_input_of_binary_node, torch.fx.Node)) or ( - extra_input_of_binary_node.target - != quantized_decomposed.dequantize_per_tensor.default + if extra_input_from_dequant and ( + (not isinstance(extra_input_of_binary_node, torch.fx.Node)) + or ( + extra_input_of_binary_node.target + != quantized_decomposed.dequantize_per_tensor.default + ) ): return False @@ -489,9 +600,13 @@ def fn(match): from .mkldnn_fusion import _get_remaining_users extra_input_of_pattern = ( - match.kwargs["accum"] - if output_dtype is None - else match.kwargs["accum_after_dequant"] + match.kwargs["other"] + if "other" in match.kwargs + else ( + match.kwargs["accum"] + if output_dtype is None or (not extra_input_from_dequant) + else match.kwargs["accum_after_dequant"] + ) ) if ( len( @@ -517,7 +632,7 @@ def _register_quantized_conv_binary_lowering( ): @register_lowering_pattern( pattern, - extra_check=_is_valid_quantized_conv_binary_optimization_pattern(), + extra_check=_is_valid_qconv_binary_optimization_pattern(), pass_number=pass_number, ) def qconv_binary(match: Match, *args, **kwargs): @@ -884,6 +999,228 @@ def __init__( binary_unary_attr, # binary_unary_attr ) + # QLinear + r""" + Supported linear-binary(-unary) patterns + + linear(X) extra input + \ / + Add + | + Optional(relu) + | + Y + + 1. int8-mixed-fp32 + +---+---------------+-----------+------------------------------+---------+ + | # | Add type | Quant out | Pattern | Post op | + +---+---------------+-----------+------------------------------+---------+ + | 1 | In-/out-place | Yes | linear + fp32 -> (relu) -> q | add | + +---+---------------+-----------+------------------------------+---------+ + | 2 | In-/out-place | No | linear + fp32 -> (relu) | sum | + +---+---------------+-----------+------------------------------+---------+ + + 2. 
int8-mixed-bf16 + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | # | X2 dtype | Add type | Quant out | Pattern | Post op | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 1 | BF16 | In-/out-place | Yes | linear + bf16 -> (relu) -> q | add | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 2 | BF16 | In-/out-place | No | linear + bf16 -> (relu) | sum | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 3 | FP32 | Out-place | Yes | linear + fp32 -> (relu) -> q | add | + | | | In-place right| | | | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 4 | FP32 | Out-place | No | linear + fp32 -> (relu) | sum | + | | | In-place right| | | | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 5 | FP32 | In-place left | Yes | linear + fp32 -> to_bf16 -> (relu) -> q | add | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + | 6 | FP32 | In-place left | No | linear + fp32 -> to_bf16 -> (relu) | add | + +---+----------+---------------+-----------+-----------------------------------------+---------+ + + Note + (1) The positions of linear and the extra input can be swapped. + (2) we don't insert q-dq before the extra input of linear-add by recipe. But if q-dq is found at the + extra input, we don't match that pattern because we cannot match all these patterns in 3 passes. + """ + for x_scale_zp_are_tensors in (False, True): + qlinear_binary_op = ( + torch.ops.onednn.qlinear_pointwise.binary_tensor + if x_scale_zp_are_tensors + else torch.ops.onednn.qlinear_pointwise.binary + ) + unary_postop_list = ["none", "relu"] + unary_postop_dict = { + "none": None, + "relu": aten.relu.default, + } + convert_dtype_after_binary_list = [False, True] + + # Priority 1 to match: QLinear Binary or Binary-Unary pattern with int8 output + # Covers case (1) of int8-mixed-fp32 and case (1)(3)(5) of int8-mixed-bf16, + # totally 3 patterns (2 are identical) + swap_binary_inputs_list = [False, True] + int8_mixed_bf16_list = [False, True] + combinations = itertools.product( + unary_postop_list, + int8_mixed_bf16_list, + swap_binary_inputs_list, + convert_dtype_after_binary_list, + ) + qlinear_binary_replace_patterns = {} + for unary_op, int8_mixed_bf16, swap_inputs, cvt_dtype_binary in combinations: + if not int8_mixed_bf16 and cvt_dtype_binary: + # No convert node after binary node if dtypes are all fp32 + continue + qlinear_binary_replace_patterns.update( + { + BinaryUnaryAttr( + "add", 1.0, unary_op, [], "" + ): generate_pattern_with_output_quant( + generate_pattern_with_unary( + generate_pattern_with_binary( + aten.add.Tensor, + get_qlinear_pt2e_pattern(x_scale_zp_are_tensors), + KeywordArg("other"), + # If fp32 extra input is inplace added to bf16 linear output, + # a to_bf16 node is inserted after binary + dtype_convert=cvt_dtype_binary, + swap_inputs=swap_inputs, + ), + unary_postop_dict[unary_op], + ), + ) + } + ) + for binary_unary_attr, patterns in qlinear_binary_replace_patterns.items(): + _register_quantized_linear_binary_lowering( + patterns, + 0, # pass_number + qlinear_binary_op, # computation_op + binary_unary_attr, # binary_unary_attr + ) + + # Priority 2.1 to match: QLinear Binary-Unary pattern with fp32/bfloat16 output + # Covers case (2) of 
int8-mixed-fp32 and case (2)(4) of int8-mixed-bf16, + # totally 2 patterns (2 are identical) + binary_replace_float_out_patterns = {} + for swap_binary_inputs in swap_binary_inputs_list: + binary_replace_float_out_patterns.update( + { + BinaryUnaryAttr( + "sum", 1.0, "relu", [], "" + ): generate_pattern_with_unary( + generate_pattern_with_binary( + aten.add.Tensor, + get_qlinear_pt2e_pattern(x_scale_zp_are_tensors), + KeywordArg("accum"), + dtype_convert=False, + swap_inputs=swap_binary_inputs, + ), + aten.relu.default, + ), + } + ) + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + _register_quantized_linear_binary_lowering( + patterns, + 1, # pass_number + qlinear_binary_op, # computation_op + binary_unary_attr, + ) + # Priority 2.2 to match: QLinear Binary-Unary pattern with fp32/bfloat16 output + # Covers case (6) of int8-mixed-bf16 + binary_replace_float_out_patterns = {} + for swap_binary_inputs in swap_binary_inputs_list: + binary_replace_float_out_patterns.update( + { + BinaryUnaryAttr( + "add", 1.0, "relu", [], "" + ): generate_pattern_with_unary( + generate_pattern_with_binary( + aten.add.Tensor, + get_qlinear_pt2e_pattern(x_scale_zp_are_tensors), + KeywordArg("other"), + dtype_convert=True, + swap_inputs=swap_binary_inputs, + ), + aten.relu.default, + ), + } + ) + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + _register_quantized_linear_binary_lowering( + patterns, + 1, # pass_number + qlinear_binary_op, # computation_op + binary_unary_attr, + ) + + # Priority 3.1: QLinear Binary pattern with fp32/bfloat16 output + # Covers case (2) of int8-mixed-fp32 and case (2)(4) of int8-mixed-bf16, + # totally 2 patterns (2 are identical) + binary_replace_float_out_patterns = {} + for swap_binary_inputs in swap_binary_inputs_list: + binary_replace_float_out_patterns.update( + { + BinaryUnaryAttr( + "sum", 1.0, "none", [], "" + ): generate_pattern_with_binary( + aten.add.Tensor, + get_qlinear_pt2e_pattern(x_scale_zp_are_tensors), + KeywordArg("accum"), + dtype_convert=False, + swap_inputs=swap_binary_inputs, + ), + } + ) + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + _register_quantized_linear_binary_lowering( + patterns, + 2, # pass_number + qlinear_binary_op, # computation_op + # Output dtype should be the same as accum's dtype but we don't know + # its dtype. 
So, leave it to be determined in the lowering function + binary_unary_attr, + ) + # Priority 3.2: QLinear Binary pattern with fp32/bfloat16 output + # Covers (6) of int8-mixed-bf16 + binary_replace_float_out_patterns = {} + for swap_binary_inputs in swap_binary_inputs_list: + binary_replace_float_out_patterns.update( + { + BinaryUnaryAttr( + "add", 1.0, "none", [], "" + ): generate_pattern_with_binary( + aten.add.Tensor, + get_qlinear_pt2e_pattern(x_scale_zp_are_tensors), + KeywordArg("other"), + dtype_convert=True, + swap_inputs=swap_binary_inputs, + ), + } + ) + for ( + binary_unary_attr, + patterns, + ) in binary_replace_float_out_patterns.items(): + _register_quantized_linear_binary_lowering( + patterns, + 2, # pass_number + qlinear_binary_op, # computation_op + binary_unary_attr, + ) + def _is_valid_quantized_maxpool2d_optimization_pattern(): def fn(match): diff --git a/torch/_inductor/graph.py b/torch/_inductor/graph.py index 672509b20a56..bfb7b8dea7eb 100644 --- a/torch/_inductor/graph.py +++ b/torch/_inductor/graph.py @@ -1303,6 +1303,8 @@ def debug(msg): torch.ops.aten.mkldnn_rnn_layer.default, torch.ops.onednn.qlinear_pointwise.default, torch.ops.onednn.qlinear_pointwise.tensor, + torch.ops.onednn.qlinear_pointwise.binary, + torch.ops.onednn.qlinear_pointwise.binary_tensor, ] need_fixed_channels_last_layout += [ torch.ops.mkldnn._convolution_pointwise.default, diff --git a/torch/_inductor/ir.py b/torch/_inductor/ir.py index b4cf3bca42e5..689877ba6928 100644 --- a/torch/_inductor/ir.py +++ b/torch/_inductor/ir.py @@ -3595,7 +3595,10 @@ def __init__( self.mutated_inputs = mutated_inputs if mutated_inputs is not None: # Ensure that the mutated inputs are only allowed for certain nodes - allowed_set = {torch.ops.higher_order.flex_attention} + allowed_set = { + torch.ops.higher_order.flex_attention, + torch.ops.higher_order.flex_attention_backward, + } current_node = V.graph.current_node.target assert ( current_node in allowed_set @@ -3718,13 +3721,6 @@ def get_workspace_size(self): return self.workspace_size if self.workspace_size is not None else 0 -class CppTemplateBuffer(TemplateBuffer): - def __init__(self, layout, inputs, make_kernel_render, template, choice): - super().__init__(layout, inputs, make_kernel_render) - self.template = template - self.choice = choice - - @dataclasses.dataclass class InputsKernel(Buffer): inputs: List[Buffer] @@ -3930,6 +3926,21 @@ def should_allocate(self): return True +def get_aten_cpp_kernel_name(kernel): + # Calling with the default kernel name can lead to ambiguous behavior like the following example. + # repeat_interleave(const at::Tensor & repeats, c10::optional<int64_t> output_size=c10::nullopt) + # repeat_interleave(const at::Tensor & self, int64_t repeats, + # c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> output_size=c10::nullopt) + if not isinstance(kernel, torch._ops.OpOverload) or kernel.namespace != "aten": + return None + opname = ( + kernel.__name__.split(".")[0] + if kernel._overloadname == "default" + else kernel.__name__.replace(".", "_") + ) + return f"at::_ops::{opname}::call" + + @dataclasses.dataclass class ExternKernel(InputsKernel): constant_args: Tuple[Any, ...]
= () @@ -3973,7 +3984,8 @@ def __init__( self.kwargs = kwargs if kwargs else {} self.output_view = output_view self.python_kernel_name = python_kernel_name - self.cpp_kernel_name = cpp_kernel_name + # If cpp_kernel_name is None, we will try to construct it from op_overload + self.cpp_kernel_name = cpp_kernel_name or get_aten_cpp_kernel_name(op_overload) self.ordered_kwargs_for_cpp_kernel = ordered_kwargs_for_cpp_kernel self.op_overload = op_overload self.collect_arg_kwarg_properties() @@ -4016,6 +4028,40 @@ def collect_arg_kwarg_properties(self): else {} ) + def fill_non_provided_args(self, args, kwargs, convert_val_to_str=False): + # Previously, we want to maintain forward-compatibility by skipping + # default args in the serialized artifacts in fbcode. However, + # some of our shim interfaces require default values being set. + # Discussed with Sherlock offline and we decided to allow serializing + # default args into the C++ wrapper code for now. We will refine this + # part if we see real FC requirement. More details related to FC + # can be found at: + # https://docs.google.com/document/d/1FzWm-sHYwmRi3x_g036kOxd99KaYquUsA-L5JwOn8ys/edit?usp=sharing + assert isinstance(args, (list, tuple)) + if isinstance(args, tuple): + args = list(args) + assert self.arg_properties, "ExternKernel.arg_properties should not be empty" + + n_args = len(args) + n_pos_args = len(self.arg_properties) + # For cpp wrapper, if some positional args are not provided, we need to check + # if they're in the kwargs or use their default value + if n_args < n_pos_args: + log.debug( + "%s has %d unprovided positional arguments. " + "Will check if they are in the keyword arguments or will use default values.", + self.op_overload, + n_pos_args - n_args, + ) + for i in range(n_args, n_pos_args): + arg_name = self.arg_properties[i]["name"] + args.append( + kwargs[arg_name] + if arg_name in kwargs + else self.arg_properties[i]["default_value"] + ) + return args + def decide_layout(self): if isinstance(self.layout, FlexibleLayout): self.apply_constraint() @@ -4030,7 +4076,15 @@ def codegen(self, wrapper): raise NotImplementedError def get_kernel_name(self): - return self.cpp_kernel_name if V.graph.cpp_wrapper else self.python_kernel_name + return ( + ( + V.graph.wrapper_code.get_c_shim_func_name(self.cpp_kernel_name) # type: ignore[attr-defined] + if config.abi_compatible + else self.cpp_kernel_name + ) + if V.graph.cpp_wrapper + else self.python_kernel_name + ) @staticmethod def copy_input(x): @@ -4726,9 +4780,17 @@ class InplaceBernoulliFallback(ExternKernel): def codegen(self, wrapper): (x,) = (t.codegen_reference() for t in self.inputs) - wrapper.writeline( - f"{self.get_kernel_name()}({x}, {', '.join(map(repr, self.constant_args))}){wrapper.ending}" - ) + + if V.graph.cpp_wrapper and config.abi_compatible: + # Inductor doesn't really support aten Generator, so the Generator kwarg is always NULL here, + # which needs to be explicitly generated for cpp wrapper + wrapper.writeline( + f"{self.get_kernel_name()}({x}, {', '.join(map(repr, self.constant_args))}, NULL){wrapper.ending}" + ) + else: + wrapper.writeline( + f"{self.get_kernel_name()}({x}, {', '.join(map(repr, self.constant_args))}){wrapper.ending}" + ) def should_allocate(self): return False @@ -4739,20 +4801,19 @@ def get_mutation_names(self): def get_unbacked_symbol_defs(self) -> Set[sympy.Symbol]: return set() - def __init__(self, x, *constant_args): + def __init__(self, op_overload, x, *constant_args): super().__init__( None, NoneLayout(x.get_device()), # 
type: ignore[arg-type] self.unwrap_storage([x]), constant_args, + op_overload=op_overload, ) self.name = V.graph.register_buffer(self) self.python_kernel_name = "aten.bernoulli_" - self.cpp_kernel_name = ( - "aoti_torch_bernoulli_" - if config.abi_compatible - else "at::native::bernoulli_" - ) + if not config.abi_compatible: + # TODO: this should be simplified once we switch to ABI-compatible only + self.cpp_kernel_name = "at::native::bernoulli_" mark_node_as_mutating(self, x) @@ -5128,25 +5189,7 @@ class ExternKernelNode: } -def get_aten_cpp_kernel_name(kernel): - # Calling with the default kernel name can lead to ambiguous behavior like the following example. - # repeat_interleave(const at::Tensor & repeats, c10::optional<int64_t> output_size=c10::nullopt) - # repeat_interleave(const at::Tensor & self, int64_t repeats, - # c10::optional<int64_t> dim=c10::nullopt, c10::optional<int64_t> output_size=c10::nullopt) - assert ( - isinstance(kernel, torch._ops.OpOverload) and kernel.namespace == "aten" - ), "Invalid aten kernel" - opname = ( - kernel.__name__.split(".")[0] - if kernel._overloadname == "default" - else kernel.__name__.replace(".", "_") - ) - return f"at::_ops::{opname}::call" - - class FallbackKernel(ExternKernelAlloc): - args_default_value: List[Dict[str, Any]] - def __init__( self, layout, @@ -5158,12 +5201,23 @@ def __init__( *, unbacked_bindings=None, ): + if ( + kernel == aten.mul.Tensor + and len(tensor_args) == 1 + and len(nontensor_args) == 1 + ): + # When aten.mul.Tensor's second arg is constant, cpp wrapper expects + # to call mul_Scalar. A cleaner fix is to do it in decomposition. + # See https://github.com/pytorch/pytorch/issues/123478 + kernel = aten.mul.Scalar + super().__init__( layout, tuple(tensor_args), tuple(nontensor_args), op_overload=kernel, ) + + # We need output buffers for generating kernel arguments in the # abi-compatible mode, where we retrieve outputs by passing each individual # output through the abi-compatible interface.
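
(Since get_aten_cpp_kernel_name now returns None for non-aten ops instead of asserting, callers such as ExternKernel.__init__ can use it as a best-effort default. Below is a self-contained restatement of the mapping with a couple of concrete overloads; the wrapper name is ours, the logic is the helper's.)

    import torch

    def aten_cpp_kernel_name(kernel):
        # Same logic as get_aten_cpp_kernel_name above: non-aten ops yield None;
        # a "default" overload keeps the base op name, any other overload has
        # its name fused in with an underscore.
        if not isinstance(kernel, torch._ops.OpOverload) or kernel.namespace != "aten":
            return None
        opname = (
            kernel.__name__.split(".")[0]
            if kernel._overloadname == "default"
            else kernel.__name__.replace(".", "_")
        )
        return f"at::_ops::{opname}::call"

    print(aten_cpp_kernel_name(torch.ops.aten.relu.default))  # at::_ops::relu::call
    print(aten_cpp_kernel_name(torch.ops.aten.add.Tensor))    # at::_ops::add_Tensor::call
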
@@ -5179,7 +5233,6 @@ def __init__( ), ), f"Fails to create FallbackKernel for {kernel}: {type(kernel)} not supported" self.op_overload = kernel - self.unflatten_args = unflatten_args self.kwargs = {} if kwargs is None else kwargs V.graph.warn_fallback(self.python_kernel_name) @@ -5341,41 +5394,6 @@ def is_not_write(arg): self.cpp_kernel_key = f"{self.cpp_kernel_name.replace('::', '_')}_{self.cpp_kernel_overload_name}" # type: ignore[union-attr] self.cpp_op_schema = get_cpp_op_schema(kernel) - self.init_args_default_value(kernel._schema) - - def init_args_default_value(self, schema): - self.args_default_value = [ - { - "name": x.name, - "type": x.real_type, - "value": x.default_value, - } - for x in schema.arguments - if not x.kwarg_only - ] - - def get_pos_arg_value(self, pos, kwargs): - # positional args may be provided in kwargs - pos_arg_name = self.args_default_value[pos]["name"] - if pos_arg_name in kwargs: - log.debug( - "Found argument %s with value %s from kwargs", - pos_arg_name, - kwargs[pos_arg_name], - ) - return kwargs[pos_arg_name] - - assert hasattr( - self, "args_default_value" - ), "self.args_default_value has to be provided" - assert pos < len( - self.args_default_value - ), f"expected the index {pos} to be smaller than len(self.args_default_value): {len(self.args_default_value)}" - arg_default_value = self.args_default_value[pos]["value"] - log.debug( - "Use default value %s for argument %s", arg_default_value, pos_arg_name - ) - return arg_default_value def codegen_args(self): @dataclasses.dataclass @@ -5388,6 +5406,7 @@ def __repr__(self): tensor_args = [Shim(x.codegen_reference()) for x in self.inputs] args, kwargs = self.unflatten_args(tensor_args, self.constant_args) if V.graph.cpp_wrapper and isinstance(self.op_overload, torch._ops.OpOverload): + args = self.fill_non_provided_args(args, kwargs) args = [ V.graph.wrapper_code.val_to_cpp_arg_str(param.real_type, x) for param, x in zip(self.op_overload._schema.arguments, args) @@ -5395,17 +5414,6 @@ def __repr__(self): else: args = [V.graph.wrapper_code.val_to_arg_str(x) for x in args] - # Previously, we want to maintain forward-compatibility by skipping - # default args in the serialized artifacts in fbcode. However, - # some of our shim interfaces require default values being set. - # Discussed with Sherlock offline and we decided to allow serializing - # default args into the C++ wrapper code for now. We will refine this - # part if we see real FC requirement. More details related to FC - # can be found at: - # https://docs.google.com/document/d/1FzWm-sHYwmRi3x_g036kOxd99KaYquUsA-L5JwOn8ys/edit?usp=sharing - if V.graph.cpp_wrapper and hasattr(self, "args_default_value"): - self.fill_non_provided_args(args, kwargs, convert_val_to_str=True) - # let self.codegen_kwargs handle kwargs self.kwargs.update(kwargs) return args @@ -5441,30 +5449,6 @@ def get_mutation_names(self): assert len(self.mutation_names) <= 1 return self.mutation_names - def fill_non_provided_args(self, args, kwargs, convert_val_to_str=False): - assert isinstance(args, (list, tuple)) - if isinstance(args, tuple): - args = list(args) - assert hasattr(self, "args_default_value") - n_args = len(args) - n_pos_args = len(self.args_default_value) - # For cpp wrapper, if some positional args are not provided, we need to check - # if they're in the kwargs or use their default value - if n_args < n_pos_args: - log.debug( - "%s has %d unprovided positional arguments. 
" - "Will check if they are in the keyword arguments or will use default values.", - self.op_overload, - n_pos_args - n_args, - ) - pos_args = [ - self.get_pos_arg_value(i, kwargs) for i in range(n_args, n_pos_args) - ] - if convert_val_to_str: - pos_args = [V.graph.wrapper_code.val_to_arg_str(x) for x in pos_args] - args.extend(pos_args) - return args - # ProxyExecutor Design Note # We export the ExternFallbackNodes (for custom ops) into a serialized file # and run it with a host side proxy executor to address the ABI problem @@ -5539,15 +5523,6 @@ def codegen(self, wrapper): if kernel.namespace == "aten": # type: ignore[union-attr] # Aten Fallback Ops assert isinstance(kernel, torch._ops.OpOverload) - - if ( - kernel == aten.mul.Tensor - and len(self.inputs) == 1 - and len(self.constant_args) == 1 - ): - # When aten.mul.Tensor's second arg is constant, cpp wrapper expects to call mul_Scalar - kernel = aten.mul.Scalar - if V.graph.cpp_wrapper: if ( config.is_fbcode() @@ -5562,10 +5537,6 @@ def codegen(self, wrapper): ) self.use_runtime_dispatch = True self.set_cpp_kernel(kernel) - else: - self.cpp_kernel_name = get_aten_cpp_kernel_name(kernel) - schema = kernel._schema # type: ignore[union-attr] - self.init_args_default_value(schema) else: self.python_kernel_name = str(kernel) elif kernel.namespace == "_quantized": # type: ignore[union-attr] @@ -6280,7 +6251,7 @@ def codegen(self, wrapper): ) @classmethod - def create(cls, x, packed_w, orig_w, B, batch_size): + def create(cls, x, packed_w, orig_w, batch_size): x = cls.require_stride1(cls.realize_input(x)) orig_w = cls.require_stride1(cls.realize_input(orig_w)) *m, _ = x.get_size() @@ -6288,11 +6259,7 @@ def create(cls, x, packed_w, orig_w, B, batch_size): output_size = list(m) + [oc] output_stride = make_contiguous_strides_for(output_size) inputs = [x, packed_w, orig_w] - constant_args = [batch_size] - if B is not None: - inputs += [B] - else: - constant_args.insert(0, None) + constant_args = [None, batch_size] return MKLPackedLinear( layout=FixedLayout( @@ -7191,6 +7158,232 @@ def create( ) +class QLinearPointwiseBinaryPT2E(ExternKernelAlloc): + def __init__( + self, + layout, + inputs, + constant_args=(), + has_bias=True, + x_scale_zp_are_tensors=False, + ): + """ + if bias is not None + - inputs = [x, w, b, weight_scale, weight_zp, x2] + - const_args is: [x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + else + - inputs = [x, w, weight_scale, weight_zp, x2] + - const_args is: [bias, x_scale, x_zp, o_inv_scale, o_zp, + fp32_output, binary_attr, aplha, unary_attr, unary_scalars, unary_algorithm] + """ + self.has_bias = has_bias + self.x_scale_zp_are_tensors = x_scale_zp_are_tensors + super().__init__( + layout, + inputs, + constant_args, + None, + python_kernel_name=( + "torch.ops.onednn.qlinear_pointwise.binary_tensor" + if x_scale_zp_are_tensors + else "torch.ops.onednn.qlinear_pointwise.binary" + ), + cpp_kernel_name="onednn::qlinear_pointwise", + ) + self.cpp_kernel_overload_name = ( + "binary_tensor" if x_scale_zp_are_tensors else "binary" + ) + self.cpp_kernel_key = "qlinear_pointwise_binary" + x_scale_type_str, x_zp_type_str = ( + ("at::Tensor", "at::Tensor") + if x_scale_zp_are_tensors + else ("double", "int64_t") + ) + self.cpp_op_schema = f""" + at::Tensor( + at::Tensor act, + {x_scale_type_str} act_scale, + {x_zp_type_str} act_zero_point, + at::Tensor weight, + at::Tensor weight_scales, + at::Tensor weight_zero_points, + c10::optional bias, + double 
inv_output_scale, + int64_t output_zero_point, + c10::optional<c10::ScalarType> output_dtype, + c10::optional<at::Tensor> other, + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, + double binary_alpha, + c10::string_view unary_post_op, + torch::List<c10::optional<at::Scalar>> unary_post_op_args, + c10::string_view unary_post_op_algorithm)""" + + def codegen(self, wrapper): + # Parse the inputs and constant args + args = [x.codegen_reference() for x in self.inputs] + const_args = [] + const_args.extend(self.codegen_const_args()) + + x = args[0] + packed_weight = args[1] + bias = args[2] if self.has_bias else const_args[0] + w_scale, w_zp, other = args[-3], args[-2], args[-1] + if self.x_scale_zp_are_tensors: + assert len(args) >= 5 + x_scale, x_zp = args[-5], args[-4] + ( + o_inv_scale, + o_zp, + output_dtype, + other_scale, + other_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-10:] + else: + assert len(const_args) >= 8 + ( + x_scale, + x_zp, + o_inv_scale, + o_zp, + output_dtype, + other_scale, + other_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) = const_args[-12:] + + codegen_args = ( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + o_inv_scale, + o_zp, + output_dtype, + other, + other_scale, + other_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) + wrapper.generate_extern_kernel_alloc_and_find_schema_if_needed( + self.get_name(), + self.python_kernel_name, + self.cpp_kernel_name, + codegen_args, + self.cpp_op_schema, + self.cpp_kernel_key, + self.cpp_kernel_overload_name, + ) + if isinstance(self.layout, Layout): + self.codegen_size_asserts(wrapper) + + @classmethod + def create( + cls, + x: "TensorBox", + x_scale: float, + x_zp: int, + weight: "TensorBox", # packed_weight + w_scale: "TensorBox", + w_zp: "TensorBox", + bias: "TensorBox", + o_inv_scale: float, + output_zero_point: int, + output_dtype, + other: "TensorBox", + other_scale, + other_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ): + ( + inputs, + constant_args, + kernel_layout, + req_stride_order, + ) = _prepare_linear_fusion_create( + cls, + x, + weight, + bias, + ) + + if isinstance(x_scale, TensorBox) and isinstance(x_zp, TensorBox): + x_scale.realize() + x_zp.realize() + inputs = inputs + [x_scale, x_zp] + x_scale_zp_are_tensors = True + else: + assert isinstance(x_scale, float) and isinstance(x_zp, int) + constant_args = constant_args + [x_scale, x_zp] + x_scale_zp_are_tensors = False + w_scale.realize() + w_zp.realize() + inputs = inputs + [w_scale, w_zp] + if binary_attr == "sum": + other = cls.require_stride_order(other, req_stride_order) + inputs.append(other) + constant_args = constant_args + [ + o_inv_scale, + output_zero_point, + output_dtype, + other_scale, + other_zp, + binary_attr, + alpha, + unary_attr, + may_convert_to_optional(unary_scalars), + unary_algorithm, + ] + + if binary_attr == "sum": + packed = QLinearPointwiseBinaryPT2E( + layout=NoneLayout(other.get_device()), + inputs=inputs, + constant_args=constant_args, + has_bias=(bias is not None), + x_scale_zp_are_tensors=x_scale_zp_are_tensors, + ) + mark_node_as_mutating(packed, other) + # Return other since it has been changed in place.
+ + return packed.inputs[-1] + + if output_dtype is not None: + assert output_dtype in [torch.float32, torch.bfloat16] + # in _prepare_linear_fusion_create, we use x.dtype (uint8) to create kernel_layout + # if we set fp32_output, the output buf should be dtype float32 instead of uint8. + kernel_layout.dtype = output_dtype + + return QLinearPointwiseBinaryPT2E( + layout=kernel_layout, + inputs=inputs, + constant_args=constant_args, + has_bias=(bias is not None), + x_scale_zp_are_tensors=x_scale_zp_are_tensors, + ) + + @dataclasses.dataclass class MutableBox(IRNode): """ diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py index 15a99faa7b37..32dff9d46668 100644 --- a/torch/_inductor/kernel/flex_attention.py +++ b/torch/_inductor/kernel/flex_attention.py @@ -1,17 +1,39 @@ """ Triton Implementation of the flex_attention Kernel""" + import logging -from typing import Any, List +import math +from enum import auto, Enum +from typing import Any, List, Tuple import torch +from torch._prims_common import make_contiguous_strides_for from .. import config -from ..lowering import empty_strided, lowerings, register_lowering +from ..ir import ( + ComputedBuffer, + FixedLayout, + FlexibleLayout, + InputBuffer, + IRNode, + StorageBox, + Subgraph, + TensorBox, +) +from ..lowering import empty_strided, full, lowerings, register_lowering from ..select_algorithm import autotune_select_algorithm, TritonTemplate log = logging.getLogger(__name__) aten = torch.ops.aten -def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): +class SubgraphType(Enum): + """The type of subgraph for which we want to generate an output buffer.""" + + FWD = auto() # Forward pass + JOINT_FWD = auto() # The recompute step of the bwd kernel + JOINT_BWD = auto() # The bwd pass of the joint + + +def flex_attention_grid(batch_size, num_heads, num_queries, d_model, meta): """How is this kernel parallelized? We create a grid of (batch_size * num_heads, ceil_div(n_queries, query_block_size), 1) Each block is responsible for iterating over blocks of keys and values calculating @@ -22,9 +44,117 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): return (triton.cdiv(num_queries, meta["BLOCK_M"]), batch_size * num_heads, 1) -sdpa_template = TritonTemplate( - name="sdpa", - grid=sdpa_grid, +def create_placeholder( + name: str, dtype: torch.dtype, device: torch.device +) -> TensorBox: + """Creates a placeholder input buffer for producing subgraph_output.""" + input_buffer = InputBuffer(name, FixedLayout(device, dtype, [1], [1])) + return TensorBox.create(input_buffer) + + +def index_to_other_buffers(cnt: int, graph_type: SubgraphType) -> int: + """This function needs to be aware of the signatures for flex_attention_forward + and flex_attention_backward.
If new args are added, or the signature changes, + be sure to update the indexing math + + Args: + cnt (int): The current index of the placeholder node + graph_type (SubgraphType): Whether this subgraph is the forward graph or part of the joint graph + """ + # Current fwd_args = [query, key, value, score_mod, *other_buffers] + # For fwd graphs we have 5 dummy values, so when the first lifted arg + # is seen, cnt = 5, while the start of other_buffers is at args[4]; + # thus we subtract 1 from the current cnt + if graph_type == SubgraphType.FWD: + return cnt - 1 + + # Current bwd_args = [q, k, v, out, lse, grad_out, fw_graph, joint_graph, *other_buffers] + # We have 5 dummy values but the start of other_buffers is at index 8 + if graph_type == SubgraphType.JOINT_FWD: + return cnt + 3 + + # Same bwd args but now with 6 dummy values while other_buffers still start at 8 + if graph_type == SubgraphType.JOINT_BWD: + return cnt + 2 + + +def build_subgraph_buffer( + args: Tuple[IRNode], + placeholder_inps: List[TensorBox], + subgraph: Subgraph, + graph_type: SubgraphType, +) -> ComputedBuffer: + """This function's goal is to take in the required args and produce the subgraph buffer + The subgraph buffer is a ComputedBuffer that will be inlined into the triton template + + Args: + args: The args that were passed into the flex_attention kernel + placeholder_inps: The list of scalar inputs, these were created on the fly through `create_placeholder` + subgraph: The Subgraph ir for which to produce the output node + graph_type: The type of subgraph for which we want to produce the output node, see enum above for details + """ + cnt = 0 + env = {} + for node in subgraph.graph_module.graph.nodes: + # There are two classes of placeholder inputs that we need + # to handle differently. For the first n_scalar_inps inputs + # we expect that these placeholders were generated by the make_fx call + # in the flex Attention HOP. So we need to create a new placeholder + # TensorBox for each of these inputs. For the rest of the inputs we + # expect that these are lifted inputs that fill up the '*other_buffers' + # tuple and already have corresponding TensorBoxes passed in as args.
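
(A hypothetical walk-through of the offset arithmetic in index_to_other_buffers, assuming the signatures quoted in the comments above; the buffer names are made up.)

    # Forward graph: five scalar placeholders (score, b, h, m, n), so the first
    # lifted buffer is seen at cnt == 5 while other_buffers starts at args[4],
    # hence cnt - 1.
    fwd_args = ["query", "key", "value", "score_mod", "buf0", "buf1"]
    for cnt in (5, 6):
        print(fwd_args[cnt - 1])  # buf0, then buf1

    # Joint forward graph: same five placeholders, but other_buffers starts at
    # args[8], hence cnt + 3 (and cnt + 2 for the joint backward graph, which
    # carries six placeholders).
    bwd_args = ["q", "k", "v", "out", "lse", "grad_out", "fw_graph", "joint_graph", "buf0"]
    print(bwd_args[5 + 3])  # buf0
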
+ if node.op == "placeholder": + is_lifted_input = cnt >= len(placeholder_inps) + lifted_input_index = index_to_other_buffers(cnt, graph_type) + env[node] = ( + args[lifted_input_index] if is_lifted_input else placeholder_inps[cnt] + ) + cnt += 1 + elif node.op == "call_function": + # For call_function we use the default lowerings and pass in the + # already created TensorBoxes as args + from torch.utils._pytree import tree_map + + env[node] = lowerings[node.target]( + *tree_map(lambda x: env[x] if x in env else x, node.args) + ) + elif node.op == "output": + # For the output node we need to create a ComputedBuffer + # which represents the actual score modification + # The joint_graph's output should be of the form[grad_score, None, None, None, None] + # This is because only the 'score' requires grad and the other outputs are + # the non-differentiable index scalars + if graph_type == SubgraphType.FWD or graph_type == SubgraphType.JOINT_FWD: + output_node = node.args[0] + else: + output_node = node.args[0][0] + output_buffer = env[output_node] + assert isinstance(output_buffer, TensorBox), ( + "The output node for flex attention's subgraph must be a TensorBox, but got: ", + type(output_buffer), + ) + assert isinstance(output_buffer.data, StorageBox), ( + "The output node for the flex attention subgraph must be a StorageBox, but got: ", + type(output_buffer), + ) + # Create the ComputedBuffer directly that will be inlined into the modification block + subgraph_buffer = ComputedBuffer( + name=None, + layout=FlexibleLayout( + device=output_buffer.data.get_device(), + dtype=output_buffer.data.get_dtype(), + size=output_buffer.data.get_size(), + ), + data=output_buffer.data.data, # type: ignore[arg-type] + ) + return subgraph_buffer + + raise ValueError("TemplatedAttention was passed a subgraph with no output node!") + + +flex_attention_template = TritonTemplate( + name="flex_attention", + grid=flex_attention_grid, source=r""" {{def_kernel("Q", "K", "V", "LSE")}} # Sub notation for this kernel: @@ -118,6 +248,7 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): m = offs_m[:, None] n = start_n + offs_n[None, :] {{ modification( + subgraph_number=0, score="qk", b="off_hz // H", h="off_hz % H", @@ -162,7 +293,7 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): # TODO generalize and add proper mask support mask = (idx_m != -1) & (idx_d != -1) - {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc")}} + {{store_output(("idx_z", "idx_h", "idx_m", "idx_d"), "acc", "mask")}} # TODO dont want to write this if we dont require grad if OUTPUT_LOGSUMEXP: @@ -192,7 +323,7 @@ def sdpa_grid(batch_size, num_heads, num_queries, d_model, meta): } -def _get_default_config(query): +def _get_default_config_fwd(query) -> Tuple[int, int, int, int]: dtype = query.get_dtype() head_dim = query.get_size()[-1] default_config = None @@ -218,143 +349,394 @@ def _get_default_config(query): return default_config +def _get_default_config_bwd(query) -> Tuple[int, int, int, int]: + head_dim = query.get_size()[-1] + dtype = query.get_dtype() + + if head_dim <= 256 and torch.cuda.get_device_capability() >= (9, 0): # H100 + if dtype == torch.float32: + return (64, 64, 4, 1) + return (128, 128, 4, 3) + elif head_dim <= 256 and torch.cuda.get_device_capability() >= (8, 0): # A100 + return (32, 32, 4, 1) + else: # modest hardware or extremely large head_dim + return (32, 32, 4, 1) + + # TODO: We probably also need a layout constraint? 
@register_lowering(torch.ops.higher_order.flex_attention, type_promotion_kind=None) def flex_attention(*args, **kwargs): - from torch._prims_common import make_contiguous_strides_for - from ..ir import ( - ComputedBuffer, - FixedLayout, - FlexibleLayout, - InputBuffer, - StorageBox, - TensorBox, - ) - query, key, value, subgraph, *other_buffers = args + placeholder_inps = [ + create_placeholder(name, dtype, query.get_device()) + for name, dtype in [ + ("score", query.get_dtype()), + ("b", torch.int32), + ("h", torch.int32), + ("m", torch.int32), + ("n", torch.int32), + ] + ] + subgraph_buffer = build_subgraph_buffer( + args, placeholder_inps, subgraph, graph_type=SubgraphType.FWD + ) + layout = FixedLayout( + query.get_device(), + query.get_dtype(), + query.get_size(), + make_contiguous_strides_for(query.get_size()), + ) + # see NOTE:[TritonTemplates with multiple outputs] + logsumexp_shape = query.get_size()[:-1] # [B, H, M] + logsumexp = empty_strided( + logsumexp_shape, + None, + dtype=torch.float32, # The logsumexp is always stored in fp32 regardless of the input dtype + device=query.get_device(), + ) + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_fwd(query)) + if config.max_autotune: + configs += [ + (128, 64, 4, 3), + (128, 128, 4, 3), + (128, 128, 8, 2), + (64, 128, 4, 3), + (64, 64, 4, 3), + ] - def create_placeholder(name: str, dtype: torch.dtype) -> InputBuffer: - return TensorBox.create( - InputBuffer( - name, - FixedLayout( - query.get_device(), - dtype, - [ - 1, - ], - [ - 1, - ], - ), - ) + # Note, we don't need to pass in the captured buffers explicitly + # because they're implicitly added by the score_mod function + # We do need to explicitly pass it in for autotuning though. + for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: + flex_attention_template.maybe_append_choice( + choices=choices, + input_nodes=[query, key, value, logsumexp], + layout=layout, + subgraphs=[ + subgraph_buffer, + ], + mutated_inputs=[ + logsumexp, + ], + num_stages=num_stages, + num_warps=num_warps, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_DMODEL=query.get_size()[-1], + # For now, we always assume the "sound" option + SCORE_MOD_IS_LINEAR=False, + ROWS_GUARANTEED_SAFE=False, + OUTPUT_LOGSUMEXP=True, ) + inputs_for_autotuning = [query, key, value, logsumexp] + list(other_buffers) + return ( + autotune_select_algorithm( + "flex_attention", choices, inputs_for_autotuning, layout + ), + logsumexp, + ) - scalar_inps = ["score", "b", "h", "m", "n"] - env = {} - cnt = 0 - placeholder_inps = [ - create_placeholder(name, dtype) + +# ---------------------------- Backward HOP Implementation ---------------------------- + + +def flex_attention_backward_grid(batch_size, num_heads, num_key_value, d_model, meta): + """How is this kernel parallelized? + Currently this is only parallelizing over batch * num_heads, but we can, and want to + parallelize over ceil_div(num_key_value, key_value_block_size). To do this will either require + atomic updates to some grad values or to have a two pass kernel design. 
+ """ + return (batch_size * num_heads, 1, 1) + + +flex_attention_backward_template = TritonTemplate( + name="flex_attention_backward", + grid=flex_attention_backward_grid, + source=r""" +{{def_kernel("Q", "K", "V", "OUT", "LSE", "DELTA", "DO", "DQ", "DV")}} + # Sub notation for this kernel: + # Q: Query, K: Key, V: Value + # OUT: Forward output, LSE: logsumexp (logsumexp is always stored in fp32 regardless of the input dtype) + # DELTA: Precomputed sum(OUT* DO, axis=1) + # DO: Derivative of Output, DQ: Derivative of Query, DV: Derivative of Value + # DK: Derivative of Key, is the written to via the store_output call due to some limitations with + # inductor codegen + # M: Number of queries, N: Number of keys/values, D: Model dimension + # z: Batch size, h: Number of heads, m: Number of queries per head, k: Number of keys per head + # (Modifiable) Config options: + # BLOCK_M + # BLOCK_N + # SCORE_MOD_IS_LINEAR: Is the score modifier linear? If so, we can lift the + # change of base out of the loop + # ROWS_GUARANTEED_SAFE: Is it guaranteed that at least one value in each row + # is not masked out? If so, we can skip an extra safety check + # OUTPUT_LOGSUMEXP: We only need to store the logsumexp if we require grad + + # Define Q Strides + stride_qz = {{stride("Q", 0)}} + stride_qh = {{stride("Q", 1)}} + stride_qm = {{stride("Q", 2)}} + stride_qk = {{stride("Q", 3)}} + # Define K Strides + stride_kz = {{stride("K", 0)}} + stride_kh = {{stride("K", 1)}} + stride_kn = {{stride("K", 2)}} + stride_kk = {{stride("K", 3)}} + # Define V Strides + stride_vz = {{stride("V", 0)}} + stride_vh = {{stride("V", 1)}} + stride_vn = {{stride("V", 2)}} + stride_vk = {{stride("V", 3)}} + + Z = {{size("Q", 0)}} + H = {{size("Q", 1)}} + N_CTX = {{size("Q", 2)}} + + qk_scale = 1.0 + MATMUL_PRECISION = Q.dtype.element_ty + + off_hz = tl.program_id(0) + off_z = off_hz // H # batch idx + off_h = off_hz % H # head idx + + # offset pointers for batch/head + Q += off_z * stride_qz + off_h * stride_qh + K += off_z * stride_kz + off_h * stride_kh + V += off_z * stride_vz + off_h * stride_vh + + # Asserting contiguous for now... 
+ + # Asserting contiguous for now... + DO += off_z * stride_qz + off_h * stride_qh + DQ += off_z * stride_qz + off_h * stride_qh + DV += off_z * stride_vz + off_h * stride_vh + + # TODO I think that this should be N_CTX/BLOCK_N blocks + for start_n in range(0, NUM_Q_BLOCKS): + # We are not doing the causal optimization yet, which would allow us to start further down the + # kv column + offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N) + offs_m = tl.arange(0, BLOCK_M) + offs_k = tl.arange(0, BLOCK_DMODEL) + + # initialize pointers to value-like data + q_ptrs = Q + (offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk) + k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk) + v_ptrs = V + (offs_n[:, None] * stride_vn + offs_k[None, :] * stride_vk) + do_ptrs = DO + (offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk) + dq_ptrs = DQ + (offs_m[:, None] * stride_qm + offs_k[None, :] * stride_qk) + + # pointer to row-wise quantities in value-like data + D_ptrs = DELTA + off_hz * N_CTX + l_ptrs = LSE + off_hz * N_CTX + + # initialize dv and dk + dv = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32) + dk = tl.zeros([BLOCK_N, BLOCK_DMODEL], dtype=tl.float32) + + # Key and Value stay in SRAM throughout + k = tl.load(k_ptrs) + v = tl.load(v_ptrs) + + for start_m in range(0, NUM_Q_BLOCKS * BLOCK_M, BLOCK_M): + offs_m_curr = start_m + offs_m + + # load q, k, v, do on-chip + q = tl.load(q_ptrs) + + if SCORE_MOD_IS_LINEAR: + qk_scale *= 1.44269504 + q = (q * qk_scale).to(MATMUL_PRECISION) + + # -- compute qk --- + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk = tl.dot(q, tl.trans(k.to(MATMUL_PRECISION)), acc=qk) + pre_mod_scores = qk + # ~~~~~~~~~~~~~~~~~~~ Apply score modification ~~~~~~~~~~~~~~~~~~~ + m = offs_m_curr[:, None] + n = offs_n[None, :] + {{ modification( + subgraph_number=0, + score="qk", + b="off_z", + h="off_h", + m="m", + n="n", + out="qk" + ) | indent_except_first(3) }} + # TODO: In the case that score_mod is linear, this can be LICMed + if not SCORE_MOD_IS_LINEAR: + qk *= 1.44269504 + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + l_i = tl.load(l_ptrs + offs_m_curr) + p = tl.math.exp2(qk - l_i[:, None]) + + # compute dv + do = tl.load(do_ptrs) + dv += tl.dot(tl.trans(p.to(MATMUL_PRECISION)), do) + + # compute dp = dot(v, do) + Di = tl.load(D_ptrs + offs_m_curr) # [BLOCK_M, 1] + + # compute ds = p * (dp - delta[:, None]) + dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None] + dp += tl.dot(do, tl.trans(v)) + ds = p * dp + + # ~~~~~~~~~~~~~~~~~~~ Apply joint modification ~~~~~~~~~~~~~~~~~~~ + {{ modification( + subgraph_number=1, + score="pre_mod_scores", + b="off_z", + h="off_h", + m="m", + n="n", + out="ds" + ) | indent_except_first(3) }} + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # compute dk = dot(ds.T, q) + dk += tl.dot(tl.trans(ds.to(MATMUL_PRECISION)), q) + # compute dq + dq = tl.load(dq_ptrs) + dq += tl.dot(ds.to(MATMUL_PRECISION), k) + + # Store grad_query + tl.store(dq_ptrs, dq) + + # increment pointers + dq_ptrs += BLOCK_M * stride_qm + q_ptrs += BLOCK_M * stride_qm + do_ptrs += BLOCK_M * stride_qm + + # write-back + index_n = offs_n[:, None] + index_k = offs_k[None, :] + + # Store grad_key and grad_value + dv_ptrs = DV + (index_n * stride_vn + index_k * stride_vk) + tl.store(dv_ptrs, dv) + + # TODO generalize and add proper mask support + mask = (index_n != -1) & (index_k != -1) + {{store_output(("off_z", "off_h", "index_n", "index_k"), "dk", "mask", indent_width=8)}} + + """, +) + + +# TODO: We probably also need a
layout constraint? +@register_lowering( + torch.ops.higher_order.flex_attention_backward, type_promotion_kind=None +) +def flex_attention_backward(*args, **kwargs): + ( + query, + key, + value, + out, + logsumexp, + grad_out, + fw_graph, + joint_graph, + *other_buffers, + ) = args + + device = query.get_device() + dtype = query.get_dtype() + + fwd_placeholder_inps = [ + create_placeholder(name, dtype, device) for name, dtype in [ - ("score", query.get_dtype()), + ("score", dtype), ("b", torch.int32), ("h", torch.int32), ("m", torch.int32), ("n", torch.int32), ] ] - for node in subgraph.graph_module.graph.nodes: - # There are two classes of placeholder inpts that we need - # to handle differently. For the first n_scalar_inps inputs - # we expect that these placeholders were generated by the make_fx call - # in the flex Attention HOP. So we need to create a new placeholder - # TensorBox for each of these inputs. For the rest of the inputs we - # expect that these are lifted inputs that fill up the '*other_buffers' - # tuple and already have corresponding TensorBoxes passed in as args. - if node.op == "placeholder": - is_lifted_input = cnt >= len(scalar_inps) - env[node] = args[cnt - 1] if is_lifted_input else placeholder_inps[cnt] - cnt += 1 - elif node.op == "call_function": - # For call_function we use the defulat lowerings and pass in the - # already created TensorBoxes as args - from torch.utils._pytree import tree_map + fw_subgraph_buffer = build_subgraph_buffer( + args, fwd_placeholder_inps, fw_graph, graph_type=SubgraphType.JOINT_FWD + ) - env[node] = lowerings[node.target]( - *tree_map(lambda x: env[x] if x in env else x, node.args) - ) - elif node.op == "output": - # For the output node we need to create a ComputedBuffer - # which represents the actual score modification + joint_placeholder_inps = fwd_placeholder_inps + [ + create_placeholder("out", dtype, device) + ] + joint_subgraph_buffer = build_subgraph_buffer( + args, joint_placeholder_inps, joint_graph, graph_type=SubgraphType.JOINT_BWD + ) - output_buffer = env[node.args[0]] - assert isinstance(output_buffer.data, StorageBox), ( - "The output node for the flex attention subgraph must be a StorageBox, but got: ", - type(output_buffer), - ) - # Create the ComputedBuffer directly that will be inlined into the modification block - subgraph_buffer = ComputedBuffer( - name=None, - layout=FlexibleLayout( - device=output_buffer.data.get_device(), - dtype=output_buffer.data.get_dtype(), - size=output_buffer.data.get_size(), - ), - data=output_buffer.data.data, # type: ignore[arg-type] - ) + layout_k = FixedLayout( + key.get_device(), + key.get_dtype(), + key.get_size(), + make_contiguous_strides_for(key.get_size()), + ) - layout = FixedLayout( - output_buffer.get_device(), - query.get_dtype(), - query.get_size(), - make_contiguous_strides_for(query.get_size()), - ) - # see NOTE:[TritonTemplates with multiple outputs] - logsumexp_shape = query.get_size()[:-1] # [B, H, M] - logsumexp = empty_strided( - logsumexp_shape, - None, - dtype=torch.float32, # The logsumexp is always stored in fp32 regardless of the input dtype - device=output_buffer.get_device(), - ) - choices: List[Any] = [] - configs: List[Any] = [] - configs.append(_get_default_config(query)) - if config.max_autotune: - configs += [ - (128, 64, 4, 3), - (128, 128, 4, 3), - (128, 128, 8, 2), - (64, 128, 4, 3), - (64, 64, 4, 3), - ] - # Note, we don't need to pass in the captured buffers explicitly - # because they're implicitly added by the score_mod function - # We do need 
to explicitly pass it in for autotuning though. - for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: - sdpa_template.maybe_append_choice( - choices=choices, - input_nodes=[query, key, value, logsumexp], - layout=layout, - subgraphs=subgraph_buffer, - mutated_inputs=[ - logsumexp, - ], - num_stages=num_stages, - num_warps=num_warps, - BLOCK_M=BLOCK_M, - BLOCK_N=BLOCK_N, - BLOCK_DMODEL=query.get_size()[-1], - # For now, we always assume the "sound" option - SCORE_MOD_IS_LINEAR=False, - ROWS_GUARANTEED_SAFE=False, - OUTPUT_LOGSUMEXP=True, - ) - inputs_for_autotuning = [query, key, value, logsumexp] + list(other_buffers) - return ( - autotune_select_algorithm( - "sdpa", choices, inputs_for_autotuning, layout - ), + # Create delta which is needed for the bwd's kernel + mul_delta = lowerings[aten.mul](out, grad_out) + delta = lowerings[aten.sum](mul_delta, axis=-1) + + # see NOTE:[TritonTemplates with multiple outputs] + grad_query = full( + query.get_size(), 0.0, dtype=dtype, device=device + ) # torch.zeros equivalent + grad_query.realize() + grad_value = empty_strided(value.get_size(), None, dtype=dtype, device=device) + + choices: List[Any] = [] + configs: List[Tuple[int, int, int, int]] = [] + configs.append(_get_default_config_bwd(query)) + if config.max_autotune: + configs += [ + (128, 128, 4, 3), + (128, 128, 8, 1), + (64, 64, 4, 3), + (64, 64, 8, 1), + ] + + for BLOCK_M, BLOCK_N, num_warps, num_stages in configs: + flex_attention_backward_template.maybe_append_choice( + choices=choices, + input_nodes=[ + query, + key, + value, + out, logsumexp, - ) - raise ValueError("TemplatedAttention was passed a subgraph with no output node!") + delta, + grad_out, + grad_query, + grad_value, + ], + layout=layout_k, # We use store_output only for grad_key + subgraphs=[fw_subgraph_buffer, joint_subgraph_buffer], + mutated_inputs=[grad_query, grad_value], + num_stages=num_stages, + num_warps=num_warps, + BLOCK_M=BLOCK_M, + BLOCK_N=BLOCK_N, + BLOCK_DMODEL=query.get_size()[-1], + NUM_Q_BLOCKS=math.ceil(query.get_size()[-2] / BLOCK_M), + # For now, we always assume the "sound" option + SCORE_MOD_IS_LINEAR=False, + ) + inputs_for_autotuning = [ + query, + key, + value, + out, + logsumexp, + delta, + grad_out, + grad_query, + grad_value, + ] + list(other_buffers) + + grad_key = autotune_select_algorithm( + "flex_attention_backward", choices, inputs_for_autotuning, layout_k + ) + return ( + grad_query, + grad_key, + grad_value, + ) diff --git a/torch/_inductor/kernel/mm.py b/torch/_inductor/kernel/mm.py index fa14b4406de6..593da39d2bf6 100644 --- a/torch/_inductor/kernel/mm.py +++ b/torch/_inductor/kernel/mm.py @@ -3,7 +3,6 @@ from typing import Any, Dict, List, Optional import torch -from torch._inductor.codegen.cpp_gemm_template import CppPackedGemmTemplate from torch._inductor.virtualized import V from ..
import config as inductor_config from ..codegen.cuda.gemm_template import CUTLASSGemmTemplate @@ -18,7 +17,6 @@ ) from ..utils import ( use_aten_gemm_kernels, - use_cpp_packed_gemm_template, use_cutlass_template, use_max_autotune, use_triton_template, @@ -158,13 +156,6 @@ def tuned_mm(mat1, mat2, *, layout=None): if static_shape and is_nonzero and use_cutlass_template(layout, m, n, k): CUTLASSGemmTemplate.add_cutlass_gemm_choices(choices, layout, [mat1, mat2]) - if use_cpp_packed_gemm_template(layout, mat1, mat2): - CppPackedGemmTemplate.add_choices( - choices, - layout, - [mat1, mat2], - ) - if len(choices) == 0 and not use_aten_gemm_kernels(): log.warning("No choices for GEMM, using ATen backend as fallback") choices.append(aten_mm.bind((mat1, mat2), aten_layout)) @@ -320,15 +311,6 @@ def tuned_addmm(inp, mat1, mat2, *, alpha=1, beta=1, layout=None): beta=beta, ) - if use_cpp_packed_gemm_template(layout, mat1, mat2): - CppPackedGemmTemplate.add_choices( - choices, - layout, - [inp_expanded, mat1, mat2], - alpha=alpha, - beta=beta, - ) - add_aten_fallback = False if len(choices) == 0: log.warning("No choices for GEMM, using ATen backend as fallback") diff --git a/torch/_inductor/kernel/mm_common.py b/torch/_inductor/kernel/mm_common.py index 26d08183b0e5..76511e19a49d 100644 --- a/torch/_inductor/kernel/mm_common.py +++ b/torch/_inductor/kernel/mm_common.py @@ -1,4 +1,5 @@ import functools +import itertools import logging from typing import cast, List, Tuple @@ -113,39 +114,50 @@ def filtered_configs( # List of dictionaries to store the kernel configs. Configs that evaluate to true -# will be utilised on the target platform -mm_kernel_configs = [ - # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps" - {"config": (16, 32, 16, 3, 2), "cond": True}, - {"config": (16, 32, 32, 4, 2), "cond": True}, - {"config": (16, 32, 32, 5, 2), "cond": True}, - {"config": (32, 32, 16, 1, 2), "cond": True}, - {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None}, - {"config": (32, 64, 32, 5, 8), "cond": True}, - {"config": (64, 32, 32, 5, 8), "cond": True}, - {"config": (64, 32, 128, 5, 4), "cond": True}, - {"config": (64, 64, 16, 2, 4), "cond": True}, - {"config": (64, 64, 32, 2, 4), "cond": True}, - {"config": (64, 64, 64, 3, 8), "cond": True}, - {"config": (64, 64, 128, 3, 4), "cond": True}, - {"config": (64, 64, 128, 5, 4), "cond": True}, - {"config": (64, 128, 32, 3, 4), "cond": True}, - {"config": (64, 128, 32, 4, 8), "cond": True}, - {"config": (64, 128, 64, 4, 4), "cond": True}, - {"config": (64, 128, 128, 4, 4), "cond": True}, - {"config": (128, 64, 32, 2, 2), "cond": True}, - {"config": (128, 64, 32, 3, 4), "cond": True}, - {"config": (128, 64, 32, 4, 8), "cond": True}, - {"config": (128, 64, 64, 3, 8), "cond": True}, - {"config": (128, 64, 128, 4, 8), "cond": True}, - {"config": (128, 128, 32, 2, 8), "cond": True}, - {"config": (128, 128, 32, 3, 4), "cond": True}, - {"config": (128, 128, 32, 4, 4), "cond": True}, - {"config": (128, 128, 64, 3, 4), "cond": True}, - {"config": (128, 128, 64, 3, 8), "cond": True}, - {"config": (128, 128, 64, 5, 4), "cond": True}, - {"config": (128, 128, 64, 5, 8), "cond": True}, -] +# will be utilised on the target platform. 
The configs are as follows: +# (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps) +mm_kernel_configs = ( + [ + {"config": (16, 32, 16, 3, 2), "cond": True}, + {"config": (16, 32, 32, 4, 2), "cond": True}, + {"config": (16, 32, 32, 5, 2), "cond": True}, + {"config": (32, 32, 16, 1, 2), "cond": True}, + {"config": (32, 32, 128, 2, 4), "cond": torch.version.hip is None}, + {"config": (32, 64, 32, 5, 8), "cond": True}, + {"config": (64, 32, 32, 5, 8), "cond": True}, + {"config": (64, 32, 128, 5, 4), "cond": True}, + {"config": (64, 64, 16, 2, 4), "cond": True}, + {"config": (64, 64, 32, 2, 4), "cond": True}, + {"config": (64, 64, 64, 3, 8), "cond": True}, + {"config": (64, 64, 128, 3, 4), "cond": True}, + {"config": (64, 64, 128, 5, 4), "cond": True}, + {"config": (64, 128, 32, 3, 4), "cond": True}, + {"config": (64, 128, 32, 4, 8), "cond": True}, + {"config": (64, 128, 64, 4, 4), "cond": True}, + {"config": (64, 128, 128, 4, 4), "cond": True}, + {"config": (128, 64, 32, 2, 2), "cond": True}, + {"config": (128, 64, 32, 3, 4), "cond": True}, + {"config": (128, 64, 32, 4, 8), "cond": True}, + {"config": (128, 64, 64, 3, 8), "cond": True}, + {"config": (128, 64, 128, 4, 8), "cond": True}, + {"config": (128, 128, 32, 2, 8), "cond": True}, + {"config": (128, 128, 32, 3, 4), "cond": True}, + {"config": (128, 128, 32, 4, 4), "cond": True}, + {"config": (128, 128, 64, 3, 4), "cond": True}, + {"config": (128, 128, 64, 3, 8), "cond": True}, + {"config": (128, 128, 64, 5, 4), "cond": True}, + {"config": (128, 128, 64, 5, 8), "cond": True}, + ] + if inductor_config.max_autotune_gemm_search_space != "EXHAUSTIVE" + else [ + {"config": (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps), "cond": True} + for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product( + [16, 32, 64, 128, 256], repeat=3 + ) + for num_stages in [1, 2, 3, 4, 5] + for num_warps in [2, 4, 8] + ] +) int8_mm_kernel_configs = [ {"config": (64, 64, 32, 2, 4), "cond": True}, diff --git a/torch/_inductor/lowering.py b/torch/_inductor/lowering.py index 389ff16e3902..07899fe2ccd0 100644 --- a/torch/_inductor/lowering.py +++ b/torch/_inductor/lowering.py @@ -1788,7 +1788,12 @@ def bernoulli_(x, *args): "cpu" ), "this should be handled in decomps unless config.fallback_random or the device is CPU" x.realize() - ir.InplaceBernoulliFallback(x, *args) + op_overload = ( + aten.bernoulli_.float + if len(args) == 0 or isinstance(args[0], float) + else aten.bernoulli_.Tensor + ) + ir.InplaceBernoulliFallback(op_overload, x, *args) return x @@ -4554,7 +4559,7 @@ def fn(idx): factor = ops.index_expr(hend - hstart, torch.int32) divide_factors.append(factor) divide_factor = functools.reduce(ops.mul, divide_factors) - return ops.div(fn_sum(idx, x_loader), divide_factor) + return ops.truediv(fn_sum(idx, x_loader), divide_factor) rv = Pointwise.create( device=x.get_device(), @@ -5318,7 +5323,7 @@ def log_add_exp_helper(a_tuple, b_tuple): def cummax(x, axis=None): if len(x.get_size()) == 0: assert axis in [0, -1] - return clone(x), torch.empty_like(x, dtype=torch.int64) + return clone(x), empty_like(x, dtype=torch.int64) dtype = x.get_dtype() combine_fn = ir.get_reduction_combine_fn( @@ -5348,7 +5353,7 @@ def cummax(x, axis=None): def cummin(x, axis=None): if len(x.get_size()) == 0: assert axis in [0, -1] - return clone(x), torch.empty_like(x, dtype=torch.int64) + return clone(x), empty_like(x, dtype=torch.int64) dtype = x.get_dtype() combine_fn = ir.get_reduction_combine_fn( diff --git a/torch/_inductor/mkldnn_lowerings.py b/torch/_inductor/mkldnn_lowerings.py index 
399cb1668dad..5a12a5c090bf 100644 --- a/torch/_inductor/mkldnn_lowerings.py +++ b/torch/_inductor/mkldnn_lowerings.py @@ -1,22 +1,10 @@ -from typing import List, Optional +from typing import List import torch import torch.utils._pytree as pytree -from torch._inductor.kernel.mm_common import mm_args from . import ir -from .codegen.cpp_gemm_template import CppPackedGemmTemplate from .ir import TensorBox -from .lowering import ( - add, - add_needs_realized_inputs, - aten, - permute, - register_lowering, - to_dtype, -) -from .select_algorithm import autotune_select_algorithm, ExternKernelChoice -from .utils import use_aten_gemm_kernels, use_cpp_packed_gemm_template, use_max_autotune -from .virtualized import V +from .lowering import add, add_needs_realized_inputs, aten, register_lowering, to_dtype def register_onednn_fusion_ops(): @@ -350,13 +338,71 @@ def qlinear_unary( ) ) - if torch._C.has_mkl: - aten_mkl_linear = ExternKernelChoice( - torch.ops.mkl._mkl_linear, - "mkl::_mkl_linear", - has_out_variant=False, - kernel_creator=ir.MKLPackedLinear.create, + @register_lowering( + torch.ops.onednn.qlinear_pointwise.binary, type_promotion_kind=None + ) + @register_lowering( + torch.ops.onednn.qlinear_pointwise.binary_tensor, type_promotion_kind=None + ) + def qlinear_binary( + x: TensorBox, + x_scale, + x_zp, + packed_weight: TensorBox, + w_scale: TensorBox, + w_zp: TensorBox, + bias: TensorBox, + o_inv_scale, + o_zero_point, + output_dtype, + x2: TensorBox, + x2_scale, + x2_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ): + if binary_attr == "sum": + if output_dtype in [ + torch.float32, + torch.bfloat16, + ] and x2.get_dtype() in [torch.float32, torch.bfloat16]: + if x2.get_dtype() != output_dtype: + # For int8-mixed-bf16 quantization and inplace add, + # there is a case where the accum dtype is float32 but the output dtype is bfloat16. + # Since the accum will be changed in place by the post-op sum, + # we do the accum dtype conversion here. + x2 = to_dtype(x2, output_dtype) + else: + assert ( + x2.get_dtype() == output_dtype + ), "dtype of accum for qlinear post op sum should be the same as output" + return TensorBox.create( + ir.QLinearPointwiseBinaryPT2E.create( + x, + x_scale, + x_zp, + packed_weight, + w_scale, + w_zp, + bias, + o_inv_scale, + o_zero_point, + output_dtype, + x2, + x2_scale, + x2_zp, + binary_attr, + alpha, + unary_attr, + unary_scalars, + unary_algorithm, + ) ) + + if torch._C.has_mkl: cpu_needs_realized_inputs.append(torch.ops.mkl._mkl_linear) @register_lowering(torch.ops.mkl._mkl_linear) @@ -364,48 +410,11 @@ def mkl_packed_linear( x: TensorBox, packed_w: TensorBox, orig_w: TensorBox, - b: Optional[TensorBox], + b: TensorBox, batch_size, - *, - layout=None, ): - choices = ( - [ - aten_mkl_linear.bind( - (x, packed_w, orig_w), layout, B=None, batch_size=batch_size - ) - ] - if use_aten_gemm_kernels() - else [] - ) - if use_max_autotune(): - transposed_w = permute(orig_w, [1, 0]) - *_, layout, x, transposed_w = mm_args( - x, transposed_w, layout=layout - ) - if use_cpp_packed_gemm_template(layout, x, transposed_w): - CppPackedGemmTemplate.add_choices( - choices, - layout, - [x, packed_w, orig_w], - trans_w=True, - input_indices=[0, 2], - ) - - assert packed_w.get_name() in V.graph.constants - assert orig_w.get_name() in V.graph.constants - # packed_w is a mkldnn tensor which we can't generate directly - # so we use the weights from the original tensor in autotune.
- input_gen_fns = { - 1: lambda x: V.graph.constants[x.get_name()], - 2: lambda x: V.graph.constants[x.get_name()], - } - result: TensorBox = autotune_select_algorithm( - "packed_linear", - choices, - [x, packed_w, orig_w], - layout, - input_gen_fns=input_gen_fns, + result = TensorBox.create( + ir.MKLPackedLinear.create(x, packed_w, orig_w, batch_size) ) if b is not None: result = add(result, b) diff --git a/torch/_inductor/ops_handler.py b/torch/_inductor/ops_handler.py index 88f9d406c2e1..71395c71c9b6 100644 --- a/torch/_inductor/ops_handler.py +++ b/torch/_inductor/ops_handler.py @@ -17,7 +17,6 @@ import torch import torch.utils._pytree as pytree -from torch.fx.graph import inplace_methods, magic_methods from .utils import IndentedBuffer, reduction_num_outputs, sympy_index_symbol, sympy_str T = TypeVar("T") @@ -146,6 +145,12 @@ def to_dtype_bitcast(self, x: T, dtype: torch.dtype, src_dtype: torch.dtype) -> """ ... + def identity(self, x: T) -> T: + """ + Returns x as is. This is used to trigger CSE. + """ + ... + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # These operations are only available in a "kernel" context. Check # torch._inductor.codegen.common.CSEProxy for their typical implementation @@ -408,9 +413,6 @@ def to_int(self, x0: T) -> T: def trunc(self, x0: T) -> T: ... - def truncdiv(self, x0: T, x1: T) -> T: - ... - def ceil(self, x0: T) -> T: ... @@ -447,28 +449,195 @@ def sub(self, x0: T, x1: T) -> T: def mul(self, x0: T, x1: T) -> T: ... - def floordiv(self, x0: T, x1: T) -> T: + def pow(self, x0: T, x1: T) -> T: ... - def truediv(self, x0: T, x1: T) -> T: + def and_(self, x0: T, x1: T) -> T: ... - def div(self, x0: T, x1: T) -> T: + def or_(self, x0: T, x1: T) -> T: ... - def mod(self, x0: T, x1: T) -> T: + def xor(self, x0: T, x1: T) -> T: ... - def pow(self, x0: T, x1: T) -> T: + # These are metaprogrammed by MockHandler._init_cls + def lshift(self, x0: T, x1: T) -> T: ... - def and_(self, x0: T, x1: T) -> T: + def rshift(self, x0: T, x1: T) -> T: ... - def or_(self, x0: T, x1: T) -> T: + def getitem(self, x0: T, x1: T) -> T: + # TODO: this is probably just illegal lol ... - def xor(self, x0: T, x1: T) -> T: + def matmul(self, x0: T, x1: T) -> T: + # TODO: this is probably just illegal lol + ... + + def invert(self, x0: T) -> T: + ... + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # These are "special" operators. These only exist if the target + # language actually supports the operator. Keep this in sync with + # pointwise_overrides_data. + + def airy_ai(self, x: T) -> T: + ... + + def bessel_j0(self, x: T) -> T: + ... + + def bessel_j1(self, x: T) -> T: + ... + + def bessel_y0(self, x: T) -> T: + ... + + def bessel_y1(self, x: T) -> T: + ... + + def digamma(self, x: T) -> T: + ... + + def erfcx(self, x: T) -> T: + ... + + def fma(self, x: T, y: T, z: T) -> T: + ... + + def igamma(self, x: T, y: T) -> T: + ... + + def igammac(self, x: T, y: T) -> T: + ... + + def gammainc(self, x: T, y: T) -> T: + ... + + def gammaincc(self, x: T, y: T) -> T: + ... + + def i0(self, x: T) -> T: + ... + + def i0e(self, x: T) -> T: + ... + + def i1(self, x: T) -> T: + ... + + def i1e(self, x: T) -> T: + ... + + def log_ndtr(self, x: T) -> T: + ... + + def modified_bessel_i0(self, x: T) -> T: + ... + + def modified_bessel_i1(self, x: T) -> T: + ... + + def modified_bessel_k0(self, x: T) -> T: + ... + + def modified_bessel_k1(self, x: T) -> T: + ... + + def ndtr(self, x: T) -> T: + ... + + def ndtri(self, x: T) -> T: + ... 
+ + def polygamma(self, x: T, y: T) -> T: + ... + + def scaled_modified_bessel_k0(self, x: T) -> T: + ... + + def scaled_modified_bessel_k1(self, x: T) -> T: + ... + + def spherical_bessel_j0(self, x: T) -> T: + ... + + def zeta(self, x: T, y: T) -> T: + ... + + def chebyshev_polynomial_t(self, x: T, y: T) -> T: + ... + + def chebyshev_polynomial_u(self, x: T, y: T) -> T: + ... + + def chebyshev_polynomial_v(self, x: T, y: T) -> T: + ... + + def chebyshev_polynomial_w(self, x: T, y: T) -> T: + ... + + def legendre_polynomial_p(self, x: T, y: T) -> T: + ... + + def shifted_chebyshev_polynomial_t(self, x: T, y: T) -> T: + ... + + def shifted_chebyshev_polynomial_u(self, x: T, y: T) -> T: + ... + + def shifted_chebyshev_polynomial_v(self, x: T, y: T) -> T: + ... + + def shifted_chebyshev_polynomial_w(self, x: T, y: T) -> T: + ... + + def hermite_polynomial_h(self, x: T, y: T) -> T: + ... + + def hermite_polynomial_he(self, x: T, y: T) -> T: + ... + + def laguerre_polynomial_l(self, x: T, y: T) -> T: + ... + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + # These operators are a bit special, because they are conventionally + # natively supported in both Python and C, but the semantics differ so + # care must be taken + + def truncdiv(self, x0: T, x1: T) -> T: + """C-style trunc division between integers only. Computes the true + division of two numbers and rounds the result to zero. + """ + ... + + def floordiv(self, x0: T, x1: T) -> T: + """Python-style floor division between integers only. Computes the + true division of two numbers and floors the result. + """ + ... + + def truediv(self, x0: T, x1: T) -> T: + """True division between floats. Integer inputs are NOT valid: to do + Python style (int, int) -> float division, promote the inputs to float + first.""" + ... + + def div(self, x0: T, x1: T) -> T: + """TODO: to be removed. This renders as / no matter what the backend is + which is incoherent.""" + ... + + def mod(self, x0: T, x1: T) -> T: + """C-style modulus, take sign from LHS (x0).""" + ... + + def remainder(self, x0: T, x1: T) -> T: + """Python-style modulus, take sign from RHS (x1).""" ... 
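The distinction documented above is easy to see numerically: truncdiv/mod follow the C convention (round toward zero, sign from the LHS), while floordiv/remainder follow the Python convention (floor, sign from the RHS):

```python
import math

a, b = -7, 2
print(int(a / b), math.fmod(a, b))  # -3 -1.0  -> C-style: truncdiv / mod
print(a // b, a % b)                # -4 1     -> Python-style: floordiv / remainder
```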
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -578,9 +747,27 @@ def inner(*args): return inner - for name, format_string in itertools.chain( - magic_methods.items(), inplace_methods.items() - ): + for name, format_string in { + "add": "{} + {}", + "sub": "{} - {}", + "mul": "{} * {}", + "floordiv": "{} // {}", + "truediv": "{} / {}", + "mod": "{} % {}", # careful, depending on target semantics varies + "pow": "{} ** {}", + "lshift": "{} << {}", + "rshift": "{} >> {}", + "and_": "{} & {}", + "or_": "{} | {}", + "xor": "{} ^ {}", + "eq": "{} == {}", + "ne": "{} != {}", + "lt": "{} < {}", + "gt": "{} > {}", + "le": "{} <= {}", + "ge": "{} >= {}", + "neg": "-{}", + }.items(): setattr(cls, name, make_handler(format_string)) diff --git a/torch/_inductor/runtime/compile_tasks.py b/torch/_inductor/runtime/compile_tasks.py index 66a36703da45..b29a95f64b6c 100644 --- a/torch/_inductor/runtime/compile_tasks.py +++ b/torch/_inductor/runtime/compile_tasks.py @@ -1,10 +1,10 @@ from __future__ import annotations import functools +import importlib import os import sys import warnings -from types import ModuleType from typing import Any, Callable @@ -31,19 +31,20 @@ def _reload_python_module_in_subproc(key, path): def _reload_python_module(key, path): - with open(path) as f: - try: - code = compile(f.read(), path, "exec") - except Exception as e: - raise RuntimeError( - f"Failed to import {path}\n{type(e).__name__}: {e}" - ) from None - mod = ModuleType(f"{__name__}.{key}") - mod.__file__ = path - mod.key = key # type: ignore[attr-defined] - exec(code, mod.__dict__, mod.__dict__) - sys.modules[mod.__name__] = mod - return mod + spec = importlib.util.spec_from_file_location(f"{__name__}.{key}", path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Failed to import {path}: path not found") + module = importlib.util.module_from_spec(spec) + module.key = key # type: ignore[attr-defined] + try: + spec.loader.exec_module(module) + except Exception as e: + raise RuntimeError( + f"Failed to import {path}\n{type(e).__name__}: {e}" + ) from None + + sys.modules[module.__name__] = module + return module @functools.lru_cache(None) diff --git a/torch/_inductor/scheduler.py b/torch/_inductor/scheduler.py index 98e1b9444526..456e0c50567d 100644 --- a/torch/_inductor/scheduler.py +++ b/torch/_inductor/scheduler.py @@ -336,7 +336,7 @@ def decide_inplace_update(self): isinstance(self, (SchedulerNode,)) and config.inplace_buffers and ( - not isinstance(V.kernel, torch._inductor.codegen.triton.TritonKernel) + not isinstance(V.kernel, torch._inductor.codegen.simd.SIMDKernel) or getattr(V.kernel, "mutations", None) is not None ) ): @@ -390,7 +390,7 @@ def decide_inplace_update(self): ) # mutations not tracked in cpp kernels if isinstance( - V.kernel, torch._inductor.codegen.triton.TritonKernel + V.kernel, torch._inductor.codegen.simd.SIMDKernel ): V.kernel.mutations.add(input_node.get_name()) V.kernel.mutations.add(self.get_name()) @@ -842,9 +842,6 @@ def _get_atomic_add_buffers(self) -> Set[str]: ) return buffers_store_as_atomic_add - def has_atomic_add(self, check_buf): - return check_buf in self._get_atomic_add_buffers() - class FusedSchedulerNode(BaseSchedulerNode): """ @@ -968,15 +965,6 @@ def op_counts(self): op_counts.update(node.op_counts()) return op_counts - def has_atomic_add(self, check_buf): - return any( - ( - isinstance(sub_schedule_node1, SchedulerNode) - and sub_schedule_node1.has_atomic_add(check_buf) - ) - for sub_schedule_node1 in self.get_nodes() - ) - # None of 
these need to be implemented, as a FusedSchedulerNode is just an # abstraction for scheduling purposes def update_mutated_names(self, renames: Dict[str, str]): @@ -1535,13 +1523,22 @@ def add_user(used_by_name, user_node, can_inplace=False, is_weak=False): if (r := unbacked_symbol_to_origin_node[s]) is not None: node.add_fake_dep(StarDep(r)) + if ( + len(node.read_writes.writes) == 1 + and (dep := next(iter(node.read_writes.writes))) + and isinstance(dep, MemoryDep) + ): + node_mode = dep.mode + else: + node_mode = None + # a node will mutate either 0 or 1 buffers assert len(node.get_mutations()) <= 1 for alt_name in node.get_mutations(): alt_name = rename(alt_name) # this node must run after the prior writer add_user(alt_name, node) - node.add_mutation_dep(StarDep(alt_name)) + node.add_mutation_dep(StarDep(alt_name, mode=node_mode)) for other_node in name_to_users[alt_name].items: # this node must run after all prior readers other_name = rename(other_node.get_name()) @@ -2171,25 +2168,6 @@ def can_fuse(self, node1: BaseSchedulerNode, node2: BaseSchedulerNode): why("node1 must go before node2") return False - if ( - isinstance(node1, (FusedSchedulerNode, SchedulerNode)) - and isinstance(node2, SchedulerNode) - and isinstance(node2._body, ir.LoopBody) - ): - # Fix issue: https://github.com/pytorch/pytorch/issues/108963 - # Check: - # If node2 reads a buf which is a mutation buf of node1(SchedulerNode) or among nodes in node1(FusedSchedulerNode), - # we will get the corresponding mutation buf and check if this mutation buf is stored by atomic_add mode. - # If True, we will disable the fusion of node1 and node2. - if any( - ( - node2_used_buf in self.mutation_renames - and node1.has_atomic_add(self.mutation_renames[node2_used_buf]) - ) - for node2_used_buf in node2._body.reads_name2expr.keys() - ): - return False - if node2.is_template(): why("templates can only fuse epilogues") return False @@ -2276,6 +2254,23 @@ def can_fuse_vertical(self, node1, node2): # we still can match unmet dep # if there's indirect indexing, don't match it def fusable_read_and_write(read: Dep, write: Dep): + read_name = self.mutation_renames.get(read.name, read.name) + write_name = self.mutation_renames.get(write.name, write.name) + if ( + isinstance(read, MemoryDep) + and isinstance(write, MemoryDep) + and read.mode == write.mode + and write.mode is not None + ): + return True + if ( + isinstance(read, StarDep) + and isinstance(write, MemoryDep) + and read.mode == write.mode + and write.mode is not None + and read_name == write_name + ): + return True return ( self.mutation_renames.get(read.name, read.name) == write.name and (isinstance(read, MemoryDep) and isinstance(write, MemoryDep)) diff --git a/torch/_inductor/select_algorithm.py b/torch/_inductor/select_algorithm.py index 0bcc166982a1..5cb10e1820cf 100644 --- a/torch/_inductor/select_algorithm.py +++ b/torch/_inductor/select_algorithm.py @@ -6,6 +6,7 @@ import math import operator +import os import sys import textwrap import time @@ -103,7 +104,7 @@ def __init__( prefix_args=0, suffix_args=0, epilogue_fn=identity, - subgraphs=None, + subgraphs: Optional[List[ir.ComputedBuffer]] = None, *, index_dtype, ): @@ -114,7 +115,7 @@ def __init__( ) self.input_nodes = input_nodes self.output_node = output_node - self.named_input_nodes = {} + self.named_input_nodes = {} # type: ignore[var-annotated] self.defines = defines self.kernel_name = kernel_name self.template_mask = None @@ -128,10 +129,10 @@ def __init__( self.prefix_args = prefix_args self.suffix_args = 
suffix_args self.epilogue_fn = epilogue_fn - self.render_hooks = dict() + self.render_hooks = dict() # type: ignore[var-annotated] self.triton_meta: Optional[Dict[str, object]] = None - # For Templated Attention - self.subgraphs = subgraphs + # For Templated Attention this can be a list of ir.Subgraph + self.subgraphs: Optional[List[ir.ComputedBuffer]] = subgraphs def need_numel_args(self): return False @@ -271,19 +272,28 @@ def stride(self, name, index): val = self.named_input_nodes[name].get_stride()[index] return texpr(self.rename_indexing(val)) - def modification(self, **fixed_inputs) -> str: - """This function generates the code body to populate - a 'modification' placeholder within a template + def modification(self, subgraph_number: int, **fixed_inputs) -> str: + """This creates a modification function for a subgraph. + To use this inside a template, the first argument should specify which subgraph to codegen for - TODO come up with standardized way to modify templates, with - potential multiple modifications + Args: + subgraph_number (int): The index of the subgraph in self.subgraphs """ + assert isinstance(subgraph_number, int) + assert isinstance(self.subgraphs, list) + assert subgraph_number < len( + self.subgraphs + ), f"Invalid subgraph number provided to create_modification, {subgraph_number} must be < {len(self.subgraphs)}" + + subgraph = self.subgraphs[subgraph_number] def add_input(name): return self.args.input(name) + name = f"PlaceholderSubstitution_{subgraph_number}" + class PlaceholderSubstitution(V.WrapperHandler): # type: ignore[name-defined] - self.name = "PlaceholderSubstitution" + self.name = name def load(self, name: str, index: sympy.Expr): if name not in fixed_inputs: @@ -297,15 +307,14 @@ def load(self, name: str, index: sympy.Expr): def indirect_indexing(self, index_var, size, check): return sympy_index_symbol(str(index_var)) - # if self.modification_cache is None: with V.set_ops_handler(PlaceholderSubstitution(V.ops)): assert isinstance( - self.subgraphs, ir.ComputedBuffer - ), "Expected the subgraph to be a ComputedBuffer" - if isinstance(self.subgraphs.data, ir.InputBuffer): - out = self.subgraphs.data.make_loader()((1,)) + subgraph, ir.ComputedBuffer + ), f"Expected the subgraph to be a ComputedBuffer, got {type(subgraph)}" + if isinstance(subgraph.data, ir.InputBuffer): + out = subgraph.data.make_loader()((1,)) else: - out = self.subgraphs.data.inner_fn((1,)) + out = subgraph.data.inner_fn((1,)) self.codegen_body() self.body.writeline(f"{fixed_inputs['out']} = {out.value}") @@ -320,11 +329,18 @@ def store_output( indices: Union[List[Any], Tuple[Any]], val: str, mask: Optional[str] = None, + indent_width: int = 4, ): - """ - Hook called from template code to store the final output - (if the buffer hasn't been optimized away), then append any - epilogue fusions. + """Stores the final output and appends any epilogue fusions if the buffer hasn't been optimized away. + + Args: + indices (Union[List, Tuple]): The index for each dimension of the output. The dot product of + these indices and output strides must match `val`. + val (str): The value to store. + mask (Optional[str]): An optional mask to use for the store operation. If provided, this mask + will be applied to the store. + indent_width (int): The number of spaces to use for indentation. This is used when the call to + store_output is indented in the kernel definition. 
""" assert isinstance(indices, (list, tuple)) assert isinstance(val, str) @@ -348,7 +364,7 @@ def store_output( self.range_trees[0].lookup(sympy.Integer(1), sympy_product(lengths)).set_name( "xindex" ) - self.template_mask = mask + self.template_mask = mask # type: ignore[assignment] self.template_indices = indices output_index = self.output_node.get_layout().make_indexer()(index_symbols) output_index = self.rename_indexing(output_index) @@ -373,7 +389,7 @@ def store_output( def hook(): # more stuff might have been added since the codegen_body above self.codegen_body() - return textwrap.indent(self.body.getvalue(), " ").strip() + return textwrap.indent(self.body.getvalue(), " " * indent_width).strip() assert "" not in self.render_hooks self.render_hooks[""] = hook @@ -438,11 +454,8 @@ def indexing( block_ptr=block_ptr, ) - def initialize_range_tree(self, pid_cache): - super().initialize_range_tree(pid_cache) - # ignore default codegen - self.body.clear() - self.indexing_code.clear() + def codegen_range_tree(self): + pass # ignore default codegen def call_kernel(self, name: str, node: Optional[ir.IRNode] = None): wrapper = V.graph.wrapper_code @@ -696,19 +709,17 @@ def __init__( has_out_variant=True, op_overload=None, use_fallback_kernel=False, - kernel_creator=None, ): super().__init__() name = name or kernel.__name__ assert callable(kernel) - assert not hasattr(extern_kernels, name), f"duplicate extern kernel: {name}" + assert not hasattr(extern_kernels, name), "duplicate extern kernel" self.name = name self.cpp_kernel_name = cpp_kernel self.has_out_variant = has_out_variant setattr(extern_kernels, name, kernel) self.op_overload = op_overload self.use_fallback_kernel = use_fallback_kernel - self.kernel_creator = kernel_creator def to_callable(self): return getattr(extern_kernels, self.name) @@ -875,8 +886,6 @@ def output_node(self): inner = ir.FallbackKernel.create( self.choice.op_overload, *self.input_nodes, **self.kwargs ) - elif self.choice.kernel_creator is not None: - inner = self.choice.kernel_creator(*self.input_nodes, **self.kwargs) else: cls = ir.ExternKernelOut if self.has_out_variant else ir.ExternKernelAlloc inner = cls( @@ -899,86 +908,6 @@ def info_dict(self) -> Dict[str, Union[PrimitiveInfoType, List[PrimitiveInfoType } -class DataProcessorChoiceCallerWrapper: - def __init__(self, wrapped, preprocessor, postprocessor): - self._wrapped = wrapped - if preprocessor is not None: - self._preprocessor = preprocessor - else: - self._preprocessor = lambda x, y: (x, y) - if postprocessor is not None: - self._postprocessor = postprocessor - else: - self._postprocessor = lambda x: x - - def __getattr__(self, name): - return getattr(self._wrapped, name) - - def benchmark(self, *args, out) -> float: - new_args, new_out = self._preprocessor(args, out) - result = self._wrapped.benchmark(*new_args, out=new_out) - new_out = self._postprocessor(new_out) - if out is not new_out: - out.copy_(new_out) - return result - - def output_node(self) -> ir.TensorBox: - result = self._wrapped.output_node() - return self._postprocessor(result) - - def __repr__(self) -> str: - return f"DataProcessorChoiceCallerWrapper({self._wrapped})" - - -class DataProcessorTemplateWrapper: - """ - A wrapper class for a kernel template. - - This class together with `DataProcessorChoiceCallerWrapper` provides a convenient way to - preprocess and postprocess data before and after using the wrapped template. 
A typical - usage is to reorder or filter the input nodes in order to match the expected input of other - kernel choices like a ATen kernel. A more complicated usage is to prepack the weights. - See the example from :mod:`cpp_gemm_template` for more details. - """ - - def __init__( - self, - wrapped_template_cls, - preprocessor, - postprocessor, - **kwargs, - ): - if preprocessor is not None: - self._preprocessor = preprocessor - else: - self._preprocessor = lambda x, y: (x, y) - if postprocessor is not None: - self._postprocessor = postprocessor - else: - self._postprocessor = lambda x: x - assert "input_nodes" in kwargs - assert "layout" in kwargs - kwargs["input_nodes"], kwargs["layout"] = preprocessor( - kwargs["input_nodes"], kwargs["layout"] - ) - self._wrapped = wrapped_template_cls(**kwargs) - - def __getattr__(self, name): - return getattr(self._wrapped, name) - - def maybe_append_choice(self, choices, **kwargs): - return type(self._wrapped).maybe_append_choice(self, choices, **kwargs) - - def generate(self, **kwargs): - choice_caller = self._wrapped.generate(**kwargs) - return DataProcessorChoiceCallerWrapper( - choice_caller, self._preprocessor, self._postprocessor - ) - - def __repr__(self) -> str: - return f"DataProcessorTemplateWrapper({self._wrapped})" - - class ErrorFromChoice(RuntimeError): def __init__(self, msg, choice: ChoiceCaller, inputs_str): msg += f"\nFrom choice {choice}\n{inputs_str}" @@ -990,6 +919,13 @@ class NoValidChoicesError(RuntimeError): pass +@functools.lru_cache(None) +def get_env_num_workers() -> Optional[int]: + if "TORCHINDUCTOR_COMPILE_THREADS" in os.environ: + return int(os.environ["TORCHINDUCTOR_COMPILE_THREADS"]) + return None + + class AlgorithmSelectorCache(PersistentCache): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1054,11 +990,10 @@ def no_op(*args, **kwargs): or precompilation_timeout_seconds <= 0 ): return no_op - num_workers = min( - config.compile_threads, - torch.get_num_threads(), - len(choices), - ) + + env_workers = get_env_num_workers() + num_workers = env_workers if env_workers is not None else (len(choices)) + if num_workers <= 0: return no_op @@ -1150,7 +1085,7 @@ def wait_on_futures(): else: raise e except ImportError: - raise e + raise e from None executor.shutdown(wait=True) @@ -1257,9 +1192,7 @@ def get_inputs(): } example_inputs = list(unique_example_inputs.values()) example_inputs_extern = [ - unique_example_inputs[input_node.get_name()] - if unique_example_inputs[input_node.get_name()].is_mkldnn - else torch.as_strided( + torch.as_strided( unique_example_inputs[input_node.get_name()], V.graph.sizevars.size_hints( input_node.get_size(), @@ -1348,7 +1281,7 @@ def benchmark_in_current_process(choices): ) timing = float("inf") except AssertionError as e: - raise AssertionError( # noqa: TRY200 + raise AssertionError( # noqa: B904 f"Incorrect result from choice {choice}\n\n{e}" ) except Exception as e: @@ -1361,7 +1294,7 @@ def benchmark_in_current_process(choices): else: raise e except ImportError: - raise e + raise e from None timings[choice] = timing @@ -1420,7 +1353,7 @@ def log_results( result = timings[choice] if result: sys.stderr.write( - f" {choice.name} {result:.4f} ms {best_time/result:.1%}\n" + f" {choice.name} {result:.4f} ms {best_time / result:.1%}\n" ) else: sys.stderr.write( @@ -1499,7 +1432,7 @@ def autotune_select_algorithm(*args, **kwargs): if "return_multi_template" not in kwargs: kwargs[ "return_multi_template" - ] = torch._inductor.config.benchmark_multi_templates + ] = 
torch._inductor.config.benchmark_epilogue_fusion return _ALGORITHM_SELECTOR_CACHE(*args, **kwargs) diff --git a/torch/_inductor/sizevars.py b/torch/_inductor/sizevars.py index 65a9cb837907..b6288b34fafa 100644 --- a/torch/_inductor/sizevars.py +++ b/torch/_inductor/sizevars.py @@ -310,6 +310,14 @@ def statically_known_leq(self, left: Expr, right: Expr) -> bool: expr = left <= right return self.is_expr_static_and_true(expr) + # See Note - [On Statically Known] + def statically_known_geq(self, left: Expr, right: Expr) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left is greater than or equal to right. + """ + expr = left >= right + return self.is_expr_static_and_true(expr) + # See Note - [On Statically Known] def statically_known_lt(self, left: Expr, right: Expr) -> bool: """ @@ -318,6 +326,14 @@ def statically_known_lt(self, left: Expr, right: Expr) -> bool: expr = left < right return self.is_expr_static_and_true(expr) + # See Note - [On Statically Known] + def statically_known_gt(self, left: Expr, right: Expr) -> bool: + """ + Returns a bool indicating if it is sound to optimize as if left is greater than right. + """ + expr = left > right + return self.is_expr_static_and_true(expr) + # See Note - [On Statically Known] def statically_known_multiple_of(self, numerator: Expr, denominator: Expr) -> bool: """ diff --git a/torch/_inductor/utils.py b/torch/_inductor/utils.py index 3e219249e6ac..59baad51885e 100644 --- a/torch/_inductor/utils.py +++ b/torch/_inductor/utils.py @@ -339,7 +339,7 @@ def print_performance( ): timings = torch.tensor([timed(fn, args, times, device) for _ in range(repeat)]) took = torch.median(timings) / times - print(f"{took/baseline:.6f}") + print(f"{took / baseline:.6f}") return took @@ -726,6 +726,8 @@ def fresh_inductor_cache(cache_entries=None): except Exception: log.warning("on error, temporary cache dir kept at %s", inductor_cache_dir) raise + finally: + clear_inductor_caches() def argsort(seq) -> List[int]: @@ -985,42 +987,6 @@ def use_cutlass_template(layout, m, n, k): return res -def _use_template_for_cpu(layout): - return use_max_autotune() and layout.device.type == "cpu" - - -def use_cpp_packed_gemm_template(layout, mat1, mat2): - from . import ir - from .codegen.cpp_micro_gemm import create_micro_gemm - from .kernel.mm_common import mm_args - - if not _use_template_for_cpu(layout) or not _use_autotune_backend("CPP"): - return False - - if not config.cpp.weight_prepack: - return False - - layout_dtypes = [torch.float32] - m, n, k, layout, mat1, mat2 = mm_args(mat1, mat2) - # TODO(jgong5): support dynamic shapes for n or k - if has_free_symbols((n, k)): - return False - if isinstance(mat2, ir.BaseView): - mat2 = mat2.unwrap_view() - micro_gemm = create_micro_gemm( - "micro_gemm", m, n, k, layout.dtype, num_threads=parallel_num_threads() - ) - # TODO(jgong5): support n % n_block_size != 0 - return ( - layout.dtype in layout_dtypes - and micro_gemm is not None - and n % micro_gemm.register_blocking[1] == 0 - and mat1.get_stride()[-1] == 1 # TODO(jgong5): support transposed input - and isinstance(mat2, ir.StorageBox) - and mat2.is_module_buffer() - ) - - def use_aten_gemm_kernels(): return not use_max_autotune() or _use_autotune_backend("ATEN") @@ -1490,7 +1456,7 @@ def dump_node_schedule(node_schedule): An API that can be used in pdb to dump a node_schedule. Right mainly dump the read/write dependencies but can add more as needed. 
""" - from torch._inductor.codegen.triton import DisableReduction, EnableReduction + from torch._inductor.codegen.simd import DisableReduction, EnableReduction from torch._inductor.scheduler import SchedulerNode print(f"Node schedule with {len(node_schedule)} nodes") @@ -1529,7 +1495,7 @@ def should_assume_input_aligned(example_input: torch.Tensor): # See Note: [Input Alignment handling in Inductor] # right now, we only care about alignment for cuda tensors. - if example_input.device.type != "cuda": + if not is_gpu(example_input.device.type): return False return config.assume_aligned_inputs or tensor_is_aligned(example_input) @@ -1612,16 +1578,23 @@ def aoti_compile_with_persistent_cache( """ Compile the given function with persistent cache for AOTI eager mode. """ - flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs) - assert all( - isinstance(input, torch.Tensor) for input in flattened_inputs - ), "Only support tensor for now" assert not dynamic, "Only support static shape for now" + type_to_torch_dtype = {int: torch.int32, float: torch.float, bool: torch.bool} + supported_scalar_types = tuple(type_to_torch_dtype.keys()) + flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs) + if not all( + isinstance(input, (supported_scalar_types, torch.Tensor)) + for input in flattened_inputs + ): + raise NotImplementedError("Only support tensor, int, float, bool for now") persistent_cache = aoti_eager_cache_dir(ns, device_type) - persistent_cache.mkdir(parents=True, exist_ok=True) + if not persistent_cache.exists(): + persistent_cache.mkdir(parents=True) + persistent_cache_lib = persistent_cache / "lib" - persistent_cache_lib.mkdir(parents=True, exist_ok=True) + if not persistent_cache_lib.exists(): + persistent_cache_lib.mkdir() with mock.patch.dict( os.environ, @@ -1636,21 +1609,37 @@ def aoti_compile_with_persistent_cache( options=options, remove_runtime_assertions=remove_runtime_assertions, disable_constraint_solver=disable_constraint_solver, + # Some operations may have non-Tensor parameters like int, float, bool. These + # non-Tensor parameters will not be the input of the graph. Therefore, we do + # need to keep the same signature. 
+ same_signature=False, ) kernel_metadata_items = [] - for input_tensor in flattened_inputs: + for input in flattened_inputs: # TODO(Eikan): To add dynamic support metadata: Dict[str, Any] = {} metadata["is_dynamic"] = dynamic - metadata["device_type"] = f"{input_tensor.device.type}" - if is_cpu_device([input_tensor]): - metadata["device_index"] = -1 + + if isinstance(input, torch.Tensor): + metadata["device_type"] = f"{input.device.type}" + if is_cpu_device([input]): + metadata["device_index"] = -1 + else: + metadata["device_index"] = input.device.index + metadata["dtype"] = f"{input.dtype}" + metadata["sizes"] = list(input.size()) + metadata["strides"] = list(input.stride()) else: - metadata["device_index"] = input_tensor.device.index - metadata["dtype"] = f"{input_tensor.dtype}" - metadata["sizes"] = list(input_tensor.size()) - metadata["strides"] = list(input_tensor.stride()) + assert isinstance(input, supported_scalar_types) + # Scalar input + metadata["device_type"] = device_type + metadata["device_index"] = -1 if device_type == "cpu" else 0 + metadata["dtype"] = f"{type_to_torch_dtype[type(input)]}" + metadata["sizes"] = [] + metadata["strides"] = [] + metadata["scalar_value"] = input + kernel_metadata_items.append(metadata) kernel_meta_info: Dict[str, Any] = {} @@ -1686,3 +1675,26 @@ return kernel_lib_path except Exception as e: return "" + + +def run_and_get_cpp_code(fn, *args, **kwargs): + # We use the patch context manager instead of using it as a decorator. + # In this way, we can ensure that the attribute is patched and unpatched correctly + # even if this run_and_get_cpp_code function is called multiple times. + with unittest.mock.patch.object(config, "debug", True): + torch._dynamo.reset() + import io + import logging + + log_capture_string = io.StringIO() + ch = logging.StreamHandler(log_capture_string) + from torch._inductor.graph import output_code_log + + output_code_log.addHandler(ch) + prev_level = output_code_log.level + output_code_log.setLevel(logging.DEBUG) + result = fn(*args, **kwargs) + s = log_capture_string.getvalue() + output_code_log.setLevel(prev_level) + output_code_log.removeHandler(ch) + return result, s
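A plausible usage of the run_and_get_cpp_code helper added above (assuming a CPU tensor input so the C++ backend is exercised): compile a function and capture the generated source from the output_code logger.

```python
import torch
from torch._inductor.utils import run_and_get_cpp_code

def f(x):
    return x.sin() + 1

result, cpp_code = run_and_get_cpp_code(torch.compile(f), torch.randn(8))
print(cpp_code[:200])  # start of the generated C++ source
```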
diff --git a/torch/_library/fake_class_registry.py b/torch/_library/fake_class_registry.py index 32a9aa8c8711..d77989cd829b 100644 --- a/torch/_library/fake_class_registry.py +++ b/torch/_library/fake_class_registry.py @@ -134,8 +134,10 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal] returns an instance of the fake class. All tensors in the fake object should also be properly fakified with to_fake_tensor() in from_real. + Examples: # For a custom class Foo defined in test_custom_class_registration.cpp: + TORCH_LIBRARY(_TorchScriptTesting, m) { m.class_("_TensorQueue") .def(torch::init()) @@ -144,6 +146,7 @@ def register_fake_class(qualname, fake_class: Optional[HasStaticMethodFromReal] .def("top", &TensorQueue::top) .def("size", &TensorQueue::size) .def("clone_queue", &TensorQueue::clone_queue) + .def("__obj_flatten__", &TensorQueue::__obj_flatten__) .def_pickle( // __getstate__ [](const c10::intrusive_ptr& self) @@ -166,8 +169,7 @@ def __init__(self, queue): @classmethod def __obj_unflatten__(cls, flattened_ctx): - ctx = {flattened_ctx[0]: flattened_ctx[1]} - return cls(**ctx) + return cls(**dict(flattened_ctx)) def push(self, x): self.queue.append(x) @@ -178,6 +180,11 @@ def pop(self): def size(self): return len(self.queue) + In this example, the original TensorQueue needs to add a __obj_flatten__ method + to the class TensorQueue, and the flattened result is passed into FakeTensorQueue's + __obj_unflatten__ as inputs to create a fake class. This protocol allows PyTorch to look + at the contents of the script object and properly handle them in subsystems + like dynamo, aot_autograd, and more. """ def inner(fake_class: HasStaticMethodFromReal): diff --git a/torch/_logging/_registrations.py b/torch/_logging/_registrations.py index 0530c12df304..10463b864f44 100644 --- a/torch/_logging/_registrations.py +++ b/torch/_logging/_registrations.py @@ -26,12 +26,13 @@ register_log("torch", "torch") register_log("distributed", DISTRIBUTED) register_log( - "dist_c10d", ["torch.distributed.distributed_c10d", "torch.distributed.rendezvous"] + "c10d", ["torch.distributed.distributed_c10d", "torch.distributed.rendezvous"] ) register_log( - "dist_ddp", ["torch.nn.parallel.distributed", "torch._dynamo.backends.distributed"] + "ddp", ["torch.nn.parallel.distributed", "torch._dynamo.backends.distributed"] ) -register_log("dist_fsdp", ["torch.distributed.fsdp"]) +register_log("pp", ["torch.distributed.pipelining"]) +register_log("fsdp", ["torch.distributed.fsdp"]) register_log("onnx", "torch.onnx") register_log("export", ["torch._dynamo", "torch.export", *DYNAMIC]) diff --git a/torch/_meta_registrations.py b/torch/_meta_registrations.py index a7e41de58c5d..93e45bfb1d84 100644 --- a/torch/_meta_registrations.py +++ b/torch/_meta_registrations.py @@ -19,6 +19,7 @@ corresponding_real_dtype, elementwise_dtypes, ELEMENTWISE_TYPE_PROMOTION_KIND, + FloatLike, IntLike, make_contiguous_strides_for, Number, @@ -3286,6 +3287,15 @@ def _meta_foreach_inplace(*args, _scalar_op=None, **kwargs): return +@register_meta([aten._foreach_pow_.Scalar]) +def meta__foreach_pow__scalar(self, exponent): + torch._check( + isinstance(exponent, FloatLike), + lambda: f"exponent must be a float but got {type(exponent)}", + ) + return + + @register_meta([aten._foreach_pow.ScalarAndTensor]) def meta__foreach_pow_scalar_and_tensor(self, exponent): # Only foreach_pow has a ScalarAndTensor method and needs special @@ -5485,6 +5495,8 @@ def meta__flash_attention_backward( philox_seed: Tensor, philox_offset: Tensor, scale: Optional[float] = None, + window_size_left: Optional[int] = None, + window_size_right: Optional[int] = None, ): grad_query = torch.empty_like(query) grad_key = torch.empty_like(key) diff --git a/torch/_numpy/_util.py b/torch/_numpy/_util.py index ff219d930731..477d3d44671a 100644 --- a/torch/_numpy/_util.py +++ b/torch/_numpy/_util.py @@ -178,7 +178,7 @@ def _try_convert_to_tensor(obj): tensor = torch.as_tensor(obj) except Exception as e: mesg =
f"failed to convert {obj} to ndarray. \nInternal error is: {str(e)}." - raise NotImplementedError(mesg) # noqa: TRY200 + raise NotImplementedError(mesg) # noqa: B904 return tensor diff --git a/torch/_numpy/linalg.py b/torch/_numpy/linalg.py index 2232419db1b2..093851142dbc 100644 --- a/torch/_numpy/linalg.py +++ b/torch/_numpy/linalg.py @@ -38,7 +38,7 @@ def wrapped(*args, **kwds): try: return func(*args, **kwds) except torch._C._LinAlgError as e: - raise LinAlgError(*e.args) # noqa: TRY200 + raise LinAlgError(*e.args) # noqa: B904 return wrapped diff --git a/torch/_numpy/testing/utils.py b/torch/_numpy/testing/utils.py index cd3d3407f582..f757860e1218 100644 --- a/torch/_numpy/testing/utils.py +++ b/torch/_numpy/testing/utils.py @@ -247,7 +247,7 @@ def assert_equal(actual, desired, err_msg="", verbose=True): assert_equal(actualr, desiredr) assert_equal(actuali, desiredi) except AssertionError: - raise AssertionError(msg) # noqa: TRY200 + raise AssertionError(msg) # noqa: B904 # isscalar test to check cases such as [np.nan] != np.nan if isscalar(desired) != isscalar(actual): @@ -279,7 +279,7 @@ def assert_equal(actual, desired, err_msg="", verbose=True): except (DeprecationWarning, FutureWarning) as e: # this handles the case when the two types are not even comparable if "elementwise == comparison" in e.args[0]: - raise AssertionError(msg) # noqa: TRY200 + raise AssertionError(msg) # noqa: B904 else: raise @@ -426,7 +426,7 @@ def _build_err_msg(): assert_almost_equal(actualr, desiredr, decimal=decimal) assert_almost_equal(actuali, desiredi, decimal=decimal) except AssertionError: - raise AssertionError(_build_err_msg()) # noqa: TRY200 + raise AssertionError(_build_err_msg()) # noqa: B904 if isinstance(actual, (ndarray, tuple, list)) or isinstance( desired, (ndarray, tuple, list) @@ -726,7 +726,7 @@ def func_assert_same_pos(x, y, func=isnan, hasval="nan"): names=("x", "y"), precision=precision, ) - raise ValueError(msg) # noqa: TRY200 + raise ValueError(msg) # noqa: B904 def assert_array_equal(x, y, err_msg="", verbose=True, *, strict=False): @@ -2272,7 +2272,7 @@ def check_free_memory(free_bytes): try: mem_free = _parse_size(env_value) except ValueError as exc: - raise ValueError( # noqa: TRY200 + raise ValueError( # noqa: B904 f"Invalid environment variable {env_var}: {exc}" ) diff --git a/torch/_refs/__init__.py b/torch/_refs/__init__.py index ac6a60d0078c..68675c751736 100644 --- a/torch/_refs/__init__.py +++ b/torch/_refs/__init__.py @@ -3830,7 +3830,7 @@ def _check_stack_inputs(tensors: TensorSequenceType) -> None: entry_shape = tensors[0].shape for i in range(1, len(tensors)): assert tensors[i].shape == entry_shape, ( - f"stack expects each tensor to be equal size, but got {entry_shape} at entry 0" + f"stack expects each tensor to be equal size, but got {entry_shape} at entry 0 " f"and {tensors[i].shape} at entry {i}" ) @@ -6298,7 +6298,7 @@ def _compute_sizes(seq, scalar_type): try: handle = seq[0] except Exception: - raise ValueError( # noqa: TRY200 + raise ValueError( # noqa: B904 f"could not determine the shape of object type '{type(seq).__name__}'" ) seq = handle @@ -6358,12 +6358,24 @@ def _infer_scalar_type(obj): # Analogous to recursive_store # xref: recursive_store in torch/csrc/utils/tensor_new.cpp -def _recursive_build(scalarType: torch.dtype, obj: TensorOrNumberLikeType): - if isinstance(obj, Tensor) and obj.ndim <= 1: +def _recursive_build( + scalarType: torch.dtype, obj: Union[TensorOrNumberLikeType, TensorSequenceType] +): + if isinstance(obj, Tensor) and obj.numel() 
== 1: return obj.detach().to(dtype=scalarType, device="cpu", copy=True).view(()) + elif isinstance(obj, Tensor): + # It is invalid to call ".tensor([...])" with a non-scalar tensor in eager mode + # >>> torch.tensor([torch.randn(2)]) + # ValueError: only one element tensors can be converted to Python scalars + # + # But it is possible with a NumPy array + # >>> torch.tensor([np.random.uniform(size=(2,))]).shape + # torch.Size([1, 2]) + return obj.detach().to(dtype=scalarType, device="cpu", copy=True) elif isinstance(obj, Number): return torch.scalar_tensor(obj, dtype=scalarType) + # seq can be a list of tensors seq = obj return torch.stack([_recursive_build(scalarType, item) for item in seq]) diff --git a/torch/_streambase.py b/torch/_streambase.py index 5a0df2c22ba9..b06946523fa3 100644 --- a/torch/_streambase.py +++ b/torch/_streambase.py @@ -5,27 +5,27 @@ class _StreamBase(ABC): r"""Base stream class abstraction for multi backends Stream to herit from""" @abstractmethod - def wait_event(self, event): + def wait_event(self, event) -> None: raise NotImplementedError @abstractmethod - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: raise NotImplementedError @abstractmethod - def record_event(self, event=None): + def record_event(self, event=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError @abstractmethod - def __eq__(self, stream): + def __eq__(self, stream) -> bool: raise NotImplementedError @@ -33,13 +33,13 @@ class _EventBase(ABC): r"""Base Event class abstraction for multi backends Event to herit from""" @abstractmethod - def wait(self, stream=None): + def wait(self, stream=None) -> None: raise NotImplementedError @abstractmethod - def query(self): + def query(self) -> bool: raise NotImplementedError @abstractmethod - def synchronize(self): + def synchronize(self) -> None: raise NotImplementedError diff --git a/torch/_subclasses/fake_tensor.py b/torch/_subclasses/fake_tensor.py index 3fdc4fc01e6b..79c8e951edfc 100644 --- a/torch/_subclasses/fake_tensor.py +++ b/torch/_subclasses/fake_tensor.py @@ -215,8 +215,8 @@ def tensor_memo(self): meta_converter: MetaConverter constant_storage_mapping: Dict[StorageWeakRef, List[ReferenceType]] - def __init__(self): - self.meta_converter = MetaConverter() + def __init__(self, *, copy_data=False): + self.meta_converter = MetaConverter(copy_data=copy_data) # map from to storage to corresponding constant tensors self.constant_storage_mapping = {} @@ -294,8 +294,6 @@ def from_real_tensor( assert not make_constant def mk_fake_tensor(make_meta_t): - from torch._dynamo.utils import clone_input - # NB: don't use in_kernel_invocation_manager. to # ensure FakeTensor can internally do constant computation # as necessary. Invocation manager is "more correct" as @@ -311,18 +309,6 @@ def mk_fake_tensor(make_meta_t): # TODO: callback might be used in recursive contexts, in # which case using t is wrong! BUG! constant=t if make_constant else None, - # TODO: This won't preserve aliasing relationships, so if - # there is mutation you won't see it reflect elsewhere. 
- # This is fine because propagate_real_tensors isn't - intended to give you exact results and some inaccuracy - is OK, although if its use case expands we would want to - do something similar to meta converter, but poking in - real tensors at the storage cloning phase - real_tensor=( - (t if make_constant else clone_input(t)) - if fake_mode.propagate_real_tensors - else None - ), ) out = self.meta_converter( @@ -870,23 +856,26 @@ def __init__( ): log.debug("create_mode 0x%x", id(self)) self.allow_fallback_kernels = allow_fallback_kernels - self.fake_tensor_converter = FakeTensorConverter() + + import torch._dynamo.config + import torch._functorch.config + + self.propagate_real_tensors = ( + torch._functorch.config.fake_tensor_propagate_real_tensors + ) + self.fake_tensor_converter = FakeTensorConverter( + copy_data=self.propagate_real_tensors + ) + if static_shapes is not None: self.static_shapes = static_shapes else: self.static_shapes = shape_env is None - import torch._dynamo.config - import torch._functorch.config - # This is temporarily patched to True in Dynamo to grandfather in some # places where we unconditionally allow scalar outputs, TO BE REMOVED self.allow_scalar_outputs = False - self.propagate_real_tensors = ( - torch._functorch.config.fake_tensor_propagate_real_tensors - ) - self._allow_unsafe_data_ptr_access = ( torch._functorch.config.fake_tensor_allow_unsafe_data_ptr_access ) @@ -1552,7 +1541,7 @@ def maybe_to_real_tensor(t): func, flat_arg_fake_tensors, flat_args, - self.shape_env.unbacked_var_to_val, + self.shape_env.unbacked_var_to_val if self.shape_env else None, ) def maybe_propagate_real_tensors(fake_out): diff --git a/torch/_subclasses/functional_tensor.py b/torch/_subclasses/functional_tensor.py index 1762059eedf2..dfef5951ab26 100644 --- a/torch/_subclasses/functional_tensor.py +++ b/torch/_subclasses/functional_tensor.py @@ -17,6 +17,27 @@ not_implemented_log = torch._logging.getArtifactLogger(__name__, "not_implemented") +# NOTE Some special handling for tensor conversion during export is needed. +# Normally, when tracing through the model with tensor.to(), the maybe-aliasing +# relationship between input and output tensors will be baked into the graph. +# For example, if we got a tensor with device cpu and call tensor.to("cpu"), +# it will become a no-op in the graph. For a whole-graph capture, this is not +# sound, so we need to do something different. Instead, in export we will try to +# preserve the tensor conversion by forcing a non-semantic-breaking aten::_to_copy +# operator to be traced in the graph, and subsequently banning mutations on all +# such converted tensors. +# In addition to patching the .to() method call in functionalization, we also have to +# patch other similar methods like float() and cpu(), because they intentionally +# don't fall back to .to(), yet have the same behavior as .to() according to +# the PyTorch documentation: https://pytorch.org/docs/stable/generated/torch.Tensor.float.html +# Thus we simply force them to go through the .to() call. 
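+# For example, in eager mode .to() on a same-device, same-dtype tensor returns
+# the input object itself (x.to("cpu") is x for a cpu tensor), so tracing it
+# as-is would bake that aliasing into the exported graph.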
+def _conversion_method_template(**extra_kwargs): + def _(self, *args, **kwargs): + return self.to(*args, **{**kwargs, **extra_kwargs}) + + return _ + + class FunctionalTensor(torch.Tensor): """ Functional tensors represent tensors that will remove mutations @@ -225,6 +246,24 @@ def to(self, *args, **kwargs): return super().to(*args, **{**kwargs, "copy": True}) return super().to(*args, **kwargs) + def cuda(self, device=None, *args, **kwargs): + device = device or torch.cuda.current_device() + if len(args) > 0: + return self.to(device, *args, **kwargs) + else: + return self.to(device=device, **kwargs) + + char = _conversion_method_template(dtype=torch.int8) + cpu = _conversion_method_template(device=torch.device("cpu")) + bfloat16 = _conversion_method_template(dtype=torch.bfloat16) + byte = _conversion_method_template(dtype=torch.uint8) + double = _conversion_method_template(dtype=torch.float64) + float = _conversion_method_template(dtype=torch.float32) + bool = _conversion_method_template(dtype=torch.bool) + half = _conversion_method_template(dtype=torch.float16) + int = _conversion_method_template(dtype=torch.int32) + long = _conversion_method_template(dtype=torch.int64) + class FunctionalTensorMode(TorchDispatchMode): def __init__(self, pre_dispatch=False, export=False, _allow_token_discovery=False): diff --git a/torch/_subclasses/meta_utils.py b/torch/_subclasses/meta_utils.py index c674120a22ff..5dd5e4b8944f 100644 --- a/torch/_subclasses/meta_utils.py +++ b/torch/_subclasses/meta_utils.py @@ -34,6 +34,7 @@ maybe_get_level, peek_interpreter_stack, ) +from torch.utils._mode_utils import no_dispatch from torch.utils._python_dispatch import is_traceable_wrapper_subclass from torch.utils.weak import WeakIdKeyDictionary @@ -153,13 +154,14 @@ class MetaTensorDescriber: the same ID when we see the same tensor/storage. """ - def __init__(self): + def __init__(self, *, copy_data=False): self.next_tensor_id: MetaTensorId = 0 self.next_storage_id: MetaStorageId = 0 # Tensor -> int self.lookup_tensor = WeakIdKeyDictionary() # Storage -> int self.lookup_storage = WeakIdKeyDictionary() + self.copy_data = copy_data def get_tensor_id(self, t: torch.Tensor): if t not in self.lookup_tensor: @@ -180,6 +182,9 @@ def describe_storage(self, s: torch.UntypedStorage): return MetaStorageDesc( id=self.get_storage_id(s), size=s.size(), + # NB: We don't do the copy yet; copy happens when we start + # creating the new storages + data=s if self.copy_data else None, ) def describe_tensor(self, t: torch.Tensor, recurse: bool = True): @@ -354,6 +359,7 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): functorch_stack=maybe_functorch_stack, autograd_meta_from=autograd_meta_from, current_level=current_level, + data=t if self.copy_data else None, ) @@ -361,6 +367,9 @@ def describe_tensor(self, t: torch.Tensor, recurse: bool = True): class MetaStorageDesc: id: MetaStorageId size: int + # NB: this is only populated with copy_data True, it is not directly + # serializable in JSON, you want to do something special here anyway + data: Optional[torch.UntypedStorage] @dataclass(frozen=True) @@ -388,7 +397,9 @@ class MetaTensorDesc: # NB: Sometimes, size, stride and storage_offset contain SymInt, in which # case this is NOT serializable. That only happens when you're # re-fakeifying a fake tensor with an existing ShapeEnv... maybe we - # can get rid of this use case entirely + # can get rid of this use case entirely. 
Notably, even if we are + # fakeifying a real tensor into a fake tensor with symbolic shapes, the + # size here is NOT dynamic # NB: size could potentially be None as you can override it and make it # throw an error, but we don't currently have any subclasses that do this # except C++ nested tensor but we're going to have nested int to make this @@ -434,6 +445,11 @@ class MetaTensorDesc: functorch_stack: Optional[List[CInterpreter]] = None autograd_meta_from: Optional[torch.Tensor] = None + # This is only populated on copy_data, and typically is not used at all, + # except for some of our meta-ification paths that don't properly use + # storage (pro-tip: you should use storage) + data: Optional[torch.Tensor] = None + # Faithfully serializing functorch tensors will not be too difficult. # We only need to consider grad/vmap interpreters, and their internal # state is only bools (mostly what the grad enabled/disabled state @@ -449,6 +465,30 @@ def shape(self): return self.size +# A more faithful reproduction would do a copy on the entire +# storage, but this needs to be done carefully because the +# underlying storage could have larger extent than is implied +# by size/stride. The real fix is to properly call +# meta_storage recursively here. +# +# These "safe" functions are intended to be used under no_dispatch() mode. +# The no_dispatch() here is intended to prevent ambient fake tensor mode from +# fakeifying the operation. But if we are given an honest to goodness +# FakeTensor as src, we MUST NOT run the copy/clone operation. A better way +# to do this would be to not use no_dispatch and instead just disable fake +# tensor mode only (allowing for subclass dispatch to occur) +def _safe_copy(dst, src): + if type(src) is not torch.Tensor: + return + dst.copy_(src) + + +def _safe_clone(src): + if type(src) is not torch.Tensor: + return None + return src.clone() + + # This is a class for converting multiple tensors into meta tensors which # share the same view/storage structure. The operation model is you allocate # one of these, and then call it repeatedly on all the tensors you want to @@ -457,7 +497,7 @@ def shape(self): # meta storages. This class will hold weak references to cached tenosrs # and tensor storages. class MetaConverter: - def __init__(self): + def __init__(self, *, copy_data: bool = False): # Maps MetaStorageId to UntypedStorage self.storage_memo: weakref.WeakValueDictionary = weakref.WeakValueDictionary() # Maps MetaTensorId to torch.Tensor (typically a meta tensor or @@ -467,7 +507,12 @@ def __init__(self): self.miss = 0 self.del_hook = None self.arg_cnt = 0 - self.describer = MetaTensorDescriber() + # Ensures real_storage/real_tensor are populated on the resulting + # metaified storage/tensor. The naming of this attribute is load + # bearing: FakeTensor relies on real tensor being set to exactly this + # value + self.copy_data = copy_data + self.describer = MetaTensorDescriber(copy_data=copy_data) def successful(self): return self.hit > 0 and self.miss == 0 @@ -489,8 +534,14 @@ def meta_storage(self, s: MetaStorageDesc, callback): # Need to make sure to resize the meta storage too. 
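# The memo is keyed by storage id: tensors that view the same real storage must
# share one meta storage so that aliasing relationships survive the conversion.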
if self.get_storage_memo(s) is None: r_s = callback( - lambda: torch.empty(s.size, dtype=torch.uint8, device="meta") + lambda: torch.empty(s.size, dtype=torch.uint8, device="meta"), ).untyped_storage() + if self.copy_data: + # NB: no_dispatch is needed because internally storage copy is + # implemented as Tensor operations + with torch.no_grad(), no_dispatch(): + assert s.data is not None + r_s.real_storage = s.data.clone() self.set_storage_memo(s, r_s) return r_s else: @@ -640,8 +691,8 @@ def empty_create_subclass( outer_size = outer_size if outer_size is not None else t.size outer_stride = outer_stride if outer_stride is not None else t.stride - transformed_tensors_dict = { - attr: callback( + def transform(attr, inner_t): + r = callback( lambda: empty_create( inner_t, AttrSource(source, attr), @@ -652,7 +703,20 @@ def empty_create_subclass( ), ) ) - for attr, inner_t in t.attrs.items() + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( + inner_t.size, + inner_t.stride, + dtype=inner_t.dtype, + device=inner_t.device, + ) + assert inner_t.data is not None + _safe_copy(r.real_tensor, inner_t.data) + return r + + transformed_tensors_dict = { + attr: transform(attr, inner_t) for attr, inner_t in t.attrs.items() } sub = t.type.__tensor_unflatten__( @@ -892,6 +956,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray that sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = _safe_clone(t.data) assert safe_is_leaf(r), "the callback you passed in doesn't detach" # Note [is_coalesced is dispatched] # Strangely enough, is_coalesced() is a dispatched operator, @@ -939,6 +1008,11 @@ def tensor_visitor_fn( device="meta", ) ) + if self.copy_data: + # Pray sparse clone doesn't lose information + assert t.data is not None + with torch.no_grad(), no_dispatch(): + r.real_tensor = _safe_clone(t.data) assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -961,11 +1035,22 @@ def tensor_visitor_fn( sizes, strides, _storage_offset = sym_sizes_strides_storage_offset( t, source ) + # TODO: This doesn't seem right, where's the MKLDNN'ness + # lol r = callback( lambda: torch.empty_strided( sizes, strides, dtype=t.dtype, device="meta" ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) + assert t.data is not None + _safe_copy(r.real_tensor, t.data) assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: r.requires_grad = True @@ -1056,6 +1141,16 @@ def _to_fake_tensor(t: MetaTensorDesc): device="meta", ) ) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor = torch.empty_strided( # type: ignore[attr-defined] + t.size, + t.stride, + dtype=t.dtype, + device=t.device, + ) + assert t.data is not None + _safe_copy(r.real_tensor, t.data) # type: ignore[attr-defined] return r r = _to_fake_tensor(t) @@ -1190,6 +1285,13 @@ def is_c_of_r(complex_dtype, real_dtype): else: is_leaf = t.is_leaf + # Graph-Break for wrapped tensors + if ( + not (t.is_batchedtensor or t.is_gradtrackingtensor) + and t.is_functorch_wrapped + ) or t.is_legacy_batchedtensor: + return NotImplemented + ( sizes, strides, @@ -1211,6 +1313,14 @@ def is_c_of_r(complex_dtype, real_dtype): device="meta", ) ) + if self.copy_data: + with 
torch.no_grad(), no_dispatch(): + assert t.size is not None + assert t.stride is not None + r.real_tensor = torch.empty_strided( + t.size, t.stride, dtype=t.dtype, device=t.device + ) + _safe_copy(r.real_tensor, t.data) assert safe_is_leaf(r), "the callback you passed in doesn't detach" if t.requires_grad: @@ -1230,13 +1340,6 @@ def is_c_of_r(complex_dtype, real_dtype): 1, )(r) - # Graph-Break for wrapped tensors - if ( - not (t.is_batchedtensor or t.is_gradtrackingtensor) - and t.is_functorch_wrapped - ) or t.is_legacy_batchedtensor: - return NotImplemented - s = t.storage assert s is not None if s.id not in self.storage_memo and ( @@ -1248,6 +1351,10 @@ def is_c_of_r(complex_dtype, real_dtype): ): # You're normal and happy, install the fresh storage into the memo self.set_storage_memo(s, r.untyped_storage()) + if self.copy_data: + r.untyped_storage().real_storage = ( + r.real_tensor.untyped_storage() + ) else: # You're in crazy town; somehow you gave us a tensor # that wasn't a view, but had nonzero storage offset, @@ -1286,8 +1393,17 @@ def is_c_of_r(complex_dtype, real_dtype): mb_fake_mode = maybe_get_fake_mode(r) if mb_fake_mode is not None: maybe_fake_mgr = in_kernel_invocation_manager(mb_fake_mode) - with maybe_fake_mgr, torch.no_grad(), maybe_suppress(): - r.set_(r_s, storage_offset, sizes, strides) + with torch.no_grad(), maybe_suppress(): + with maybe_fake_mgr: + r.set_(r_s, storage_offset, sizes, strides) + if self.copy_data: + with torch.no_grad(), no_dispatch(): + r.real_tensor.set_( + r_s.real_storage, + t.storage_offset, + t.size, + t.stride, + ) if t.grad is not None: from torch._dynamo.source import AttrSource diff --git a/torch/_torch_docs.py b/torch/_torch_docs.py index ba8e899dc943..6d22f9dcf984 100644 --- a/torch/_torch_docs.py +++ b/torch/_torch_docs.py @@ -2195,13 +2195,13 @@ def merge_dicts(*dicts): add_docstr( torch.can_cast, r""" -can_cast(from, to) -> bool +can_cast(from_, to) -> bool Determines if a type conversion is allowed under PyTorch casting rules described in the type promotion :ref:`documentation <type-promotion-doc>`. Args: - from (dtype): The original :class:`torch.dtype`. + from\_ (dtype): The original :class:`torch.dtype`. to (dtype): The target :class:`torch.dtype`. Example:: diff --git a/torch/_utils.py b/torch/_utils.py index 2e48fe9a1a9d..1bb726252dee 100644 --- a/torch/_utils.py +++ b/torch/_utils.py @@ -52,71 +52,40 @@ def _type(self, dtype=None, non_blocking=False, **kwargs): return dtype(self.size()).copy_(self, non_blocking) -def _hpu(self, device=None, non_blocking=False, **kwargs): - """Returns a copy of this object in HPU memory. +def _to(self, device, non_blocking=False): + """Returns a copy of this object in device memory. - If this object is already in HPU memory and on the correct device, then - no copy is performed and the original object is returned. + If this object is already on the correct device, then no copy is performed + and the original object is returned. Args: - device (int): The destination HPU id. Defaults to the current device. + device (torch.device): The destination device. non_blocking (bool): If ``True`` and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. - **kwargs: For compatibility, may contain the key ``async`` in place of - the ``non_blocking`` argument. 
""" - non_blocking = _get_async_or_non_blocking("hpu", non_blocking, kwargs) - hpu = getattr(torch, "hpu", None) - assert hpu is not None, "HPU device module is not loaded" - if self.is_hpu: - if device is None: - device = hpu.current_device() - if self.get_device() == device: - return self - else: - if device is None: - device = -1 - with hpu.device(device): - assert not self.is_sparse, "sparse storage is not supported for HPU tensors" - untyped_storage = torch.UntypedStorage(self.size(), device=torch.device("hpu")) - untyped_storage.copy_(self, non_blocking) - return untyped_storage - - -def _cuda(self, device=None, non_blocking=False, **kwargs): - """Returns a copy of this object in CUDA memory. - - If this object is already in CUDA memory and on the correct device, then - no copy is performed and the original object is returned. + if self.device == device: + return self - Args: - device (int): The destination GPU id. Defaults to the current device. - non_blocking (bool): If ``True`` and the source is in pinned memory, - the copy will be asynchronous with respect to the host. Otherwise, - the argument has no effect. - **kwargs: For compatibility, may contain the key ``async`` in place of - the ``non_blocking`` argument. - """ - non_blocking = _get_async_or_non_blocking("cuda", non_blocking, kwargs) - if self.is_cuda: - if device is None: - device = torch.cuda.current_device() - if self.get_device() == device: - return self - else: - if device is None: - device = -1 - with torch.cuda.device(device): - if self.is_sparse: - new_type = getattr(torch.cuda.sparse, self.__class__.__name__) - indices = torch.Tensor._indices(self).cuda(device, non_blocking) - values = torch.Tensor._values(self).cuda(device, non_blocking) + device_module = getattr(torch, device.type, None) + assert ( + device_module is not None + ), f"{device.type.upper()} device module is not loaded" + with device_module.device(device): + if self.is_sparse and hasattr(device_module, "sparse"): + new_type = getattr(device_module.sparse, self.__class__.__name__) + indices = getattr(torch.Tensor._indices(self), device.type)( + device, non_blocking + ) + values = getattr(torch.Tensor._values(self), device.type)( + device, non_blocking + ) return new_type(indices, values, self.size()) else: - untyped_storage = torch.UntypedStorage( - self.size(), device=torch.device("cuda") - ) + assert ( + not self.is_sparse + ), f"sparse storage is not supported for {device.type.upper()} tensors" + untyped_storage = torch.UntypedStorage(self.size(), device=device) untyped_storage.copy_(self, non_blocking) return untyped_storage diff --git a/torch/_utils_internal.py b/torch/_utils_internal.py index 3da8bc2186f5..a61adb6a826b 100644 --- a/torch/_utils_internal.py +++ b/torch/_utils_internal.py @@ -63,10 +63,14 @@ def throw_abstract_impl_not_imported_error(opname, module, context): # Meta only, act as nop otherwise. +# +# NB! This treats "skip" kwarg specially!! 
def compile_time_strobelight_meta(phase_name): def compile_time_strobelight_meta_inner(function): @functools.wraps(function) def wrapper_function(*args, **kwargs): + if "skip" in kwargs: + kwargs["skip"] = kwargs["skip"] + 1 return function(*args, **kwargs) return wrapper_function diff --git a/torch/_weights_only_unpickler.py b/torch/_weights_only_unpickler.py index 44dd8223862a..6c9f3b61ae8b 100644 --- a/torch/_weights_only_unpickler.py +++ b/torch/_weights_only_unpickler.py @@ -9,6 +9,10 @@ # - `torch.nn.Parameter` # - `collections.Counter` # - `collections.OrderedDict` +# Additionally, users can use an allowlist for adding classes they have deemed as safe using +# `_add_safe_globals()` (`torch.serialization.add_safe_globals`) +# `_clear_safe_globals()` (`torch.serialization.clear_safe_globals`) +# `_get_safe_globals()` (`torch.serialization.get_safe_globals`) # Based of https://github.com/python/cpython/blob/main/Lib/pickle.py # Expected to be useful for loading PyTorch model weights @@ -19,6 +23,7 @@ import functools as _functools from collections import Counter, OrderedDict +from inspect import getattr_static from pickle import ( APPEND, APPENDS, @@ -59,11 +64,57 @@ UnpicklingError, ) from struct import unpack -from sys import maxsize -from typing import Any, Dict, List +from sys import maxsize, modules +from typing import Any, Dict, List, Type import torch +_marked_safe_globals_list: List[Any] = [] + + +def _add_safe_globals(safe_globals: List[Any]): + global _marked_safe_globals_list + _marked_safe_globals_list += safe_globals + + +def _get_safe_globals() -> List[Any]: + global _marked_safe_globals_list + return _marked_safe_globals_list + + +def _clear_safe_globals(): + global _marked_safe_globals_list + _marked_safe_globals_list = [] + + +# Separate from _get_allowed_globals because of the lru_cache on _get_allowed_globals +# For example if user had a script like +# torch.load(file_a) +# torch.serialization._add_safe_globals([torch.foo]) +# torch.load(file_b) +# the dynamic additions to safe_globals would not be picked up by +# _get_allowed_globals due to the lru_cache +def _get_user_allowed_globals(): + rc: Dict[str, Any] = {} + for f in _marked_safe_globals_list: + rc[f"{f.__module__}.{f.__name__}"] = f + return rc + + +def _tensor_rebuild_functions(): + return { + torch._utils._rebuild_parameter, + torch._utils._rebuild_parameter_with_state, + torch._utils._rebuild_qtensor, + torch._utils._rebuild_tensor, + torch._utils._rebuild_tensor_v2, + torch._utils._rebuild_tensor_v3, + torch._utils._rebuild_sparse_tensor, + torch._utils._rebuild_meta_tensor_no_storage, + torch._utils._rebuild_nested_tensor, + torch._utils._rebuild_wrapper_subclass, + } + # Unpickling machinery @_functools.lru_cache(maxsize=1) @@ -75,6 +126,7 @@ def _get_allowed_globals(): "torch.serialization._get_layout": torch.serialization._get_layout, "torch.Size": torch.Size, "torch.Tensor": torch.Tensor, + "torch.device": torch.device, } # dtype for t in torch.storage._dtype_to_storage_type_map().keys(): @@ -103,17 +155,7 @@ def _get_allowed_globals(): ]: rc[str(qt)] = qt # Rebuild functions - for f in [ - torch._utils._rebuild_parameter, - torch._utils._rebuild_parameter_with_state, - torch._utils._rebuild_qtensor, - torch._utils._rebuild_tensor, - torch._utils._rebuild_tensor_v2, - torch._utils._rebuild_tensor_v3, - torch._utils._rebuild_sparse_tensor, - torch._utils._rebuild_meta_tensor_no_storage, - torch._utils._rebuild_nested_tensor, - ]: + for f in _tensor_rebuild_functions(): 
rc[f"torch._utils.{f.__name__}"] = f # Handles Tensor Subclasses, Tensor's with attributes. @@ -128,6 +170,11 @@ def __init__(self, file, *, encoding: str = "bytes"): self.readline = file.readline self.read = file.read self.memo: Dict[int, Any] = {} + # tensor subclass types found from GLOBAL instructions that have passed the criteria + # to be allowed as the second argument to `torch._tensor._rebuild_from_type_v2` + # This enables rebuilding of tensor subclasses defined outside the `torch` package. + # See [Note: Criteria for allowing out-of-core tensor subclasses] for details on the criteria. + self.tensor_subclasses_found: Dict[str, Type] = {} def load(self): """Read a pickled object representation from the open file. @@ -151,8 +198,124 @@ def load(self): full_path = f"{module}.{name}" if full_path in _get_allowed_globals(): self.append(_get_allowed_globals()[full_path]) + elif full_path in _get_user_allowed_globals(): + self.append(_get_user_allowed_globals()[full_path]) else: - raise RuntimeError(f"Unsupported class {full_path}") + # The logic in this branch handles user-defined tensor subclasses. + # We can automatically allow and raise and error for anything that is not provably safe. + # [Note: Criteria for allowing out-of-core tensor subclasses] + # GLOBAL '.' instructions will get the class and + # push the string (not the actual type) while adding the type to the dictionary keyed + # by the string onto the unpickler's stack if they satisfy the following conditions: + # (1) The that defines them is in `sys.modules` + # (we will use getattr_static to access it to ensure no code execution) + # (2) They inherit from `torch.Tensor` + # (2) The class is not overriding any of the `torch.Tensor` methods listed here: + # `__getattr__`, `__get__`, `__getattribute__`, `__setstate__`, `__set__`, + # and `tp_alloc` + # The methods that we ban overriding were selected in a test-driven manner + # by overriding every callable method on a tensor subclass and determinining + # which might get called during unpickling. + # When executing REDUCE, the string will be appropriately converted back to the type only + # for `torch._tensor._rebuild_from_type_v2` as other use of the class could use methods + # we didn't audit. + if module == "__builtin__": + raise RuntimeError( + f"Unsupported global: GLOBAL {full_path} was not an allowed global by default. " + "Please use `torch.serialization.add_safe_globals` to allowlist this global " + "if you trust this class/function." + ) + elif module not in modules: + # TODO: add a link here to a doc that explains to users what we mean by trust + raise RuntimeError( + f"Found GLOBAL `{full_path}` instruction in the pickle file but `{full_path}` was " + f"not in the pre-defined list of allowed globals that are considered safe by the " + "weights_only unpickler for rebuilding state_dicts. This is the expected behavior if " + f"`{full_path}` is a class or function that is not in the list of allowed globals " + f"If `{full_path}` is NOT a tensor subclass, you might consider" + "`torch.serialization.add_safe_globals` if it is appropriate. However, if it is a " + "user-defined tensor subclass not defined in the `torch` package, this error might arise " + f"as we expect `{module}` to be present in `sys.modules` (i.e. it " + "must be imported in the current environment), but this was not the case. " + f"If you intend to unpickle a tensor subclass `{full_path}` please import `{name}` from " + f"`{module}`. 
Note that having this imported will *only* allow the type `{full_path}` to " + "be passed as the second argument to `torch._tensor._rebuild_from_type_v2`, which should " + "enable the tensor subclass to be unpickled without any arbitrary code execution as long " + # If the user imports the subclass and these methods are overridden, the next error will prompt them to use + # torch.serialization.add_safe_globals. + "as a pre-defined list of methods called when unpickling is not overridden. In " + "particular, the methods are `__getattr__`, `__get__`, `__getattribute__`, `__setstate__`, " + "`__set__`, as well as the implementation of `tp_alloc`." + ) + else: + try: + class_type = getattr_static(modules[module], name) + except AttributeError as e: + raise AttributeError( + "For safety during weights_only loading, we use inspect.getattr_static to " + f"get {name} from {module}; if {module} implements the descriptor protocol, " + "__getattr__, or __getattribute__, these will not be called." + ) from e + # None of the objects here contain any data from the pickle so this is safe + if isinstance(class_type, type) and issubclass( + class_type, torch.Tensor + ): + # __getattribute__ is exercised by the getattr call in `_rebuild_from_type_v2` + custom_get_attribute = ( + class_type.__getattribute__ + is not torch.Tensor.__getattribute__ + ) + custom_get = ( + getattr_static(class_type, "__get__", None) is not None + ) + custom_get_attr = ( + getattr_static(class_type, "__getattr__", None) + is not None + ) + # Tensor.__setstate__ might be called in `_rebuild_from_type_v2` + custom_set_state = ( + class_type.__setstate__ is not torch.Tensor.__setstate__ + ) + # setattr is called in `torch._utils._set_obj_state` + custom_set_attr = ( + class_type.__setattr__ is not object.__setattr__ + ) + custom_set = ( + getattr_static(class_type, "__set__", None) is not None + ) + # tp_alloc is called by `Tensor._rebuild_wrapper_subclass` and `Tensor.as_subclass` + has_custom_tp_alloc = ( + not torch._C._check_tp_alloc_is_default(class_type) + ) + custom_methods = { + "__getattribute__": custom_get_attribute, + "__getattr__": custom_get_attr, + "__get__": custom_get, + "__setattr__": custom_set_attr, + "__set__": custom_set, + "__setstate__": custom_set_state, + "tp_alloc": has_custom_tp_alloc, + } + if any(custom_methods.values()): + error = "" + for k, v in custom_methods.items(): + error += f" {k}={v}" + raise RuntimeError( + f"Trying to unpickle tensor subclass `{full_path}` that has defined a custom " + f"version for one of these methods:{error}. Please check whether you trust these " + "methods and allowlist the subclass with `torch.serialization.add_safe_globals` if so." + ) + # push the string full_path onto the stack (in REDUCE, there is special logic to + # access this from tensor_subclasses_found for rebuild_from_type_v2) + self.tensor_subclasses_found[full_path] = class_type + self.append(full_path) + else: + raise RuntimeError( + f"Unsupported global: GLOBAL {full_path} was not an allowed global by default. " + "Please use `torch.serialization.add_safe_globals` to allowlist this global " + "if you trust this class/function." 
+ ) + elif key[0] == NEWOBJ[0]: args = self.stack.pop() cls = self.stack.pop() @@ -162,10 +325,33 @@ def load(self): elif key[0] == REDUCE[0]: args = self.stack.pop() func = self.stack[-1] - if func not in _get_allowed_globals().values(): + if ( + func not in _get_allowed_globals().values() + and func not in _get_user_allowed_globals().values() + ): raise RuntimeError( f"Trying to call reduce for unrecognized function {func}" ) + # Special handling for a tensor subclass type found in GLOBAL that was pushed + # onto the stack as a str, to prevent it from being used anywhere except as the + # second arg of _rebuild_from_type_v2 and within the argument tuple for _rebuild_wrapper_subclass. + # _rebuild_from_type_v2 is called with args (func, type, func_args, state), + # where both type and, when func is rebuild_wrapper_subclass, func_args[0] could be the subclass type. + # Since we pushed these subclass types onto the stack as strings, convert them to the actual + # type here. + if func is torch._tensor._rebuild_from_type_v2 and type(args[1]) is str: + args_after = args[2:] + if ( + args[0] is torch._utils._rebuild_wrapper_subclass + and type(args[2][0]) is str + ): + new_arg_tuple = ( + self.tensor_subclasses_found[args[2][0]], + ) + args[2][1:] + args_after = (new_arg_tuple,) + args[3:] + args = ( + args[:1] + (self.tensor_subclasses_found[args[1]],) + args_after + ) self.stack[-1] = func(*args) elif key[0] == BUILD[0]: state = self.stack.pop() diff --git a/torch/ao/quantization/experimental/adaround_fake_quantize.py b/torch/ao/quantization/experimental/adaround_fake_quantize.py new file mode 100644 index 000000000000..4d988bbb25bb --- /dev/null +++ b/torch/ao/quantization/experimental/adaround_fake_quantize.py @@ -0,0 +1,148 @@ +from typing import Tuple + +import torch +from torch.ao.quantization.fake_quantize import _is_symmetric_quant +from torch.ao.quantization.utils import is_per_tensor +from torch.quantization import FakeQuantize +from torch.quantization.observer import MinMaxObserver + + +class AdaroundFakeQuantizer(FakeQuantize): + """ + A FakeQuantizer that implements adaptive rounding (AdaRound), a technique for + adaptively rounding weights derived from the paper https://arxiv.org/pdf/2004.10568.pdf. + For HTP compatibility, we target symmetric quantization. + """ + + scale: torch.Tensor + zero_point: torch.Tensor + V: torch.nn.Parameter + + # pyre-fixme[3]: Return type must be annotated. + def __init__( + self, + observer=MinMaxObserver, + qscheme=torch.per_tensor_symmetric, # not used, but needed for fakequant + quant_min: int = -128, + quant_max: int = 127, + ch_axis: int = 0, + # pyre-fixme[2]: Parameter must be annotated. + **observer_kwargs, + ): + super().__init__( + observer=observer, + qscheme=qscheme, + quant_min=quant_min, + quant_max=quant_max, + is_dynamic=False, + **observer_kwargs, + ) + # Populate quant_min/quant_max to observer_kwargs if valid + if quant_min is not None and quant_max is not None: + assert ( + quant_min <= quant_max + ), "quant_min must be less than or equal to quant_max" + # pyre-fixme[4]: Attribute must be annotated. 
+ self.qscheme = qscheme + self.is_per_tensor: bool = is_per_tensor(qscheme) + self.is_symmetric: bool = _is_symmetric_quant(qscheme) + assert self.is_symmetric, "Only symmetric quantization is supported" + self.ch_axis: int = ch_axis + + self.scale = torch.tensor([], requires_grad=False) + self.zero_point = torch.tensor([], requires_grad=False) + self.V = torch.nn.Parameter(torch.tensor([]), requires_grad=True) + # Fixed Stretch parameters + self.zeta: torch.Tensor = torch.tensor(1.1, requires_grad=False) + self.gamma: torch.Tensor = torch.tensor(-0.1, requires_grad=False) + self.sigmoid = torch.nn.Sigmoid() + self.use_soft_rounding = True + + @torch.jit.export + def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]: + return self.scale, self.zero_point + + @torch.jit.export + def extra_repr(self) -> str: + return ( + f"fake_quant_enabled={self.fake_quant_enabled}, observer_enabled={self.observer_enabled}, " + f"quant_min={self.activation_post_process.quant_min}, quant_max={self.activation_post_process.quant_max}, " + f"dtype={self.dtype}, qscheme={self.qscheme}, ch_axis={self.ch_axis}, " + f"scale={self.scale}, zero_point={self.zero_point}, (self.V >= 0).int().sum()={(self.V >= 0).int().sum()}" + ) + + def enable_weight_fake_quant(self) -> None: + self.fake_quant_enabled[0] = 1 + + def get_rectified_sigmoid_func(self) -> torch.Tensor: + if self.use_soft_rounding: + return torch.clamp( + self.sigmoid(self.V) * (self.zeta - self.gamma) + self.gamma, + min=0, + max=1, + ) + else: + # This will dump a binary solution + return (self.V >= 0).int() + + @torch.jit.ignore + def update_scale( + self, X: torch.Tensor, _scale: torch.Tensor, _zero_point: torch.Tensor + ) -> None: + if self.scale.numel() == 0: + self.scale.data = _scale.to(X.device) + self.zero_point = _zero_point.to(X.device) + else: + self.scale.data = _scale + if not self.is_symmetric: + self.zero_point = _zero_point + else: + self.zero_point = torch.zeros_like(_zero_point) + for i in range(X.dim()): + if i == self.ch_axis: + continue + self.zero_point = self.zero_point.unsqueeze(i) + X_q = X / self.scale + X_q_floor = torch.floor(X_q) + residual = X_q - X_q_floor # [0,1) + assert torch.all( + torch.ge(residual, 0) + ), "residual should be non-negative [0, 1)" + V_init = -torch.log((self.zeta - self.gamma) / (residual - self.gamma) - 1) + self.V.data = V_init + + def forward(self, X: torch.Tensor) -> torch.Tensor: + if self.observer_enabled[0] == 1: + X_detached = X.detach() + self.activation_post_process(X_detached) + _scale, _zero_point = self.activation_post_process.calculate_qparams() + _scale, _zero_point = _scale.to(self.scale.device), _zero_point.to( + self.zero_point.device + ) + dims = list(range(X.dim())) + if not self.is_per_tensor: + dims.remove(self.ch_axis) + if not self.is_per_tensor: + for i in range(X.dim()): + if i == self.ch_axis: + continue + _scale = _scale.unsqueeze(i) + _zero_point = _zero_point.unsqueeze(i) + self.update_scale(X_detached, _scale, _zero_point) + + if self.fake_quant_enabled[0] == 1: + # Perform soft quantization + # See the equation (23) in Adaround paper + h_v = self.get_rectified_sigmoid_func() + X_q = X / self.scale + # Straight-Through Estimator for floor function + X_q_floor = torch.floor(X_q) + self.zero_point + # Regardless of rounding, gradient should be able to flow back to self.V from X_q_dq. + # With adaround, we don't train weight, but train V only. 
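+ # i.e. X_q_dq = (clamp(floor(X / scale) + zero_point + h(V), quant_min, quant_max)
+ #               - zero_point) * scale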
+ X_q_dq = ( + torch.clamp(X_q_floor + h_v, min=self.quant_min, max=self.quant_max) + - self.zero_point + ) * self.scale + return X_q_dq + else: + return X diff --git a/torch/ao/quantization/experimental/adaround_loss.py b/torch/ao/quantization/experimental/adaround_loss.py new file mode 100644 index 000000000000..8080d72cc6da --- /dev/null +++ b/torch/ao/quantization/experimental/adaround_loss.py @@ -0,0 +1,96 @@ +from typing import Tuple + +import numpy as np +import torch +from torch.nn import functional as F + +ADAROUND_ZETA: float = 1.1 +ADAROUND_GAMMA: float = -0.1 + + +class AdaptiveRoundingLoss(torch.nn.Module): + """ + Adaptive Rounding Loss functions described in https://arxiv.org/pdf/2004.10568.pdf. + Rounding regularization is eq. [24]; reconstruction loss is eq. [25] without the + regularization term. + """ + + def __init__( + self, + max_iter: int, + warm_start: float = 0.2, + beta_range: Tuple[int, int] = (20, 2), + reg_param: float = 0.001, + ) -> None: + super().__init__() + self.max_iter = max_iter + self.warm_start = warm_start + self.beta_range = beta_range + self.reg_param = reg_param + + def rounding_regularization( + self, + V: torch.Tensor, + curr_iter: int, + ) -> torch.Tensor: + """ + Major logic copied from the official AdaRound implementation. + Apply rounding regularization to the input tensor V. + """ + assert ( + curr_iter < self.max_iter + ), "Current iteration must be strictly less than max_iter" + if curr_iter < self.warm_start * self.max_iter: + return torch.tensor(0.0) + else: + start_beta, end_beta = self.beta_range + warm_start_end_iter = self.warm_start * self.max_iter + + # compute relative iteration of current iteration + rel_iter = (curr_iter - warm_start_end_iter) / ( + self.max_iter - warm_start_end_iter + ) + beta = end_beta + 0.5 * (start_beta - end_beta) * ( + 1 + np.cos(rel_iter * np.pi) + ) + + # A rectified sigmoid for soft-quantization, as formulated in eq. [23] of https://arxiv.org/pdf/2004.10568.pdf + h_alpha = torch.clamp( + torch.sigmoid(V) * (ADAROUND_ZETA - ADAROUND_GAMMA) + ADAROUND_GAMMA, + min=0, + max=1, + ) + + # Apply rounding regularization + # This regularization term helps the rounding term converge to a binary solution (either 0 or 1) at the end of optimization. + inner_term = torch.add(2 * h_alpha, -1).abs().pow(beta) + regularization_term = torch.add(1, -inner_term).sum() + return regularization_term * self.reg_param + + def reconstruction_loss( + self, + soft_quantized_output: torch.Tensor, + original_output: torch.Tensor, + ) -> torch.Tensor: + """ + Compute the reconstruction loss between the soft quantized output and the original output. 
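+ This is a plain elementwise MSE, averaged over all elements.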
+ """ + return F.mse_loss( + soft_quantized_output, original_output, reduction="none" + ).mean() + + def forward( + self, + soft_quantized_output: torch.Tensor, + original_output: torch.Tensor, + V: torch.Tensor, + curr_iter: int, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Compute the asymmetric reconstruction formulation as eq [25] + """ + regularization_term = self.rounding_regularization(V, curr_iter) + reconstruction_term = self.reconstruction_loss( + soft_quantized_output, original_output + ) + return regularization_term, reconstruction_term diff --git a/torch/ao/quantization/experimental/adaround_optimization.py b/torch/ao/quantization/experimental/adaround_optimization.py new file mode 100644 index 000000000000..7304f885a6f3 --- /dev/null +++ b/torch/ao/quantization/experimental/adaround_optimization.py @@ -0,0 +1,238 @@ +import copy +import logging +from typing import Any, Callable, List, Optional, Tuple, Type, Union + +import torch +from torch.ao.quantization.experimental.adaround_fake_quantize import ( + AdaroundFakeQuantizer, +) +from torch.ao.quantization.experimental.adaround_loss import AdaptiveRoundingLoss +from torch.ao.quantization.observer import MinMaxObserver +from torch.nn import functional as F +from torch.nn.parallel import DataParallel +from torch.utils.data import DataLoader, TensorDataset + +logger: logging.Logger = logging.getLogger(__name__) + + +class AdaptiveRoundingOptimizer: + def __init__( + self, + model: Union[torch.nn.Module, torch.nn.DataParallel], + callback: Callable[[torch.nn.Module, List[Any]], None], + forward_hook_wrapper: Callable[[List[torch.Tensor]], Callable], + data: List[Any], + observer: Type[torch.ao.quantization.observer.ObserverBase] = MinMaxObserver, + max_iter=10000, + dtype: torch.dtype = torch.qint8, + quant_min=-128, + quant_max=127, + qscheme: torch.qscheme = torch.per_tensor_symmetric, + batch_size: int = 256, + ): + self.model = model + self.q_model = copy.deepcopy(self.model) + self.device = torch.device("cuda") if torch.cuda.is_available() else None + self.callback = callback + self.forward_hook_wrapper = forward_hook_wrapper + # TODO rather than having a data as list type or, we better pass *iterator* instead of list + self.data = data + self.batch_size = min(batch_size, len(data)) + self.max_iter = max_iter + self.adaptive_round_loss_fn = AdaptiveRoundingLoss( + max_iter=self.max_iter, warm_start=0.2 + ) + self.dtype = dtype + self.observer = observer + self.quant_min = quant_min + self.quant_max = quant_max + self.qscheme = qscheme + + def run_adaround(self) -> torch.nn.Module: + layer_list: List[Tuple[str, torch.nn.Module, torch.nn.Module]] = [] + for (name, module), q_module in zip( + self.model.named_modules(), self.q_model.modules() + ): + if isinstance(module, (torch.nn.Conv1d, torch.nn.Linear)): + # Knowing activation ahead-of-time would be helpful for asymmetric formulation + # But this is challenging in eager mode, but graph module. 
+ layer_list.append((name, module, q_module)) + logger.info(f"Total number of layers: {len(layer_list)}") # noqa: G004 + + for name, module, q_module in layer_list: + logger.info( + f"Kick-starting adaptive rounding on {name} module {module}" # noqa: G004 + ) + self.optimize_adaptive_rounding( + module, + q_module, + None, + ) + + return ( + self.q_model.module + if isinstance(self.q_model, DataParallel) + else self.q_model + ) + + def get_data_inp_out( + self, module: torch.nn.Module, q_module: torch.nn.Module, data: List[Any] + ) -> Tuple[List[torch.Tensor], List[torch.Tensor], List[torch.Tensor]]: + fp_out: List[torch.Tensor] = [] + q_input: List[torch.Tensor] = [] + fp_input: List[torch.Tensor] = [] + fp32_fetcher: List[torch.Tensor] = [] + quant_fetcher: List[torch.Tensor] = [] + handler1 = module.register_forward_hook(self.forward_hook_wrapper(fp32_fetcher)) + handler2 = q_module.register_forward_hook( + self.forward_hook_wrapper(quant_fetcher) + ) + for data_ in data: + with torch.no_grad(): + self.callback(self.model, data_) + self.callback(self.q_model, data_) + fp32_output = fp32_fetcher[1] + quant_input = quant_fetcher[0] + fp_out.append(fp32_output) + q_input.append(quant_input) + fp_input.append(fp32_fetcher[0]) + handler1.remove() + handler2.remove() + return q_input, fp_out, fp_input + + @torch.no_grad() + def feed_forward(self, x, weight, module): + if isinstance(module, torch.nn.Conv1d): + out = torch.nn.functional.conv1d( + x, + weight, + stride=module.stride, + padding=module.padding, + dilation=module.dilation, + groups=module.groups, + ) + elif isinstance(module, torch.nn.Linear): + out = torch.nn.functional.linear( + x, + weight, + bias=module.bias, + ) + else: + raise NotImplementedError + return out + + def _compute_and_display_local_losses( + self, + ada_quantizer: AdaroundFakeQuantizer, + q_module: torch.nn.Module, + q_inp: torch.Tensor, + fp_out: torch.Tensor, + ): + with torch.no_grad(): + ada_quantizer.use_soft_rounding = False + q_w_hard_round = ada_quantizer(q_module.weight) + out_hard_quant = self.feed_forward(q_inp, q_w_hard_round, q_module) + ada_quantizer.use_soft_rounding = True + q_w_soft_round = ada_quantizer(q_module.weight) + out_soft_quant = self.feed_forward(q_inp, q_w_soft_round, q_module) + soft_quant_loss = F.mse_loss(out_soft_quant, fp_out) + hard_quant_loss = F.mse_loss(out_hard_quant, fp_out) + logger.info( + f"soft quant loss: {soft_quant_loss.item()} hard quant loss: {hard_quant_loss.item()}" # noqa: G004 + ) + + def optimize_adaptive_rounding( + self, + module: torch.nn.Module, + q_module: torch.nn.Module, + activation: Optional[Callable[[torch.Tensor], torch.Tensor]] = None, + ) -> None: + ada_quantizer = AdaroundFakeQuantizer( + dtype=self.dtype, + observer=self.observer, + qscheme=self.qscheme, + quant_min=self.quant_min, + quant_max=self.quant_max, + reduce_range=False, + ) + ada_quantizer.enable_observer() + ada_quantizer(q_module.weight) + ada_quantizer.disable_observer() + ada_quantizer.enable_fake_quant() + optimizer = torch.optim.Adam([ada_quantizer.V]) + inp, out, fp_in = self.get_data_inp_out(module, q_module, self.data) + + logger.info("==================== Before adaround ====================") + test_in, test_out, fp_test_in = self.get_data_inp_out( + module, q_module, self.data[0] + ) + + assert ( + torch.abs(test_out[0] - module(fp_test_in[0])).sum().item() == 0 + ), "In-place activation detected; please do not use in-place activations" + # Stack the tensors in each list into a single tensor + # Assuming inp and out are 
your lists of tensors + inp_tensor = torch.vstack(inp) + out_tensor = torch.vstack(out) + dataset = TensorDataset(inp_tensor, out_tensor) + dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True) + + self._compute_and_display_local_losses( + ada_quantizer, q_module, test_in[0], test_out[0] + ) + global_idx = 0 + one_iter = len(out) // self.batch_size + for iteration in range(self.max_iter // one_iter): + reconstruction_loss = regularization_loss = torch.tensor(0) + for q_inp, fp_out in dataloader: + optimizer.zero_grad() + q_weight = ada_quantizer(q_module.weight) + if isinstance(module, torch.nn.Conv1d): + q_out = torch.nn.functional.conv1d( + q_inp, + q_weight, + stride=q_module.stride, + padding=q_module.padding, + dilation=q_module.dilation, + groups=q_module.groups, + ) + elif isinstance(q_module, torch.nn.Linear): + q_out = torch.nn.functional.linear( + q_inp, + q_weight, + bias=q_module.bias, + ) + else: + raise NotImplementedError + regularization_loss, reconstruction_loss = self.adaptive_round_loss_fn( + q_out, + fp_out, + ada_quantizer.V, + curr_iter=global_idx, + ) + loss = regularization_loss + reconstruction_loss + loss.backward() + optimizer.step() + global_idx += 1 + if global_idx >= self.max_iter: + break + if global_idx >= self.max_iter: + break + if iteration % 30 == 0: + logger.info( + f"glob iter {global_idx} regularization_loss {regularization_loss.item()} " # noqa: G004 + f"reconstruction_loss {reconstruction_loss.item()}" # noqa: G004 + ) + logger.info("==================== After adaround ====================") + self._compute_and_display_local_losses( + ada_quantizer, q_module, test_in[0], test_out[0] + ) + + ada_quantizer.use_soft_rounding = True + ada_quantizer.V.requires_grad = False + ada_quantizer = ada_quantizer.eval() + q_weight = ada_quantizer(q_module.weight) + # At the end of optimization, we need to copy the adarounded weight back to the original module + q_module.weight.data.copy_(q_weight) + # Eager mode requires the observer to be attached as "weight_fake_quant" in order to be parsed + q_module.weight_fake_quant = ada_quantizer.activation_post_process diff --git a/torch/ao/quantization/fx/_lower_to_native_backend.py b/torch/ao/quantization/fx/_lower_to_native_backend.py index 728506037b55..049f4e3135d9 100644 --- a/torch/ao/quantization/fx/_lower_to_native_backend.py +++ b/torch/ao/quantization/fx/_lower_to_native_backend.py @@ -926,8 +926,14 @@ def _lower_dynamic_weighted_ref_functional( # Linear prepack args: (quantized weights[, bias]) # Conv prepack args: (quantized weights[, bias, stride, padding, dilation, groups]) prepack_args = [quantized_weight] + remaining_func_args + prepack_kwargs = {} if func_node.target == F.linear: prepack_op = get_linear_prepack_op_for_dtype(weight_dtype) + kwargs = func_node.kwargs.copy() + if 'bias' in kwargs: + prepack_kwargs['B'] = kwargs['bias'] + del kwargs['bias'] + func_node.kwargs = kwargs elif func_node.target in CONV_FUNCTIONAL_OPS: prepack_op = get_qconv_prepack_op(func_node.target) # For conv1d, the stride, padding, and dilation args may be ints, @@ -939,7 +945,7 @@ def _lower_dynamic_weighted_ref_functional( else: raise ValueError(f"Lowering is not supported for op '{func_node.target}'") with model.graph.inserting_before(func_node): - packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), {}) + packed_weight = model.graph.create_node("call_function", prepack_op, tuple(prepack_args), prepack_kwargs) # Step 3: Replace reference pattern with the corresponding quantized op 
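# (e.g. a reference F.linear call is retargeted to the corresponding dynamic
# quantized op, consuming the packed weight created above)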
func_node.target = q_relu_func if relu_node is not None else q_func diff --git a/torch/ao/quantization/pt2e/export_utils.py b/torch/ao/quantization/pt2e/export_utils.py index 2e7b9e380dfd..139042c326b8 100644 --- a/torch/ao/quantization/pt2e/export_utils.py +++ b/torch/ao/quantization/pt2e/export_utils.py @@ -8,7 +8,6 @@ __all__ = [ "model_is_exported", - "_WrapperModule", ] diff --git a/torch/ao/quantization/pt2e/port_metadata_pass.py b/torch/ao/quantization/pt2e/port_metadata_pass.py index c47e82073578..5ea1f939a3b6 100644 --- a/torch/ao/quantization/pt2e/port_metadata_pass.py +++ b/torch/ao/quantization/pt2e/port_metadata_pass.py @@ -136,7 +136,7 @@ def _port_metadata_for_output_quant_nodes( node_users = _filter_sym_size_users(node) if len(node_users) != 1: - raise InternalError(f"Expecting {node} to have single user") + logger.warning(f"Expecting {node} to have single user") # noqa: G004 q_node = node_users.pop() if q_node.op != "call_function" or q_node.target not in _QUANTIZE_OPS: logger.warning( diff --git a/torch/ao/quantization/pt2e/utils.py b/torch/ao/quantization/pt2e/utils.py index 051f02de2da8..25f82f04e4e3 100644 --- a/torch/ao/quantization/pt2e/utils.py +++ b/torch/ao/quantization/pt2e/utils.py @@ -21,7 +21,6 @@ __all__ = [ "fold_bn_weights_into_conv_node", - "_get_aten_graph_module_for_pattern", "remove_tensor_overload_for_qdq_ops", ] diff --git a/torch/autograd/__init__.py b/torch/autograd/__init__.py index e12aef1158a8..9b5788aff227 100644 --- a/torch/autograd/__init__.py +++ b/torch/autograd/__init__.py @@ -31,7 +31,24 @@ from .variable import Variable -__all__ = ["Variable", "Function", "backward", "grad_mode"] +__all__ = [ + "Variable", + "Function", + "backward", + "grad_mode", + "NestedIOFunction", + "detect_anomaly", + "enable_grad", + "grad", + "gradcheck", + "gradgradcheck", + "inference_mode", + "no_grad", + "set_detect_anomaly", + "set_grad_enabled", + "set_multithreading_enabled", + "variable", +] _OptionalTensor = Optional[torch.Tensor] _ShapeorNestedShape = Union[_size, Sequence[_size], torch.Tensor] diff --git a/torch/backends/cuda/__init__.py b/torch/backends/cuda/__init__.py index 6d079bdf06bf..f1b68a446225 100644 --- a/torch/backends/cuda/__init__.py +++ b/torch/backends/cuda/__init__.py @@ -15,7 +15,6 @@ "preferred_blas_library", "cufft_plan_cache", "matmul", - "SDPBackend", "SDPAParams", "enable_cudnn_sdp", "cudnn_sdp_enabled", @@ -399,7 +398,7 @@ def sdp_kernel( ), FutureWarning, ) - from torch.nn.attention import sdpa_kernel, SDPBackend + from torch.nn.attention import sdpa_kernel backend_list = [] if enable_flash: diff --git a/torch/backends/xeon/run_cpu.py b/torch/backends/xeon/run_cpu.py index b67962c7025b..0344631ee6b4 100644 --- a/torch/backends/xeon/run_cpu.py +++ b/torch/backends/xeon/run_cpu.py @@ -134,7 +134,7 @@ from typing import Dict, List from torch.distributed.elastic.multiprocessing import ( - DefaultLogsSpecs, + DefaultLogsSpecs as _DefaultLogsSpecs, start_processes, Std, ) @@ -682,7 +682,7 @@ def launch(self, args): entrypoint=entrypoint, args=launch_args, envs=launch_envs, - logs_specs=DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee), + logs_specs=_DefaultLogsSpecs(log_dir=args.log_path, tee=launch_tee), ) ctx.wait() diff --git a/torch/cpu/__init__.py b/torch/cpu/__init__.py index 14794627d752..2f2561b69c1c 100644 --- a/torch/cpu/__init__.py +++ b/torch/cpu/__init__.py @@ -11,6 +11,7 @@ from .. import device as _device from . 
import amp + __all__ = [ "is_available", "synchronize", @@ -49,7 +50,6 @@ def synchronize(device: _device_t = None) -> None: N.B. This function only exists to facilitate device-agnostic code. """ - pass class Stream: @@ -57,7 +57,7 @@ class Stream: N.B. This class only exists to facilitate device-agnostic code """ - def __init__(self, priority: int = -1): + def __init__(self, priority: int = -1) -> None: pass def wait_stream(self, stream) -> None: @@ -68,13 +68,13 @@ class Event: def query(self) -> bool: return True - def record(self, stream=None): + def record(self, stream=None) -> None: pass - def synchronize(self): + def synchronize(self) -> None: pass - def wait(self, stream=None): + def wait(self, stream=None) -> None: pass @@ -100,6 +100,7 @@ class StreamContext(AbstractContextManager): N.B. This class only exists to facilitate device-agnostic code """ + cur_stream: Optional[Stream] def __init__(self, stream): @@ -115,7 +116,7 @@ def __enter__(self): self.prev_stream = _current_stream _current_stream = cur_stream - def __exit__(self, type: Any, value: Any, traceback: Any): + def __exit__(self, type: Any, value: Any, traceback: Any) -> None: cur_stream = self.stream if cur_stream is None: return @@ -146,7 +147,6 @@ def set_device(device: _device_t) -> None: N.B. This function only exists to facilitate device-agnostic code """ - pass def current_device() -> str: diff --git a/torch/cpu/amp/autocast_mode.py b/torch/cpu/amp/autocast_mode.py index a29a96891722..3f0a574f7d38 100644 --- a/torch/cpu/amp/autocast_mode.py +++ b/torch/cpu/amp/autocast_mode.py @@ -1,3 +1,4 @@ +import warnings from typing import Any import torch @@ -8,7 +9,7 @@ class autocast(torch.amp.autocast_mode.autocast): r""" See :class:`torch.autocast`. - ``torch.cpu.amp.autocast(args...)`` is equivalent to ``torch.autocast("cpu", args...)`` + ``torch.cpu.amp.autocast(args...)`` is deprecated. Please use ``torch.amp.autocast("cpu", args...)`` instead. """ def __init__( @@ -22,6 +23,10 @@ def __init__( self.device = "cpu" self.fast_dtype = dtype return + warnings.warn( + "torch.cpu.amp.autocast(args...) is deprecated. Please use torch.amp.autocast('cpu', args...) 
instead.", + DeprecationWarning, + ) super().__init__( "cpu", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled ) diff --git a/torch/csrc/Exceptions.h b/torch/csrc/Exceptions.h index 6b8d923f4090..4f8d614e16dc 100644 --- a/torch/csrc/Exceptions.h +++ b/torch/csrc/Exceptions.h @@ -15,7 +15,7 @@ #include #include -#if defined(USE_DISTRIBUTED) && defined(USE_C10D) +#if defined(USE_DISTRIBUTED) #include #endif diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index 3be764220e0d..9ff9131435f4 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -422,6 +422,19 @@ PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { END_HANDLE_TH_ERRORS } +PyObject* THPModule_check_tp_alloc_is_default( + PyObject* _unused, + PyObject* cls) { + HANDLE_TH_ERRORS + TORCH_CHECK_TYPE( + PyType_Check(cls), + "cls must be a type (got ", + Py_TYPE(cls)->tp_name, + ")"); + return PyBool_FromLong(Py_TYPE(cls)->tp_alloc == PyType_GenericAlloc); + END_HANDLE_TH_ERRORS +} + PyObject* THPModule_addDocStr(PyObject* _unused, PyObject* args) { // adds a __doc__ string to a function, similar to numpy's arr_add_docstring static std::vector all_docs; @@ -1268,6 +1281,10 @@ static PyMethodDef TorchMethods[] = { // NOLINT {"_autograd_init", THPAutograd_initExtension, METH_NOARGS, nullptr}, {"_add_docstr", THPModule_addDocStr, METH_VARARGS, nullptr}, {"_swap_tensor_impl", THPModule_swap_tensor_impl, METH_VARARGS, nullptr}, + {"_check_tp_alloc_is_default", + THPModule_check_tp_alloc_is_default, + METH_O, + nullptr}, {"_init_names", THPModule_initNames, METH_O, nullptr}, {"_has_distributed", THPModule_hasDistributed, METH_NOARGS, nullptr}, {"_set_default_tensor_type", diff --git a/torch/csrc/StorageMethods.cpp b/torch/csrc/StorageMethods.cpp index 2e7597665267..540268d15224 100644 --- a/torch/csrc/StorageMethods.cpp +++ b/torch/csrc/StorageMethods.cpp @@ -94,7 +94,13 @@ static PyObject* THPStorage_copy_( TORCH_CHECK( !invalid, "Attempted to call copy_() on an invalid python storage.") - TORCH_CHECK(self_.nbytes() == src.nbytes(), "size does not match"); + TORCH_CHECK( + self_.nbytes() == src.nbytes(), + "size does not match, self was ", + self_.nbytes(), + " bytes but src was ", + src.nbytes(), + " bytes"); at::storage_copy(self_, src, non_blocking); diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp index 5b57e5891a1c..64b85dd72f59 100644 --- a/torch/csrc/autograd/profiler_kineto.cpp +++ b/torch/csrc/autograd/profiler_kineto.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -638,6 +639,9 @@ void enableProfiler( } else if (config.state == ProfilerState::ITT) { torch::profiler::impl::pushITTCallbacks(config, scopes); return; + } else if (config.state == ProfilerState::PRIVATEUSE1) { + torch::profiler::impl::pushPRIVATEUSE1CallbacksStub(config, scopes); + return; } TORCH_CHECK( @@ -673,7 +677,8 @@ std::unique_ptr disableProfiler() { config.state == ProfilerState::KINETO_PRIVATEUSE1_FALLBACK || config.state == ProfilerState::KINETO_ONDEMAND || config.state == ProfilerState::NVTX || - config.state == ProfilerState::ITT), + config.state == ProfilerState::ITT || + config.state == ProfilerState::PRIVATEUSE1), "Can't disable Kineto profiler when it's not running"); state_ptr->removeCallback(); @@ -685,9 +690,11 @@ std::unique_ptr disableProfiler() { return std::make_unique(); } - // Shared among NVTX, KINETO, KINETO_GPU_FALLBACK, KINETO_PRIVATEUSE1_FALLBACK + // Shared among NVTX, PRIVATEUSE1, KINETO, KINETO_GPU_FALLBACK, 
+ // KINETO_PRIVATEUSE1_FALLBACK std::unique_ptr result; - if (state_ptr->config().state == ProfilerState::NVTX) { + if (state_ptr->config().state == ProfilerState::NVTX || + state_ptr->config().state == ProfilerState::PRIVATEUSE1) { result = std::make_unique(); } diff --git a/torch/csrc/distributed/c10d/Functional.cpp b/torch/csrc/distributed/c10d/Functional.cpp index d392c0213b84..9d525f0d5640 100644 --- a/torch/csrc/distributed/c10d/Functional.cpp +++ b/torch/csrc/distributed/c10d/Functional.cpp @@ -196,6 +196,19 @@ at::Tensor all_gather_into_tensor( inputs, group_size, std::move(group_name))[0]; } +at::Tensor& all_gather_into_tensor_out( + at::Tensor& input, + int64_t group_size, + std::string group_name, + at::Tensor& output) { + c10d::AllgatherOptions opts; + + auto group = c10d::resolve_process_group(group_name); + auto work = group->_allgather_base(output, input, opts); + c10d::RankLocal::get().register_work(output, work); + return output; +} + at::Tensor allocate_reduce_scatter_output( const at::Tensor& input, const int64_t group_size) { @@ -321,6 +334,13 @@ TORCH_LIBRARY(_c10d_functional, m) { c10::DispatchKey::CompositeExplicitAutograd, ::all_reduce_coalesced_), {at::Tag::pt2_compliant_tag}); + m.def( + "all_gather_into_tensor_out(Tensor input, int group_size, str group_name, *, Tensor(a!) out) -> Tensor(a!)", + torch::dispatch( + c10::DispatchKey::CompositeExplicitAutograd, + ::all_gather_into_tensor_out), + {at::Tag::pt2_compliant_tag}); + m.def( "all_gather_into_tensor(Tensor input, int group_size, str group_name) -> Tensor", torch::dispatch( diff --git a/torch/csrc/distributed/c10d/HashStore.hpp b/torch/csrc/distributed/c10d/HashStore.hpp index 1453c0a72808..3697d62301ba 100644 --- a/torch/csrc/distributed/c10d/HashStore.hpp +++ b/torch/csrc/distributed/c10d/HashStore.hpp @@ -22,7 +22,7 @@ class TORCH_API HashStore : public Store { std::vector get(const std::string& key) override; void wait(const std::vector& keys) override { - wait(keys, Store::kDefaultTimeout); + wait(keys, timeout_); } void wait( diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp index 7437a4ef1846..7586058475ff 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp @@ -1,4 +1,3 @@ - #ifdef USE_C10D_NCCL #include @@ -64,6 +63,10 @@ std::map ncclDataType = { {at::kLong, ncclInt64}, {at::kHalf, ncclHalf}, {at::kBool, ncclUint8}, + {at::kFloat8_e5m2, ncclUint8}, + {at::kFloat8_e4m3fn, ncclUint8}, + {at::kFloat8_e4m3fnuz, ncclUint8}, + {at::kFloat8_e5m2fnuz, ncclUint8}, #if HAS_NCCL_BF16_DATATYPE {at::kBFloat16, ncclBfloat16}, #endif @@ -748,6 +751,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( // both timeout and other errors. 
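The `all_gather_into_tensor_out` op registered above is an out-variant: the caller supplies the output buffer rather than letting the op allocate one. A hypothetical Python driver might look like the sketch below; it assumes an initialized NCCL process group and that the group name string (here "0") resolves through `c10d::resolve_process_group`.

```python
import torch
import torch.distributed as dist

def demo_all_gather_out(group_name: str = "0") -> torch.Tensor:
    world_size = dist.get_world_size()
    inp = torch.full((4,), dist.get_rank(), device="cuda", dtype=torch.float32)
    out = torch.empty(world_size * 4, device="cuda", dtype=torch.float32)
    # The caller owns the output buffer; nothing is allocated inside the op.
    torch.ops._c10d_functional.all_gather_into_tensor_out(
        inp, world_size, group_name, out=out
    )
    return out
```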
dumpOnException_ = getCvarBool(TORCH_NCCL_DUMP_ON_TIMEOUT, false) || (dist_debug_level_ >= DebugLevel::Detail); + enableNanCheck_ = getCvarBool(TORCH_NCCL_NAN_CHECK, false); heartbeat_ = 1ULL; monitorThreadEnabled_.store(getCvarBool(TORCH_NCCL_ENABLE_MONITORING, true)); heartbeatTimeoutInSec_ = @@ -836,6 +840,7 @@ ProcessGroupNCCL::ProcessGroupNCCL( << ", TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC: " << heartbeatTimeoutInSec_ << ", TORCH_NCCL_TRACE_BUFFER_SIZE: " << ncclTraceBufferSize_ << ", TORCH_NCCL_COORD_CHECK_MILSEC: " << coordCheckIntervalMilSec_ + << ", TORCH_NCCL_NAN_CHECK: " << enableNanCheck_ << ", PG Name: " << options_->group_name; if (options_->global_ranks_in_group.empty()) { @@ -1565,6 +1570,8 @@ void ProcessGroupNCCL::watchdogHandler() { data.strings["last_enqueued_work_name"] = lastEnqueuedWorkName_; data.strings["last_started_work_name"] = lastStartedWorkName_; data.strings["last_completed_work_name"] = lastCompletedWorkName_; + data.strings["pg_name"] = pg_name_; + data.strings["pg_desc"] = pg_desc_; logger->log(data); lastStatusUpdateTime = std::chrono::steady_clock::now(); } @@ -2424,6 +2431,9 @@ c10::intrusive_ptr ProcessGroupNCCL::collective( OpType opType, const char* profilingTitle, bool avoidRecordStreams) { + if (enableNanCheck_) { + checkForNan(input); + } // Environment setting by the user may add onto collective call's option avoidRecordStreams |= avoidRecordStreams_; c10::cuda::CaptureStatus capture_status = @@ -2779,6 +2789,9 @@ c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( PreProcess pre, PostProcess post, const char* profilingTitle) { + if (enableNanCheck_) { + checkForNan(tensor); + } // avoidRecordStreams_ note: // send, recv, and irecv should be ok with avoidRecordStreams, // However, for isend, I don't think the API requires the user @@ -3029,6 +3042,9 @@ c10::intrusive_ptr ProcessGroupNCCL::allreduce_sparse( const AllreduceOptions& opts) { TORCH_CHECK(tensors.size() == 1, MULTI_DEVICE_ERROR_MSG); auto tensor = tensors.back(); + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); #ifdef IS_NCCLX tensor = tensor.coalesce(); at::Tensor outputTensor = @@ -3143,7 +3159,9 @@ c10::intrusive_ptr ProcessGroupNCCL::allreduce( return c10::make_intrusive(); } } - + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); // @lint-ignore CLANGTIDY RECORD_PARAM_COMMS_DATA( static_cast( @@ -3170,6 +3188,9 @@ c10::intrusive_ptr ProcessGroupNCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { auto total_numel = check_gpu_tensors_same_device(tensors); + TORCH_CHECK( + !isFloat8Type(tensors.back().scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); // @lint-ignore CLANGTIDY RECORD_PARAM_COMMS_DATA( @@ -3542,6 +3563,9 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( check_gpu_single_tensor(outputTensor); // @lint-ignore CLANGTIDY auto inputTensors_ = inputTensors.back(); + TORCH_CHECK( + !isFloat8Type(outputTensor.scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); RECORD_PARAM_COMMS_DATA( static_cast( @@ -3653,6 +3677,9 @@ c10::intrusive_ptr ProcessGroupNCCL::_reduce_scatter_base( // @lint-ignore CLANGTIDY const auto& tensor = outputTensor; + TORCH_CHECK( + !isFloat8Type(tensor.scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); RECORD_PARAM_COMMS_DATA( static_cast( this->getSequenceNumberForGroup() + 1), //
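`TORCH_NCCL_NAN_CHECK` is read once in the `ProcessGroupNCCL` constructor, so it has to be set before the process group is created. A sketch of how the check would surface in practice (single node, one rank per GPU assumed):

```python
import os
import torch
import torch.distributed as dist

# Illustrative sketch: the flag is consumed when ProcessGroupNCCL is
# constructed, so it must be exported before init_process_group.
os.environ["TORCH_NCCL_NAN_CHECK"] = "1"
dist.init_process_group("nccl")

t = torch.ones(8, device="cuda")
t[3] = float("nan")
# checkForNan scans the input on-device before the collective is issued;
# the NaN above trips CUDA_KERNEL_ASSERT (a device-side assert), so this
# call aborts the process rather than silently reducing garbage.
dist.all_reduce(t)
```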
seq + 1 to match collective @@ -3713,6 +3740,9 @@ c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter_tensor_coalesced( std::vector& outputs, std::vector& inputs, const ReduceScatterOptions& opts) { + TORCH_CHECK( + !isFloat8Type(inputs.back().scalar_type()), + "Float8 dtypes are not currently supported for NCCL reductions"); return collectiveCoalesced( inputs, outputs, diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp index 4217d2fa4cea..07f3730b1338 100644 --- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp +++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp @@ -100,6 +100,8 @@ static std::vector TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC = { static std::vector TORCH_NCCL_COORD_CHECK_MILSEC = { "TORCH_NCCL_COORD_CHECK_MILSEC"}; +static std::vector TORCH_NCCL_NAN_CHECK = {"TORCH_NCCL_NAN_CHECK"}; + constexpr const char* NCCL_BACKEND_NAME = "nccl"; constexpr const char* EXCEPTION_DUMP = "exception_dump"; @@ -1024,6 +1026,9 @@ class TORCH_API ProcessGroupNCCL : public Backend { // timeout and nccl errors. bool dumpOnException_; + // Whether or not to enable nan check for input tensors to collectives. + bool enableNanCheck_; + // Whether or not to create start CUDAEvent and enable timing for start // and end events. Note that enableTiming_ is always true if desyncDebug_ // is set to true. diff --git a/torch/csrc/distributed/c10d/Store.hpp b/torch/csrc/distributed/c10d/Store.hpp index af715ba98a79..993284fa7cc5 100644 --- a/torch/csrc/distributed/c10d/Store.hpp +++ b/torch/csrc/distributed/c10d/Store.hpp @@ -97,4 +97,33 @@ class TORCH_API Store : public torch::CustomClassHolder { std::chrono::milliseconds timeout_; }; +/* +StoreTimeoutGuard is a RAII guard that sets the store timeout and restores +the previous timeout when the guard goes out of scope.
+*/ +class StoreTimeoutGuard { + public: + explicit StoreTimeoutGuard( + Store& store, + const std::chrono::milliseconds& timeout) + : store_(store) { + oldTimeout_ = store.getTimeout(); + store.setTimeout(timeout); + } + + ~StoreTimeoutGuard() { + store_.setTimeout(oldTimeout_); + } + + /* Disabling copy and move semantics */ + StoreTimeoutGuard(const StoreTimeoutGuard&) = delete; + StoreTimeoutGuard& operator=(const StoreTimeoutGuard&) = delete; + StoreTimeoutGuard(StoreTimeoutGuard&&) = delete; + StoreTimeoutGuard& operator=(StoreTimeoutGuard&&) = delete; + + private: + Store& store_; + std::chrono::milliseconds oldTimeout_; +}; + } // namespace c10d diff --git a/torch/csrc/distributed/c10d/TraceUtils.h b/torch/csrc/distributed/c10d/TraceUtils.h index ff1bf5b6ed9a..181f2208160b 100644 --- a/torch/csrc/distributed/c10d/TraceUtils.h +++ b/torch/csrc/distributed/c10d/TraceUtils.h @@ -15,6 +15,34 @@ #include namespace c10d { +static c10::IValue entries_key = "entries"; +static c10::IValue nccl_comm_key = "nccl_comm_state"; +static c10::IValue version_key = "version"; +// Update whenever changing contents or formatting of the dump +// (minor when adding fields, major when changing existing fields) +static c10::IValue version_val = "1.5"; +static c10::IValue pg_config_key = "pg_config"; +static c10::IValue record_id_key = "record_id"; +static c10::IValue pg_id_key = "pg_id"; +static c10::IValue pg_name_key = "process_group"; +static c10::IValue seq_id_key = "seq_id"; +static c10::IValue op_id_key = "op_id"; +static c10::IValue profiling_name_key = "profiling_name"; +static c10::IValue input_sizes_key = "input_sizes"; +static c10::IValue output_sizes_key = "output_sizes"; +static c10::IValue time_created_key = "time_created_ns"; +static c10::IValue duration_key = "duration_ms"; + +static c10::IValue frames_key = "frames"; +static c10::IValue state_key = "state"; +static c10::IValue line_key = "line"; +static c10::IValue name_key = "name"; +static c10::IValue filename_key = "filename"; +static c10::IValue retired_key = "retired"; +static c10::IValue time_discovered_started_key = "time_discovered_started_ns"; +static c10::IValue time_discovered_completed_key = + "time_discovered_completed_ns"; + /* Trace Utils Related to TORCH_NCCL_DESYNC_DEBUG */ inline std::string getTraceStartKey(const std::string& pgName, int rank) { @@ -606,32 +634,6 @@ struct NCCLTraceBuffer { std::unordered_map>>& ncclDumpMap) { auto result = dump_entries(); auto entries = new_list(); - c10::IValue entries_key = "entries"; - c10::IValue nccl_comm_key = "nccl_comm_state"; - c10::IValue version_key = "version"; - // Update whenever changing contents or formatting of the dump - // (minor when adding fields, major when changing existing fields) - c10::IValue version_val = "1.5"; - c10::IValue pg_config_key = "pg_config"; - c10::IValue record_id_key = "record_id"; - c10::IValue pg_id_key = "pg_id"; - c10::IValue pg_name_key = "process_group"; - c10::IValue seq_id_key = "seq_id"; - c10::IValue op_id_key = "op_id"; - c10::IValue profiling_name_key = "profiling_name"; - c10::IValue input_sizes_key = "input_sizes"; - c10::IValue output_sizes_key = "output_sizes"; - c10::IValue time_created_key = "time_created_ns"; - c10::IValue duration_key = "duration_ms"; - - c10::IValue frames_key = "frames"; - c10::IValue state_key = "state"; - c10::IValue line_key = "line"; - c10::IValue name_key = "name"; - c10::IValue filename_key = "filename"; - c10::IValue retired_key = "retired"; - c10::IValue time_discovered_started_key = 
"time_discovered_started_ns"; - c10::IValue time_discovered_completed_key = "time_discovered_completed_ns"; std::vector tracebacks; for (auto& e : result) { diff --git a/torch/csrc/distributed/c10d/Utils.cu b/torch/csrc/distributed/c10d/Utils.cu new file mode 100644 index 000000000000..1a4b3ebb651b --- /dev/null +++ b/torch/csrc/distributed/c10d/Utils.cu @@ -0,0 +1,45 @@ +#include +#include +#include +#include +#include +#include + +namespace c10d { + +// CUDA kernel to check if data has NAN, device side assert +// is raised if NAN is found +template +__global__ void checkForNaN(T* data, size_t size) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + + for (size_t i = tid; i < size; i += stride) { + CUDA_KERNEL_ASSERT(!isnan(data[i])); + } +} + +// CHECK if a Tensor contains NAN in any of its element +void checkForNan(const at::Tensor& tensor) { + // skip check for non float types + if (!torch::is_floating_point(tensor)) { + return; + } + const size_t maxNumThreadsPerBlock = 256; + const size_t maxNumBlocks = 24; + const size_t numThreadsPerBlock = + std::min(maxNumThreadsPerBlock, tensor.numel()); + + const size_t numBlocks = std::min( + maxNumBlocks, + (tensor.numel() + numThreadsPerBlock - 1) / numThreadsPerBlock); + + AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor.scalar_type(), "checkForNaN", [&] { + checkForNaN<<>>( + tensor.data_ptr(), tensor.numel()); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + }); + +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/Utils.hpp b/torch/csrc/distributed/c10d/Utils.hpp index 8427b63e38e8..b193c8971b57 100644 --- a/torch/csrc/distributed/c10d/Utils.hpp +++ b/torch/csrc/distributed/c10d/Utils.hpp @@ -612,6 +612,8 @@ using SizeType = uint64_t; // Since SOCKET_ERROR = -1 in MSVC, so also leverage SYSCHECK_ERR_RETURN_NEG1 #define SYSCHECK_ERR_RETURN_NEG1(expr) SYSCHECK(expr, __output != -1) +void checkForNan(const at::Tensor& tensor); + namespace tcputil { // Send and receive diff --git a/torch/csrc/distributed/c10d/control_collectives/ControlCollectives.hpp b/torch/csrc/distributed/c10d/control_collectives/ControlCollectives.hpp new file mode 100644 index 000000000000..b98f9a71fb02 --- /dev/null +++ b/torch/csrc/distributed/c10d/control_collectives/ControlCollectives.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include + +namespace c10d { + +using namespace std::chrono_literals; + +class TORCH_API ControlCollectives : public torch::CustomClassHolder { + public: + virtual void barrier( + const std::string& key, + std::chrono::milliseconds timeout = 5min, + bool block = true) = 0; + + virtual void broadcastSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) = 0; + virtual std::vector broadcastRecv( + const std::string& key, + std::chrono::milliseconds timeout = 5min) = 0; + + virtual void gatherSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) = 0; + virtual std::vector> gatherRecv( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) = 0; + + virtual std::vector scatterSend( + const std::string& key, + const std::vector>& data, + std::chrono::milliseconds timeout = 5min) = 0; + virtual std::vector scatterRecv( + const std::string& key, + std::chrono::milliseconds timeout = 5min) = 0; + + virtual std::vector> allGather( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds 
timeout = 5min) = 0; + + virtual int64_t allSum( + const std::string& key, + int64_t data, + std::chrono::milliseconds timeout = 5min) = 0; +}; + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp new file mode 100644 index 000000000000..995899441d46 --- /dev/null +++ b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.cpp @@ -0,0 +1,222 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace { +std::string getRankKey(const std::string& key, int rank) { + return fmt::format("{}/{}", key, rank); +} +} // namespace + +namespace c10d { + +StoreCollectives::StoreCollectives( + c10::intrusive_ptr<::c10d::Store> store, + int rank, + int worldSize) + : store_(std::move(store)), rank_(rank), worldSize_(worldSize) {} + +void StoreCollectives::barrier( + const std::string& key, + std::chrono::milliseconds timeout, + bool blocking) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + auto num_members_key = fmt::format("{}/num_members", key); + auto last_members_key = fmt::format("{}/last_members", key); + + auto idx = store_->add(num_members_key, 1); + store_->set(getRankKey(key, rank_), "joined"); + + if (idx == worldSize_) { + store_->set(last_members_key, ""); + } else if (blocking) { + try { + store_->wait({last_members_key}); + } catch (const std::exception& e) { + std::string msg = "barrier failed -- missing ranks: "; + for (int i = 0; i < worldSize_; i++) { + if (i == rank_) { + continue; + } + auto rank_key = getRankKey(key, i); + if (!store_->check({rank_key})) { + msg += fmt::format("{}, ", i); + } + } + throw std::runtime_error(msg + e.what()); + } + } +} + +void StoreCollectives::broadcastSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + store_->set(key, data); +} + +std::vector StoreCollectives::broadcastRecv( + const std::string& key, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + return store_->get(key); +} + +void StoreCollectives::gatherSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + auto rank_key = getRankKey(key, rank_); + store_->set(rank_key, data); +} + +std::vector> StoreCollectives::gatherRecv( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + std::vector keys; + keys.reserve(worldSize_); + + for (int i = 0; i < worldSize_; i++) { + if (i == rank_) { + continue; + } + auto rank_key = getRankKey(key, i); + keys.emplace_back(rank_key); + } + + std::vector> results; + results.reserve(worldSize_); + + try { + results = store_->multiGet(keys); + } catch (const std::exception& e) { + std::string msg = "gather failed -- missing ranks: "; + for (int i = 0; i < worldSize_; i++) { + if (i == rank_) { + continue; + } + auto rank_key = getRankKey(key, i); + if (!store_->check({rank_key})) { + msg += fmt::format("{}, ", i); + } + } + throw std::runtime_error(msg + e.what()); + } + + // insert local data + results.insert(results.begin() + rank_, data); + return results; +} + +std::vector StoreCollectives::scatterSend( + const std::string& key, + const std::vector>& data, + std::chrono::milliseconds timeout) { + 
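The store-based barrier above is a counter-plus-release pattern: every rank atomically increments an arrival counter, the last arrival publishes a release key, and everyone else waits on it. A compact Python restatement against the public `Store` API (omitting the timeout guard and the missing-rank diagnostics):

```python
def store_barrier(store, key: str, rank: int, world_size: int) -> None:
    num_members_key = f"{key}/num_members"
    last_members_key = f"{key}/last_members"

    arrival = store.add(num_members_key, 1)   # atomically count arrivals
    store.set(f"{key}/{rank}", "joined")      # per-rank marker for error reports

    if arrival == world_size:
        store.set(last_members_key, "")       # last arrival releases everyone
    else:
        store.wait([last_members_key])        # block until the release key appears
```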
enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + std::vector keys; + keys.reserve(worldSize_); + for (int i = 0; i < worldSize_; i++) { + if (i == rank_) { + continue; + } + auto rank_key = getRankKey(key, i); + keys.emplace_back(rank_key); + } + auto local = data.at(rank_); + + std::vector> toSend{data}; + + toSend.erase(toSend.begin() + rank_); + + store_->multiSet(keys, toSend); + + return local; +} + +std::vector StoreCollectives::scatterRecv( + const std::string& key, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + auto rank_key = getRankKey(key, rank_); + return store_->get(rank_key); +} + +std::vector> StoreCollectives::allGather( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + auto localKey = getRankKey(key, rank_); + store_->set(localKey, data); + + std::vector keys; + keys.reserve(worldSize_); + + for (int i = 0; i < worldSize_; i++) { + auto rank_key = getRankKey(key, i); + keys.emplace_back(rank_key); + } + + try { + return store_->multiGet(keys); + } catch (const std::exception& e) { + std::string msg = "all_gather failed -- missing ranks: "; + for (int i = 0; i < worldSize_; i++) { + if (i == rank_) { + continue; + } + auto rank_key = getRankKey(key, i); + if (!store_->check({rank_key})) { + msg += fmt::format("{}, ", i); + } + } + throw std::runtime_error(msg + e.what()); + } +} + +int64_t StoreCollectives::allSum( + const std::string& key, + int64_t value, + std::chrono::milliseconds timeout) { + enforceUnique(key); + StoreTimeoutGuard g{*store_, timeout}; + + store_->add(key, value); + + barrier(key + "/barrier", timeout); + + return store_->add(key, 0); +} + +void StoreCollectives::enforceUnique(const std::string& key) { + auto it = seenKeys_.find(key); + TORCH_INTERNAL_ASSERT( + it == seenKeys_.end(), "Key ", key, " has already been used."); + seenKeys_.emplace(key); +} + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.hpp b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.hpp new file mode 100644 index 000000000000..7d3eb5038565 --- /dev/null +++ b/torch/csrc/distributed/c10d/control_collectives/StoreCollectives.hpp @@ -0,0 +1,68 @@ +#pragma once + +#include +#include +#include +#include + +namespace c10d { + +class TORCH_API StoreCollectives : public ControlCollectives { + public: + explicit StoreCollectives( + c10::intrusive_ptr store, + int rank, + int worldSize); + + void barrier( + const std::string& key, + std::chrono::milliseconds timeout = 5min, + bool block = true) override; + + void broadcastSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) override; + std::vector broadcastRecv( + const std::string& key, + std::chrono::milliseconds timeout = 5min) override; + + void gatherSend( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) override; + std::vector> gatherRecv( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) override; + + std::vector scatterSend( + const std::string& key, + const std::vector>& data, + std::chrono::milliseconds timeout = 5min) override; + std::vector scatterRecv( + const std::string& key, + std::chrono::milliseconds timeout = 5min) override; + + std::vector> allGather( + const std::string& key, + const std::vector& data, + std::chrono::milliseconds 
timeout = 5min) override; + + int64_t allSum( + const std::string& key, + int64_t data, + std::chrono::milliseconds timeout = 5min) override; + + private: + void enforceUnique(const std::string& key); + + private: + c10::intrusive_ptr store_; + int rank_; + int worldSize_; + + c10::FastSet seenKeys_{}; +}; + +} // namespace c10d diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index 483becbce009..505b64e2a697 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -6,6 +6,9 @@ #include #include #include +#include +#include +#include #ifndef _WIN32 #include #include @@ -136,6 +139,34 @@ namespace torch::distributed::c10d { namespace { +py::bytes toPyBytes(const std::vector& data) { + return py::bytes(reinterpret_cast(data.data()), data.size()); +} + +std::vector toPyBytes( + const std::vector>& data) { + std::vector out; + out.reserve(data.size()); + for (const std::vector& data_ : data) { + out.emplace_back(reinterpret_cast(data_.data()), data_.size()); + } + return out; +} + +std::vector toVec8(const std::string& data) { + std::vector out{data.begin(), data.end()}; + return out; +} + +std::vector> toVec8(const std::vector& data) { + std::vector> out; + out.reserve(data.size()); + for (auto& data_ : data) { + out.emplace_back(toVec8(data_)); + } + return out; +} + template using shared_ptr_class_ = py::class_>; @@ -166,8 +197,7 @@ class PythonStore : public ::c10d::Store { pybind11::get_overload(static_cast(this), "set"); TORCH_INTERNAL_ASSERT(fn, "Not implemented."); // Call function with a py::bytes object for the value. - fn(key, - py::bytes(reinterpret_cast(value.data()), value.size())); + fn(key, toPyBytes(value)); } // Note: this function manually calls the Python-side overload @@ -184,7 +214,7 @@ class PythonStore : public ::c10d::Store { // std::vector. There is no API for directly accessing // the contents of the py::bytes object. std::string str = pybind11::cast(fn(key)); - return std::vector(str.begin(), str.end()); + return toVec8(str); } // Note: this function manually calls the Python-side overload @@ -204,14 +234,8 @@ class PythonStore : public ::c10d::Store { // std::vector. There is no API for directly accessing // the contents of the py::bytes object. std::string str = pybind11::cast( - fn(key, - py::bytes( - reinterpret_cast(expectedValue.data()), - expectedValue.size()), - py::bytes( - reinterpret_cast(desiredValue.data()), - desiredValue.size()))); - return std::vector(str.begin(), str.end()); + fn(key, toPyBytes(expectedValue), toPyBytes(desiredValue))); + return toVec8(str); } int64_t add(const std::string& key, int64_t value) override { @@ -253,8 +277,7 @@ class PythonStore : public ::c10d::Store { return Store::append(key, value); } // Call function with a py::bytes object for the value. - fn(key, - py::bytes(reinterpret_cast(value.data()), value.size())); + fn(key, toPyBytes(value)); } std::vector> multiGet( @@ -287,14 +310,7 @@ class PythonStore : public ::c10d::Store { return Store::multiSet(keys, values); } - std::vector bytes; - bytes.reserve(values.size()); - for (auto& value : values) { - bytes.emplace_back( - reinterpret_cast(value.data()), value.size()); - } - - fn(keys, bytes); + fn(keys, toPyBytes(values)); } bool hasExtendedApi() const override { @@ -973,10 +989,7 @@ and :class:`~torch.distributed.HashStore`). 
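These bindings round-trip Python strings through byte vectors via the new `toVec8`/`toPyBytes` helpers, so values written as `str` come back as `bytes`. A short usage sketch; the host and port are placeholders:

```python
from datetime import timedelta
from torch.distributed import TCPStore

store = TCPStore("127.0.0.1", 29500, world_size=1, is_master=True,
                 timeout=timedelta(seconds=30))
store.set("key", "value")               # str is converted to bytes on the way in
assert store.get("key") == b"value"     # and comes back out as bytes
# compare_set only swaps when the stored value matches the expected value.
store.compare_set("key", "value", "value2")
assert store.get("key") == b"value2"
```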
"set", [](::c10d::Store& store, const std::string& key, - const std::string& value) { - std::vector value_(value.begin(), value.end()); - store.set(key, value_); - }, + const std::string& value) { store.set(key, toVec8(value)); }, py::call_guard(), R"( Inserts the key-value pair into the store based on the supplied ``key`` and @@ -1001,14 +1014,9 @@ Example:: const std::string& key, const std::string& expected_value, const std::string& desired_value) -> py::bytes { - std::vector expectedValue_( - expected_value.begin(), expected_value.end()); - std::vector desiredValue_( - desired_value.begin(), desired_value.end()); - auto value = - store.compareSet(key, expectedValue_, desiredValue_); - return py::bytes( - reinterpret_cast(value.data()), value.size()); + auto value = store.compareSet( + key, toVec8(expected_value), toVec8(desired_value)); + return toPyBytes(value); }, py::call_guard(), R"( @@ -1040,8 +1048,7 @@ Example:: py::gil_scoped_release guard; return store.get(key); }(); - return py::bytes( - reinterpret_cast(value.data()), value.size()); + return toPyBytes(value); }, R"( Retrieves the value associated with the given ``key`` in the store. If ``key`` is not @@ -1240,8 +1247,7 @@ Example:: [](::c10d::Store& store, const std::string& key, const std::string& value) { - std::vector value_(value.begin(), value.end()); - store.append(key, value_); + store.append(key, toVec8(value)); }, py::call_guard(), R"( @@ -1268,14 +1274,7 @@ Example:: py::gil_scoped_release guard; return store.multiGet(keys); }(); - std::vector res; - for (auto& value : values) { - auto bytes = py::bytes( - reinterpret_cast(value.data()), - value.size()); - res.push_back(bytes); - } - return res; + return toPyBytes(values); }, R"( Retrieve all values in ``keys``. If any key in ``keys`` is not @@ -1298,12 +1297,7 @@ Example:: [](::c10d::Store& store, const std::vector& keys, const std::vector& values) { - std::vector> vals; - vals.reserve(values.size()); - for (auto& value : values) { - vals.emplace_back(value.begin(), value.end()); - } - store.multiSet(keys, vals); + store.multiSet(keys, toVec8(values)); }, py::call_guard(), R"( @@ -1487,6 +1481,212 @@ that adds a prefix to each key inserted to the store. &::c10d::PrefixStore::getUnderlyingNonPrefixStore, R"(Recursively to get the store before layers of wrapping with PrefixStore.)"); + using namespace std::chrono_literals; + + auto collectives = + py::class_< + ::c10d::ControlCollectives, + c10::intrusive_ptr<::c10d::ControlCollectives>>( + module, + "_ControlCollectives", + R"( +Base class for all ControlCollectives implementations. +)") + .def( + "barrier", + &::c10d::ControlCollectives::barrier, + py::arg("key"), + py::arg("timeout") = 5min, + py::arg("block") = true, + py::call_guard(), + R"( +Blocks until all workers have entered this function. + +Arguments: + key (str): The unique key used to identify this operation. + timeout (duration): The timeout for this operation. + block (bool): whether to block this working waiting on the results of the barrier. +)") + .def( + "all_sum", + &::c10d::ControlCollectives::allSum, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + py::call_guard(), + R"( +Computes a sum across all workers and returns the final value. + +Arguments: + key (str): The unique key used to identify this operation. + data (int): The data to sum. + timeout (duration): The timeout for this operation. 
+)") + .def( + "broadcast_send", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + const std::string& data, + std::chrono::milliseconds timeout = 5min) { + collectives.broadcastSend(key, toVec8(data), timeout); + }, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + py::call_guard(), + R"( +Sends data to all other workers. Must be only called from one worker. + +Arguments: + key (str): The unique key used to identify this operation. + data (str): The data to send. + timeout (duration): The timeout for this operation. +)") + .def( + "broadcast_recv", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + std::chrono::milliseconds timeout = 5min) { + auto out = [&]() { + py::gil_scoped_release guard; + return collectives.broadcastRecv(key, timeout); + }(); + return toPyBytes(out); + }, + py::arg("key"), + py::arg("timeout") = 5min, + R"( +Receives data broadcasted from 1 worker. + +Arguments: + key (str): The unique key used to identify this operation. + timeout (duration): The timeout for this operation. +)") + .def( + "gather_send", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + const std::string& data, + std::chrono::milliseconds timeout = 5min) { + collectives.gatherSend(key, toVec8(data), timeout); + }, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + py::call_guard(), + R"( +Sends data to one other worker. + +Arguments: + key (str): The unique key used to identify this operation. + data (str): The data to send. + timeout (duration): The timeout for this operation. +)") + .def( + "gather_recv", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + const std::string& data, + std::chrono::milliseconds timeout = 5min) { + auto out = [&]() { + py::gil_scoped_release guard; + return collectives.gatherRecv(key, toVec8(data), timeout); + }(); + return toPyBytes(out); + }, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + R"( +Receives data broadcasted from all workers. Must only be called by one worker. + +Arguments: + key (str): The unique key used to identify this operation. + timeout (duration): The timeout for this operation. +)") + + .def( + "scatter_send", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + const std::vector& data, + std::chrono::milliseconds timeout = 5min) { + auto out = [&]() { + py::gil_scoped_release guard; + return collectives.scatterSend(key, toVec8(data), timeout); + }(); + return toPyBytes(out); + }, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + R"( +Sends rank specific data to all other workers. + +Arguments: + key (str): The unique key used to identify this operation. + data (str): The data to send. + timeout (duration): The timeout for this operation. +)") + .def( + "scatter_recv", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + std::chrono::milliseconds timeout = 5min) { + auto out = [&]() { + py::gil_scoped_release guard; + return collectives.scatterRecv(key, timeout); + }(); + return toPyBytes(out); + }, + py::arg("key"), + py::arg("timeout") = 5min, + R"( +Receives rank specific data from one worker. + +Arguments: + key (str): The unique key used to identify this operation. + timeout (duration): The timeout for this operation. 
+)") + + .def( + "all_gather", + [](::c10d::ControlCollectives& collectives, + const std::string& key, + const std::string& data, + std::chrono::milliseconds timeout = 5min) { + auto out = [&]() { + py::gil_scoped_release guard; + return collectives.allGather(key, toVec8(data), timeout); + }(); + return toPyBytes(out); + }, + py::arg("key"), + py::arg("data"), + py::arg("timeout") = 5min, + R"( +Sends data to all workers and receives data from all other workers. + +Arguments: + key (str): The unique key used to identify this operation. + data (str): The data to send. + timeout (duration): The timeout for this operation. +)"); + + intrusive_ptr_class_<::c10d::StoreCollectives>( + module, + "_StoreCollectives", + collectives, + R"( +An implementation of ControlCollectives that uses the provided store as the underlying +communication mechanism. + )") + .def( + py::init, int, int>(), + py::arg("store"), + py::arg("rank"), + py::arg("world_size")); + auto processGroup = py::class_< ::c10d::ProcessGroup, diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp index 238050f50122..1ada9415ea12 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.cpp @@ -96,8 +96,13 @@ bool unpack_tensors( const std::vector& arguments, const torch::jit::Stack& stack, const c10::Device& device, - std::vector& inputs) { + std::vector& inputs, + bool with_scalar = false) { for (size_t idx = 0; idx < stack.size(); idx++) { + if (!with_scalar && stack[idx].isScalar()) { + continue; + } + if (!unpack_ivalue(arguments[idx], stack[idx], device, inputs)) { return false; } @@ -106,6 +111,40 @@ bool unpack_tensors( return true; } +std::vector get_tensor_parameter_index( + const std::vector& arguments, + const torch::jit::Stack& stack) { + std::vector tensor_parameter_index; + for (size_t idx = 0; idx < stack.size(); idx++) { + if (stack[idx].isScalar() || stack[idx].isTensor()) { + // scalar and tensor + tensor_parameter_index.push_back(idx); + } else if (stack[idx].isTensorList()) { + // tensor list + std::fill_n( + std::back_inserter(tensor_parameter_index), + stack[idx].toListRef().size(), + idx); + } else if (stack[idx].isOptionalTensorList()) { + // optional tensor list: std::vector> + for (const auto& item : stack[idx].toListRef()) { + if (item.toOptional().has_value()) { + tensor_parameter_index.push_back(idx); + } + } + } else if ( + *arguments[idx].real_type() == + *c10::getTypePtr>()) { + // optional tensor + if (stack[idx].toOptional().has_value()) { + tensor_parameter_index.push_back(idx); + } + } + } + + return tensor_parameter_index; +} + } // namespace AOTIPythonKernelHolder::AOTIPythonKernelHolder( @@ -149,14 +188,19 @@ bool AOTIPythonKernelHolder::cache_lookup( "Not implemented for operations that return a non-Tensor value."); std::vector inputs; - auto res = unpack_tensors(op.schema().arguments(), *stack, device_, inputs); + auto res = + unpack_tensors(op.schema().arguments(), *stack, device_, inputs, true); TORCH_CHECK_NOT_IMPLEMENTED( res && inputs.size() > 0, "Not implemented for operations that contain a parameter which is ", "not one of the following types: at::Tensor, at::TensorList, ", "std::optional, std::vector>."); - auto inputs_metadata = get_inputs_metadata(inputs); + auto tensor_parameter_index = + get_tensor_parameter_index(op.schema().arguments(), *stack); + TORCH_INTERNAL_ASSERT(tensor_parameter_index.size() == inputs.size()); + auto inputs_metadata = get_inputs_metadata( + 
inputs, op.schema().arguments(), tensor_parameter_index); auto aoti_kernel_state = aoti_kernel_cache_.find(inputs_metadata); if (aoti_kernel_state == aoti_kernel_cache_.end()) { return false; @@ -197,18 +241,49 @@ void AOTIPythonKernelHolder::cache_hit( } AOTIKernelMetadata AOTIPythonKernelHolder::get_inputs_metadata( - const std::vector& inputs) { + const std::vector& inputs, + const std::vector& inputs_argument, + const std::vector& inputs_argument_index) { AOTIKernelMetadata inputs_metadata; - for (const auto& input : inputs) { + for (size_t idx = 0; idx < inputs.size(); ++idx) { + auto input = inputs[idx]; + auto input_info = inputs_argument[inputs_argument_index[idx]]; + auto device = input.device(); if (device.is_cpu()) { // If the device is CPU, set the device index to -1. device = c10::Device(device.type(), -1); } + c10::Scalar scalar_value((double)1.0); + auto tensor_type = input.scalar_type(); + + bool is_scalar = input_info.type()->isSubtypeOf(*c10::NumberType::get()); + if (is_scalar) { + if (c10::isFloatingType(input.scalar_type())) { + auto scalar_numeric_value = input.item().toDouble(); + tensor_type = c10::ScalarType::Double; + scalar_value = c10::Scalar(scalar_numeric_value); + } else if (c10::isIntegralType(input.scalar_type(), false)) { + auto scalar_numeric_value = input.item().toUInt64(); + tensor_type = c10::ScalarType::UInt64; + scalar_value = c10::Scalar(scalar_numeric_value); + } else if (input.scalar_type() == c10::ScalarType::Bool) { + auto scalar_numeric_value = input.item().toBool(); + tensor_type = c10::ScalarType::Bool; + scalar_value = c10::Scalar(scalar_numeric_value); + } else { + TORCH_CHECK( + false, + "Unsupported scalar tensor type: ", + c10::toString(input.scalar_type())); + } + } + inputs_metadata.emplace_back( - false, // is symbloic - input.scalar_type(), + false, + tensor_type, + c10::IValue(scalar_value), device, input.sizes().vec(), input.strides().vec()); @@ -269,6 +344,7 @@ void AOTIPythonKernelHolder::init_aoti_kernel_cache() { reinterpret_cast(data_type_obj.ptr())->scalar_type; auto sizes = metadata["sizes"].cast>(); auto strides = metadata["strides"].cast>(); + bool is_scalar = metadata.contains("scalar_value"); std::vector> sym_optional_sizes; std::vector> sym_optional_strides; @@ -279,10 +355,34 @@ void AOTIPythonKernelHolder::init_aoti_kernel_cache() { sym_optional_strides.push_back(std::optional(stride)); } - // Now you can use these variables in your code + // If an input parameter is a scalar, its detailed value is cached. + // This is done to ensure correctness during subsequent checks. 
+ c10::Scalar scalar_value((double)1.0); + if (is_scalar) { + if (c10::isFloatingType(data_type)) { + auto scalar_numeric_value = metadata["scalar_value"].cast(); + data_type = c10::ScalarType::Double; + scalar_value = c10::Scalar(scalar_numeric_value); + } else if (c10::isIntegralType(data_type, false)) { + auto scalar_numeric_value = metadata["scalar_value"].cast(); + data_type = c10::ScalarType::UInt64; + scalar_value = c10::Scalar(scalar_numeric_value); + } else if (data_type == c10::ScalarType::Bool) { + auto scalar_numeric_value = metadata["scalar_value"].cast(); + data_type = c10::ScalarType::Bool; + scalar_value = c10::Scalar(scalar_numeric_value); + } else { + TORCH_CHECK( + false, + "Unsupported scalar tensor type: ", + c10::toString(data_type)); + } + } + tensor_metadata_list.emplace_back( is_dynamic, data_type, + c10::IValue(scalar_value), c10::Device(c10::Device(device_type).type(), device_index), sizes, strides); diff --git a/torch/csrc/inductor/aoti_eager/kernel_holder.h b/torch/csrc/inductor/aoti_eager/kernel_holder.h index 9cbcc217d7c3..b67e4e7d4464 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_holder.h +++ b/torch/csrc/inductor/aoti_eager/kernel_holder.h @@ -3,6 +3,7 @@ #include #include +#include #include #include @@ -82,7 +83,10 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel { void init_aoti_kernel_cache(); // Abstract the meta information of each tensor for the given operation. The // meta infomation will be used for cache lookup as the key. - AOTIKernelMetadata get_inputs_metadata(const std::vector&); + AOTIKernelMetadata get_inputs_metadata( + const std::vector& inputs, + const std::vector& inputs_argument, + const std::vector& inputs_argument_index); // Load the AOTIModelContainerRunner object from the given file path. 
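The branch above canonicalizes cached scalar inputs (floating point widens to Double, integral to UInt64, Bool stays Bool) and records the concrete value, since two calls that differ only in a scalar argument must not share a kernel-cache entry. A toy Python model of that keying decision (illustrative, not the real cache):

```python
import torch

def scalar_cache_component(value):
    # Mirror of the canonicalization above.
    if isinstance(value, bool):
        return ("bool", value)
    if isinstance(value, int):
        return ("uint64", value)
    return ("double", float(value))

def toy_cache_key(tensor, scalar):
    return (tensor.dtype, tuple(tensor.shape), tuple(tensor.stride()),
            scalar_cache_component(scalar))

x = torch.randn(2, 3)
# Same tensor metadata, different scalar value -> different cache entries.
assert toy_cache_key(x, 2.0) != toy_cache_key(x, 3.0)
```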
std::shared_ptr load_aoti_model_runner( const std::string&); diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp index e89c59142328..a49fab21d671 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.cpp @@ -1,5 +1,6 @@ #if !defined(C10_MOBILE) && !defined(ANDROID) #include +#include namespace torch::inductor { @@ -17,6 +18,24 @@ TensorMetadata::TensorMetadata( std::vector strides) : is_symbolic_(is_symbolic), dtype_(dtype), + scalar_value_((float)1.0), + device_(device), + sizes_(sizes), + strides_(strides) { + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + !is_symbolic_, "Not support symbolic shape now"); +} + +TensorMetadata::TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::IValue scalar_value, + c10::Device device, + std::vector sizes, + std::vector strides) + : is_symbolic_(is_symbolic), + dtype_(dtype), + scalar_value_(scalar_value), device_(device), sizes_(sizes), strides_(strides) { @@ -29,15 +48,39 @@ bool TensorMetadata::operator==(const TensorMetadata& other) const { !is_symbolic_, "Not support symbolic shape now"); return this->is_symbolic_ == other.is_symbolic_ && this->dtype_ == other.dtype_ && + this->scalar_value_ == other.scalar_value_ && this->device_.type() == other.device_.type() && this->sizes_ == other.sizes_ && this->strides_ == other.strides_; } +std::ostream& operator<<( + std::ostream& stream, + const TensorMetadata& tensor_metadata) { + stream << "is_symbolic_: " << tensor_metadata.is_symbolic_ << std::endl; + stream << "dtype_: " << tensor_metadata.dtype_ << std::endl; + stream << "scalar_value_: " << tensor_metadata.scalar_value_.type()->str() + << "(" << tensor_metadata.scalar_value_ << ")" << std::endl; + stream << "device_: " << tensor_metadata.device_ << std::endl; + stream << "sizes_: "; + for (const auto& size : tensor_metadata.sizes_) { + stream << size << " "; + } + stream << std::endl; + stream << "strides_: "; + for (const auto& stride : tensor_metadata.strides_) { + stream << stride << " "; + } + stream << std::endl; + return stream; +} + size_t TensorMetadataHash::operator()( const TensorMetadata& tensor_metadata) const { auto hash = std::hash()(tensor_metadata.is_symbolic_); hash = c10::hash_combine( hash, std::hash()(tensor_metadata.dtype_)); + hash = + c10::hash_combine(hash, c10::IValue::hash(tensor_metadata.scalar_value_)); hash = c10::hash_combine( hash, std::hash()(tensor_metadata.device_.type())); diff --git a/torch/csrc/inductor/aoti_eager/kernel_meta_info.h b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h index c7f8315d2707..5c22e9b75f65 100644 --- a/torch/csrc/inductor/aoti_eager/kernel_meta_info.h +++ b/torch/csrc/inductor/aoti_eager/kernel_meta_info.h @@ -33,6 +33,8 @@ struct TensorMetadata { bool is_symbolic_; // Dtype of a tensor(For scalar, we will wrap it as a scalar tensor) c10::ScalarType dtype_; + // Concrete scalar value. Serve for operations w/ scalar parameter + c10::IValue scalar_value_; // Device of a tensor. c10::Device device_; // Sizes of a tensor. 
Currently, we only support static shape and use int64_t @@ -49,6 +51,13 @@ struct TensorMetadata { c10::Device device, std::vector sizes, std::vector strides); + TensorMetadata( + bool is_symbolic, + c10::ScalarType dtype, + c10::IValue scalar_value, + c10::Device device, + std::vector sizes, + std::vector strides); bool operator==(const TensorMetadata& other) const; }; diff --git a/torch/csrc/inductor/aoti_torch/c/shim.h b/torch/csrc/inductor/aoti_torch/c/shim.h index b05c52c6a387..6fa7df75c056 100644 --- a/torch/csrc/inductor/aoti_torch/c/shim.h +++ b/torch/csrc/inductor/aoti_torch/c/shim.h @@ -72,6 +72,9 @@ extern "C" { struct AtenTensorOpaque; using AtenTensorHandle = AtenTensorOpaque*; +struct AtenGeneratorOpaque; +using AtenGeneratorHandle = AtenGeneratorOpaque*; + struct AOTIProxyExecutorOpaque; using AOTIProxyExecutorHandle = AOTIProxyExecutorOpaque*; diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h index f31c52408aa7..2c7f05dd84cd 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cpu.h @@ -47,6 +47,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool2d(AtenTensorHandle self AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_avg_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bernoulli__Tensor(AtenTensorHandle self, AtenTensorHandle p, AtenGeneratorHandle* generator); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bernoulli__float(AtenTensorHandle self, double p, AtenGeneratorHandle* generator); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_bucketize_Tensor(AtenTensorHandle self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cat(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0); @@ -58,6 +60,7 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummax(AtenTensorHandle self, in AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cummin(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cumprod(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_cumsum(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT 
AOTITorchError aoti_torch_cpu_exponential(AtenTensorHandle self, double lambd, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool2d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_fractional_max_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); @@ -67,6 +70,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_geqrf(AtenTensorHandle self, Ate AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_histogram_bin_ct(AtenTensorHandle self, int64_t bins, const double** range, int64_t range_len_, AtenTensorHandle* weight, int32_t density, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -83,25 +88,38 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_median(AtenTensorHandle self, At AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, 
AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_rand_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_generator(int64_t high, const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randn_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad1d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_replication_pad2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format); +AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cpu_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_uniform(AtenTensorHandle self, double from, double to, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_linear1d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cpu_upsample_trilinear3d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_d, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h index 37e45a7030a5..1dceac240e40 100644 --- a/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h +++ b/torch/csrc/inductor/aoti_torch/generated/c_shim_cuda.h @@ -28,8 +28,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__embedding_bag_forward_only(Ate AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cuda__embedding_bag_per_sample_weights_backward(AtenTensorHandle grad, AtenTensorHandle weight, AtenTensorHandle indices, AtenTensorHandle offsets, AtenTensorHandle offset2bag, int64_t mode, int64_t padding_idx, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__fft_c2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t forward, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__fft_r2c(AtenTensorHandle self, const int64_t* dim, int64_t dim_len_, int64_t normalization, int32_t onesided, AtenTensorHandle* ret0); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__flash_attention_backward(AtenTensorHandle grad_out, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle out, AtenTensorHandle logsumexp, AtenTensorHandle cum_seq_q, AtenTensorHandle cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int32_t is_causal, AtenTensorHandle philox_seed, AtenTensorHandle philox_offset, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__flash_attention_forward(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle* cum_seq_q, AtenTensorHandle* cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int32_t is_causal, int32_t return_debug_mask, double* scale, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__flash_attention_backward(AtenTensorHandle grad_out, AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle out, AtenTensorHandle logsumexp, AtenTensorHandle cum_seq_q, AtenTensorHandle cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int32_t is_causal, AtenTensorHandle philox_seed, AtenTensorHandle philox_offset, double* scale, int64_t* window_size_left, int64_t* window_size_right, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__flash_attention_forward(AtenTensorHandle query, AtenTensorHandle key, AtenTensorHandle value, AtenTensorHandle* cum_seq_q, AtenTensorHandle* cum_seq_k, int64_t max_q, int64_t max_k, double dropout_p, int32_t is_causal, int32_t return_debug_mask, double* scale, int64_t* window_size_left, int64_t* window_size_right, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, AtenTensorHandle* ret3, AtenTensorHandle* ret4); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__fused_moving_avg_obs_fq_helper(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__fused_moving_avg_obs_fq_helper_functional(AtenTensorHandle self, AtenTensorHandle observer_on, AtenTensorHandle fake_quant_on, AtenTensorHandle running_min, AtenTensorHandle running_max, AtenTensorHandle scale, AtenTensorHandle zero_point, double averaging_const, int64_t quant_min, int64_t quant_max, int64_t ch_axis, int32_t per_row_fake_quant, int32_t symmetric_quant, AtenTensorHandle* ret0, AtenTensorHandle* ret1, AtenTensorHandle* ret2, 
AtenTensorHandle* ret3, AtenTensorHandle* ret4, AtenTensorHandle* ret5); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda__pdist_backward(AtenTensorHandle grad, AtenTensorHandle self, double p, AtenTensorHandle pdist, AtenTensorHandle* ret0); @@ -55,6 +55,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_avg_pool2d(AtenTensorHandle sel AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_avg_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_avg_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_avg_pool3d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* stride, int64_t stride_len_, const int64_t* padding, int64_t padding_len_, int32_t ceil_mode, int32_t count_include_pad, int64_t* divisor_override, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_bernoulli__Tensor(AtenTensorHandle self, AtenTensorHandle p, AtenGeneratorHandle* generator); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_bernoulli__float(AtenTensorHandle self, double p, AtenGeneratorHandle* generator); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_bmm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_bucketize_Tensor(AtenTensorHandle self, AtenTensorHandle boundaries, int32_t out_int32, int32_t right, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_cat(const AtenTensorHandle* tensors, int64_t tensors_len_, int64_t dim, AtenTensorHandle* ret0); @@ -66,6 +68,7 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_cummax(AtenTensorHandle self, i AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_cummin(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_cumprod(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_cumsum(AtenTensorHandle self, int64_t dim, int32_t* dtype, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_exponential(AtenTensorHandle self, double lambd, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_fractional_max_pool2d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_fractional_max_pool2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t output_size_len_, AtenTensorHandle indices, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_fractional_max_pool3d(AtenTensorHandle self, const int64_t* kernel_size, int64_t kernel_size_len_, const int64_t* output_size, int64_t 
output_size_len_, AtenTensorHandle random_samples, AtenTensorHandle* ret0, AtenTensorHandle* ret1); @@ -74,6 +77,8 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_gcd(AtenTensorHandle self, Aten AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_geqrf(AtenTensorHandle self, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_grid_sampler_2d_backward(AtenTensorHandle grad_output, AtenTensorHandle input, AtenTensorHandle grid, int64_t interpolation_mode, int64_t padding_mode, int32_t align_corners, const int32_t* output_mask, int64_t output_mask_len_, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_histc(AtenTensorHandle self, int64_t bins, double min, double max, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_Tensor(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_put(AtenTensorHandle self, const AtenTensorHandle** indices, int64_t indices_len_, AtenTensorHandle values, int32_t accumulate, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_index_reduce(AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle source, const char* reduce, int32_t include_self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_kthvalue(AtenTensorHandle self, int64_t k, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_logcumsumexp(AtenTensorHandle self, int64_t dim, AtenTensorHandle* ret0); @@ -90,25 +95,38 @@ AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_median(AtenTensorHandle self, A AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mm_out(AtenTensorHandle out, AtenTensorHandle self, AtenTensorHandle mat2); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mode(AtenTensorHandle self, int64_t dim, int32_t keepdim, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Scalar(AtenTensorHandle self, double other, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_mul_Tensor(AtenTensorHandle self, AtenTensorHandle other, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nanmedian(AtenTensorHandle self, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_native_dropout(AtenTensorHandle input, double p, int32_t* train, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_nonzero(AtenTensorHandle self, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_ormqr(AtenTensorHandle self, AtenTensorHandle input2, AtenTensorHandle input3, int32_t left, int32_t transpose, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Scalar(double self, AtenTensorHandle exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Scalar(AtenTensorHandle self, double exponent, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_pow_Tensor_Tensor(AtenTensorHandle self, AtenTensorHandle exponent, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_rand(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cuda_rand_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint(int64_t high, const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint_generator(int64_t high, const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(AtenTensorHandle out, int64_t low, int64_t high, const int64_t* size, int64_t size_len_); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randn(const int64_t* size, int64_t size_len_, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randn_generator(const int64_t* size, int64_t size_len_, AtenGeneratorHandle* generator, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_randperm(int64_t n, int32_t* dtype, int32_t* layout, int32_t* device, int32_t device_index_, int32_t* pin_memory, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_repeat_interleave_Tensor(AtenTensorHandle repeats, int64_t* output_size, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_replication_pad1d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_replication_pad2d_backward(AtenTensorHandle grad_output, AtenTensorHandle self, const int64_t* padding, int64_t padding_len_, AtenTensorHandle* ret0); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format, AtenTensorHandle* ret0); -AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_(AtenTensorHandle self, const int64_t* size, int64_t size_len_, int32_t* memory_format); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_resize_as_(AtenTensorHandle self, AtenTensorHandle the_template, int32_t* memory_format); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_src_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_value_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, double value); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_scatter_reduce_two_out(AtenTensorHandle out, AtenTensorHandle self, int64_t dim, AtenTensorHandle index, AtenTensorHandle src, const char* reduce, int32_t include_self); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_segment_reduce(AtenTensorHandle data, const char* reduce, AtenTensorHandle* lengths, AtenTensorHandle* indices, AtenTensorHandle* offsets, int64_t axis, int32_t unsafe, double* initial, AtenTensorHandle* ret0); +AOTI_TORCH_EXPORT AOTITorchError 
aoti_torch_cuda_slice_Tensor(AtenTensorHandle self, int64_t dim, int64_t* start, int64_t* end, int64_t step, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_soft_margin_loss_backward(AtenTensorHandle grad_output, AtenTensorHandle self, AtenTensorHandle target, int64_t reduction, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort(AtenTensorHandle self, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_sort_stable(AtenTensorHandle self, int32_t* stable, int64_t dim, int32_t descending, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_topk(AtenTensorHandle self, int64_t k, int64_t dim, int32_t largest, int32_t sorted, AtenTensorHandle* ret0, AtenTensorHandle* ret1); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_triangular_solve(AtenTensorHandle self, AtenTensorHandle A, int32_t upper, int32_t transpose, int32_t unitriangular, AtenTensorHandle* ret0, AtenTensorHandle* ret1); +AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_uniform(AtenTensorHandle self, double from, double to, AtenGeneratorHandle* generator, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_upsample_bicubic2d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_h, double* scales_w, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_upsample_linear1d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales, AtenTensorHandle* ret0); AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_upsample_trilinear3d_backward(AtenTensorHandle grad_output, const int64_t* output_size, int64_t output_size_len_, const int64_t* input_size, int64_t input_size_len_, int32_t align_corners, double* scales_d, double* scales_h, double* scales_w, AtenTensorHandle* ret0); diff --git a/torch/csrc/inductor/aoti_torch/utils.h b/torch/csrc/inductor/aoti_torch/utils.h index 0964479caabd..44ca34b1c6e8 100644 --- a/torch/csrc/inductor/aoti_torch/utils.h +++ b/torch/csrc/inductor/aoti_torch/utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -32,6 +33,16 @@ inline AtenTensorHandle tensor_pointer_to_tensor_handle(at::Tensor* tensor) { return reinterpret_cast(tensor); } +inline at::Generator* generator_handle_to_generator_pointer( + AtenGeneratorHandle handle) { + return reinterpret_cast(handle); +} + +inline AtenGeneratorHandle generator_pointer_to_generator_handle( + at::Generator* generator) { + return reinterpret_cast(generator); +} + inline AtenTensorHandle new_tensor_handle(at::Tensor&& tensor) { at::Tensor* new_tensor = new at::Tensor(std::move(tensor)); return tensor_pointer_to_tensor_handle(new_tensor); @@ -61,6 +72,13 @@ inline std::optional pointer_to_optional( : c10::nullopt; } +template <> +inline std::optional pointer_to_optional( + AtenGeneratorHandle* ptr) { + return ptr ? 
c10::make_optional(*generator_handle_to_generator_pointer(*ptr))
+             : c10::nullopt;
+}
+
 inline std::optional<c10::Device> pointer_to_optional_device(
     int32_t* device_type,
     int32_t device_index) {
diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp
index 1b9932ed34d4..45b99eb8e47a 100644
--- a/torch/csrc/jit/api/module.cpp
+++ b/torch/csrc/jit/api/module.cpp
@@ -323,7 +323,7 @@ Module Module::deepcopy(std::optional<at::Device> device) const {
 Module Module::clone(bool inplace) const {
   std::unordered_map<TypePtr, TypePtr> type_remap;
-  IValue::HashAliasedIValueMap memo;
+  IValue::HashIdentityIValueMap memo;
   const std::unordered_set<std::string> ignored_methods;
   const std::unordered_set<std::string> ignored_attributes;
   return clone_impl(
@@ -335,7 +335,7 @@ Module Module::clone(
     const std::unordered_set<std::string>& ignored_methods,
     const std::unordered_set<std::string>& ignored_attributes) const {
   std::unordered_map<TypePtr, TypePtr> type_remap;
-  IValue::HashAliasedIValueMap memo;
+  IValue::HashIdentityIValueMap memo;
   return clone_impl(
       type_remap, inplace, memo, ignored_methods, ignored_attributes);
 }
@@ -343,7 +343,7 @@ Module Module::clone_impl(
     std::unordered_map<TypePtr, TypePtr>& type_remap,
     bool inplace,
-    IValue::HashAliasedIValueMap memo,
+    IValue::HashIdentityIValueMap memo,
     const std::unordered_set<std::string>& ignored_methods,
     const std::unordered_set<std::string>& ignored_attributes) const {
   // Create a new _ivalue in the same compilation unit.
diff --git a/torch/csrc/jit/api/module.h b/torch/csrc/jit/api/module.h
index 0787210a4aef..e779542e315f 100644
--- a/torch/csrc/jit/api/module.h
+++ b/torch/csrc/jit/api/module.h
@@ -301,7 +301,7 @@ struct TORCH_API Module : public Object {
   Module clone_impl(
       std::unordered_map<TypePtr, TypePtr>& type_remap,
       bool inplace,
-      IValue::HashAliasedIValueMap memo,
+      IValue::HashIdentityIValueMap memo,
       const std::unordered_set<std::string>& ignored_methods,
       const std::unordered_set<std::string>& ignored_attributes) const;
diff --git a/torch/csrc/jit/passes/onnx.cpp b/torch/csrc/jit/passes/onnx.cpp
index 85b49dd31e94..820ab3f6ace3 100644
--- a/torch/csrc/jit/passes/onnx.cpp
+++ b/torch/csrc/jit/passes/onnx.cpp
@@ -310,14 +310,25 @@ void NodeToONNX(
       if (old->hasDebugName() && !exist_in_env) {
         auto old_name = outputs[i]->debugName();
         auto new_name = old->debugNameBase();
-        auto debug_names = new_block->owningGraph()->debugNames();
-        auto exist_name = debug_names.find(new_name);
+        Value* found_value;
+        bool exists;
+        // In this scope, we fetch debug_names as a const reference and then
+        // construct an iterator exist_name based on it. This iterator will
+        // be corrupted if the underlying map of debug_names changes. This
+        // will happen as a side effect of setDebugName. For these reasons,
+        // we make an explicit scope for exist_name and make sure that
+        // setDebugName is never called within this scope.
+        {
+          const auto& debug_names = new_block->owningGraph()->debugNames();
+          auto exist_name = debug_names.find(new_name);
+          exists = exist_name != debug_names.end();
+          if (exists) {
+            found_value = exist_name->second;
+          }
+        }
         outputs[i]->setDebugName(new_name);
-        if (exist_name != debug_names.end()) {
-          // setDebugName changes name of existing value with same name.
-          // Set again to revert the changes, but update name for new value
-          // with suffix.
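// [Editor's aside — illustrative sketch, not part of the patch] The explicit
// scope above exists because a map iterator may dangle once the container
// mutates. A minimal standalone demonstration of the same safe pattern, with
// hypothetical names:

#include <cassert>
#include <string>
#include <unordered_map>

int main() {
  std::unordered_map<std::string, int> debug_names{{"value", 7}};
  bool exists = false;
  int found = 0;
  {
    // Copy out what we need; the iterator dies before any mutation happens.
    auto it = debug_names.find("value");
    exists = it != debug_names.end();
    if (exists) {
      found = it->second;
    }
  }
  // Mutating after the scope is safe: no live iterator is left to dangle.
  for (int i = 0; i < 100; ++i) {
    debug_names.emplace("value." + std::to_string(i), i);  // may rehash
  }
  assert(exists && found == 7);
  return 0;
}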
- exist_name->second->setDebugName(new_name); + if (exists) { + found_value->setDebugName(new_name); } ConstantValueMap::UpdateValueName(old_name, outputs[i]->debugName()); } diff --git a/torch/csrc/jit/passes/onnx/constant_map.cpp b/torch/csrc/jit/passes/onnx/constant_map.cpp index 716232cebbb0..e249d0a83a64 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.cpp +++ b/torch/csrc/jit/passes/onnx/constant_map.cpp @@ -48,6 +48,14 @@ c10::optional ConstantValueMap::GetAllGraphInputsStatic() { return ConstantValueMap::getInstance().allGraphInputsStatic; } +void ConstantValueMap::SetAllGraphInputsReliableComputed(bool computed) { + ConstantValueMap::getInstance().allGraphInputsReliableComputed = computed; +} + +bool ConstantValueMap::GetAllGraphInputsReliableComputed() { + return ConstantValueMap::getInstance().allGraphInputsReliableComputed; +} + void ConstantValueMap::SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue) { @@ -227,6 +235,10 @@ SymbolDimMap& ConstantValueMap::GetSymbolDimMap() { return ConstantValueMap::getInstance().symbolDimMap; } +DimSymbolMap& ConstantValueMap::GetDimSymbolMap() { + return ConstantValueMap::getInstance().dimSymbolMap; +} + template void UpdateStrKey( Map& map, @@ -271,7 +283,9 @@ void ConstantValueMap::ClearMaps() { ConstantValueMap::getInstance().shapeValueMap.clear(); ConstantValueMap::getInstance().inferredShapeData.clear(); ConstantValueMap::getInstance().symbolDimMap.clear(); + ConstantValueMap::getInstance().dimSymbolMap.clear(); ConstantValueMap::getInstance().allGraphInputsStatic = c10::nullopt; + ConstantValueMap::getInstance().allGraphInputsReliableComputed = false; } // For debug only. @@ -359,6 +373,15 @@ void ConstantValueMap::PrintMaps() { std::cout << std::endl; } } + std::cout << "DimSymbol Map:" << std::endl; + count = 0; + for (const auto& x : ConstantValueMap::getInstance().dimSymbolMap) { + std::cout << "(" << x.first << ": " << x.second << "), "; + count++; + if (count % 10 == 0) { + std::cout << std::endl; + } + } } } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/constant_map.h b/torch/csrc/jit/passes/onnx/constant_map.h index fe33183ef8d6..4261e45cc56c 100644 --- a/torch/csrc/jit/passes/onnx/constant_map.h +++ b/torch/csrc/jit/passes/onnx/constant_map.h @@ -29,6 +29,9 @@ class ConstantValueMap { static void SetAllGraphInputsStatic(bool all_static); static c10::optional GetAllGraphInputsStatic(); + static void SetAllGraphInputsReliableComputed(bool computed); + static bool GetAllGraphInputsReliableComputed(); + static void SetShape( const std::string& tensorName, const c10::SymbolicShape& shapeValue); @@ -70,6 +73,7 @@ class ConstantValueMap { static ShapeDataMap& GetInferredShapeData(); static SymbolDimMap& GetSymbolDimMap(); + static DimSymbolMap& GetDimSymbolMap(); static void UpdateValueName( const std::string& old_name, @@ -104,8 +108,11 @@ class ConstantValueMap { // during future node-level shape inference. 
ShapeDataMap inferredShapeData; SymbolDimMap symbolDimMap; + DimSymbolMap dimSymbolMap; // Stores if all graph-level inputs have static shape c10::optional allGraphInputsStatic; + // True if reliable has been computed for all graph inputs + bool allGraphInputsReliableComputed; }; } // namespace jit diff --git a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp index eefa9621ba1f..65d065adeb2b 100644 --- a/torch/csrc/jit/passes/onnx/shape_type_inference.cpp +++ b/torch/csrc/jit/passes/onnx/shape_type_inference.cpp @@ -87,9 +87,14 @@ namespace onnx_torch = ::torch::onnx; namespace onnx = ::ONNX_NAMESPACE; namespace diagnostics = ::torch::onnx::diagnostics; +// SymbolDimMap is a Torch-to-ONNX shape look-up. This is built so it can be +// returned by the export function. During the export however, when we come +// across new ONNX shapes, the reverse look-up is needed. To avoid incurring +// a linear-time look-up, we maintain DimSymbolMap in parallel. c10::ShapeSymbol ONNXDimToShapeSymbol( const onnx::TensorShapeProto_Dimension& dim, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (dim.has_dim_value()) { return c10::ShapeSymbol::fromStaticSize(dim.dim_value()); } @@ -97,11 +102,9 @@ c10::ShapeSymbol ONNXDimToShapeSymbol( if (dim.has_dim_param()) { // If this param is already known, assign the same Symbol. GRAPH_UPDATE("Got dim_param:", dim.dim_param()); - for (const auto& pair : symbol_dim_map) { - if (pair.second == dim.dim_param()) { - sym = pair.first; - break; - } + auto maybe_symbol = dim_symbol_map.find(dim.dim_param()); + if (maybe_symbol != dim_symbol_map.end()) { + sym = maybe_symbol->second; } } if (!sym) { @@ -109,13 +112,15 @@ c10::ShapeSymbol ONNXDimToShapeSymbol( // If dim.dim_param() is empty, no need to keep track // because there won't be duplicates. 
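// [Editor's aside — illustrative sketch, not part of the patch] The paired
// SymbolDimMap/DimSymbolMap bookkeeping described above, in miniature: keep a
// reverse index next to the forward map so value-to-key look-ups are a find()
// rather than a linear scan. Names here are hypothetical stand-ins.

#include <cassert>
#include <map>
#include <string>

int main() {
  std::map<int, std::string> symbol_to_dim;  // forward map (SymbolDimMap role)
  std::map<std::string, int> dim_to_symbol;  // reverse map (DimSymbolMap role)

  auto record = [&](int symbol, const std::string& dim_param) {
    // Every insertion updates both directions so they never diverge.
    symbol_to_dim[symbol] = dim_param;
    dim_to_symbol[dim_param] = symbol;
  };

  record(0, "batch");
  record(1, "sequence");

  // Reverse look-up without scanning symbol_to_dim:
  auto it = dim_to_symbol.find("sequence");
  assert(it != dim_to_symbol.end() && it->second == 1);
  return 0;
}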
symbol_dim_map[sym.value()] = dim.dim_param(); + dim_symbol_map[dim.dim_param()] = sym.value(); } return sym.value(); } TensorTypePtr TorchTensorTypeFromONNX( const onnx::TypeProto_Tensor& onnx_tensor_type, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { std::optional scalar_type; if (onnx_tensor_type.has_elem_type()) { scalar_type = ONNXTypeToATenType(onnx_tensor_type.elem_type()); @@ -132,8 +137,8 @@ TensorTypePtr TorchTensorTypeFromONNX( const auto& onnx_shape = onnx_tensor_type.shape(); for (const auto i : c10::irange(onnx_shape.dim_size())) { - sizes.emplace_back( - ONNXDimToShapeSymbol(onnx_shape.dim(i), symbol_dim_map)); + sizes.emplace_back(ONNXDimToShapeSymbol( + onnx_shape.dim(i), symbol_dim_map, dim_symbol_map)); } v_type = TensorType::create(scalar_type, at::kCPU, sizes.size(), {}); v_type = v_type->withSymbolicShapes(c10::SymbolicShape(sizes)); @@ -150,13 +155,14 @@ TensorTypePtr TorchTensorTypeFromONNX( ListTypePtr TorchListTypeFromONNX( const onnx::TypeProto_Sequence& onnx_sequence_type, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (onnx_sequence_type.has_elem_type()) { const auto& onnx_seq_elem_type = onnx_sequence_type.elem_type(); if (onnx_seq_elem_type.has_tensor_type()) { const auto& onnx_tensor_type = onnx_seq_elem_type.tensor_type(); - const auto v_tensor_type = - TorchTensorTypeFromONNX(onnx_tensor_type, symbol_dim_map); + const auto v_tensor_type = TorchTensorTypeFromONNX( + onnx_tensor_type, symbol_dim_map, dim_symbol_map); auto v_type = ListType::create(v_tensor_type); return v_type; } @@ -167,21 +173,22 @@ ListTypePtr TorchListTypeFromONNX( void UpdateTorchValueByOnnxValueInfo( Value* v, const onnx::ValueInfoProto& p_info, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { if (!p_info.has_type()) { return; } const auto& p_type = p_info.type(); if (p_type.has_tensor_type()) { - const auto torch_tensor_type = - TorchTensorTypeFromONNX(p_type.tensor_type(), symbol_dim_map); + const auto torch_tensor_type = TorchTensorTypeFromONNX( + p_type.tensor_type(), symbol_dim_map, dim_symbol_map); if (torch_tensor_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_tensor_type); } } else if (p_type.has_sequence_type()) { - const auto torch_list_type = - TorchListTypeFromONNX(p_type.sequence_type(), symbol_dim_map); + const auto torch_list_type = TorchListTypeFromONNX( + p_type.sequence_type(), symbol_dim_map, dim_symbol_map); if (torch_list_type) { MergeInferredTypeAndSetMap(v, v->type(), torch_list_type); } @@ -377,6 +384,7 @@ void ConvertGraphToONNXProto( std::shared_ptr graph, std::shared_ptr& model_proto, SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map, int opset_version) { RawDataExportMap export_map; bool val_use_external_data_format; @@ -402,6 +410,9 @@ void ConvertGraphToONNXProto( false, std::string()); symbol_dim_map.insert(new_symbol_dim_map.begin(), new_symbol_dim_map.end()); + for (const auto& pair : new_symbol_dim_map) { + dim_symbol_map[pair.second] = pair.first; + } for (int i = 0; i < model_proto->graph().output_size(); ++i) { model_proto->mutable_graph()->mutable_output(i)->clear_type(); } @@ -1796,7 +1807,8 @@ void UpdateOutputTypeByONNXProto( Node* n, Node* clone_node, const onnx::ModelProto& model_proto, - SymbolDimMap& symbol_dim_map) { + SymbolDimMap& symbol_dim_map, + DimSymbolMap& dim_symbol_map) { const auto& graph_proto = model_proto.graph(); // get data from value_info and 
updated original graph. @@ -1805,7 +1817,7 @@ void UpdateOutputTypeByONNXProto( for (size_t i = 0; i < n->outputs().size(); ++i) { if (clone_node->output(i)->debugName() == v_info.name()) { UpdateTorchValueByOnnxValueInfo( - n->output(i), v_info, symbol_dim_map); + n->output(i), v_info, symbol_dim_map, dim_symbol_map); } } }; @@ -2023,11 +2035,17 @@ void UpdateReliable(Node* n) { } } +// Traverse the graph inputs and compute reliability (e.g., are shapes static). +// Since the inputs do not change during export, we save computation time by +// marking it as computed and subsequently skipping. void SetGraphInputTypeReliable(const Graph* g) { - for (auto graph_input : g->inputs()) { - if (!ConstantValueMap::HasTypeReliable(graph_input->debugName())) { - ConstantValueMap::SetTypeReliable(graph_input->debugName(), true); + if (!ConstantValueMap::GetAllGraphInputsReliableComputed()) { + for (auto graph_input : g->inputs()) { + if (!ConstantValueMap::HasTypeReliable(graph_input->debugName())) { + ConstantValueMap::SetTypeReliable(graph_input->debugName(), true); + } } + ConstantValueMap::SetAllGraphInputsReliableComputed(true); } } @@ -2040,6 +2058,7 @@ void ONNXShapeTypeInference( auto& original_shape_data = ConstantValueMap::GetInferredShapeData(); ShapeDataMap inferred_shape_data; auto& symbol_dim_map = ConstantValueMap::GetSymbolDimMap(); + auto& dim_symbol_map = ConstantValueMap::GetDimSymbolMap(); SetGraphInputTypeReliable(n->owningGraph()); GRAPH_UPDATE( @@ -2094,7 +2113,7 @@ void ONNXShapeTypeInference( // e.g: ListConstruct, ListUnpack, etc. std::shared_ptr model_proto; ConvertGraphToONNXProto( - n_graph, model_proto, symbol_dim_map, opset_version); + n_graph, model_proto, symbol_dim_map, dim_symbol_map, opset_version); GRAPH_DEBUG( "ONNX graph to run shape inference: ", prettyPrint(*model_proto)); @@ -2119,7 +2138,7 @@ void ONNXShapeTypeInference( } } UpdateOutputTypeByONNXProto( - n, clone_node, *model_proto, symbol_dim_map); + n, clone_node, *model_proto, symbol_dim_map, dim_symbol_map); } catch (std::runtime_error& ex) { // TODO: include this as warning once we have a more consolidated // warning system. @@ -2161,8 +2180,8 @@ void ONNXShapeTypeInference( int rank = inferred_shape.dim_size(); std::vector<::c10::ShapeSymbol> final_shape(rank); for (int i = 0; i < rank; ++i) { - final_shape[i] = - ONNXDimToShapeSymbol(inferred_shape.dim(i), symbol_dim_map); + final_shape[i] = ONNXDimToShapeSymbol( + inferred_shape.dim(i), symbol_dim_map, dim_symbol_map); } c10::SymbolicShape shape_value(final_shape); // Store data propagation result into shapeValueMap diff --git a/torch/csrc/jit/passes/quantization/insert_observers.cpp b/torch/csrc/jit/passes/quantization/insert_observers.cpp index e5df64f1929c..de1cff1ba9d1 100644 --- a/torch/csrc/jit/passes/quantization/insert_observers.cpp +++ b/torch/csrc/jit/passes/quantization/insert_observers.cpp @@ -92,7 +92,7 @@ class ModuleCloneHelper { const ModuleQConfigMap& module_qconfig_map, bool inplace = false) { std::unordered_map type_remap; - IValue::HashAliasedIValueMap memo; + IValue::HashIdentityIValueMap memo; return clone_impl( module, module_qconfig_map, type_remap, inplace, std::move(memo)); } @@ -103,7 +103,7 @@ class ModuleCloneHelper { const ModuleQConfigMap& module_qconfig_map, std::unordered_map& type_remap, bool inplace, - IValue::HashAliasedIValueMap memo) { + IValue::HashIdentityIValueMap memo) { auto qconfig = module_qconfig_map.at(module._ivalue()); auto type = module.type(); // Create a new _ivalue in the same compilation unit. 
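The SetGraphInputTypeReliable change above is a compute-once guard: graph inputs are immutable for the duration of the export, so the marking pass only needs to run the first time it is called. A minimal sketch of the same pattern, with hypothetical names standing in for the ConstantValueMap plumbing:

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for the ConstantValueMap "computed" flag.
static bool g_inputs_reliable_computed = false;

void SetGraphInputTypeReliable(const std::vector<std::string>& graph_inputs) {
  if (g_inputs_reliable_computed) {
    return;  // already done once; inputs cannot have changed mid-export
  }
  for (const auto& name : graph_inputs) {
    std::cout << "marking type reliable: " << name << "\n";
  }
  g_inputs_reliable_computed = true;
}

int main() {
  std::vector<std::string> inputs{"input0", "input1"};
  SetGraphInputTypeReliable(inputs);  // does the work
  SetGraphInputTypeReliable(inputs);  // no-op on every later call
  return 0;
}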
diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index 971b6c76ca47..c46762a88615 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -668,13 +668,13 @@ static constexpr std::array magic_method_names = { }; struct DeepCopyMemoTable { - std::shared_ptr map; + std::shared_ptr map; }; IValue pyIValueDeepcopy(const IValue& ivalue, const py::dict& memo) { if (!memo.contains(py::str("__torch_script_memo_table"))) { memo["__torch_script_memo_table"] = - DeepCopyMemoTable{std::make_shared()}; + DeepCopyMemoTable{std::make_shared()}; } auto& ivalue_memo = *py::cast(memo["__torch_script_memo_table"]).map; diff --git a/torch/csrc/jit/serialization/export.h b/torch/csrc/jit/serialization/export.h index 3a56cfc7788f..9a7ab2c4fcc8 100644 --- a/torch/csrc/jit/serialization/export.h +++ b/torch/csrc/jit/serialization/export.h @@ -30,6 +30,7 @@ namespace jit { using RawDataExportMap = std::unordered_map; using SymbolDimMap = std::map; +using DimSymbolMap = std::map; using NodeNameMap = std::unordered_map; diff --git a/torch/csrc/lazy/core/hash.h b/torch/csrc/lazy/core/hash.h index 57cf1f521030..19f57546c9a4 100644 --- a/torch/csrc/lazy/core/hash.h +++ b/torch/csrc/lazy/core/hash.h @@ -135,6 +135,12 @@ static inline hash_t TensorHash(const at::Tensor& tensor) { return DataHash(ctensor.const_data_ptr>(), size); case at::ScalarType::ComplexDouble: return DataHash(ctensor.const_data_ptr>(), size); + case at::ScalarType::UInt16: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt32: + return DataHash(ctensor.const_data_ptr(), size); + case at::ScalarType::UInt64: + return DataHash(ctensor.const_data_ptr(), size); default: TORCH_INTERNAL_ASSERT( false, "Unsupported scalar type:", ctensor.scalar_type()); diff --git a/torch/csrc/profiler/orchestration/observer.h b/torch/csrc/profiler/orchestration/observer.h index 423085160760..b77febb2784e 100644 --- a/torch/csrc/profiler/orchestration/observer.h +++ b/torch/csrc/profiler/orchestration/observer.h @@ -27,6 +27,7 @@ enum class C10_API_ENUM ProfilerState { CUDA, // CPU + CUDA events NVTX, // only emit NVTX markers ITT, // only emit ITT markers + PRIVATEUSE1, // only emit PRIVATEUSE1 markers KINETO, // use libkineto KINETO_GPU_FALLBACK, // use CUDA events when CUPTI is not available KINETO_PRIVATEUSE1_FALLBACK, // use PrivateUse1 events @@ -39,7 +40,8 @@ enum class C10_API_ENUM ActiveProfilerType { LEGACY, KINETO, NVTX, - ITT + ITT, + PRIVATEUSE1 }; struct TORCH_API ExperimentalConfig { diff --git a/torch/csrc/profiler/python/init.cpp b/torch/csrc/profiler/python/init.cpp index b9e30c61177a..9ecfe5824a38 100644 --- a/torch/csrc/profiler/python/init.cpp +++ b/torch/csrc/profiler/python/init.cpp @@ -305,6 +305,7 @@ void initPythonBindings(PyObject* module) { .value("CUDA", ProfilerState::CUDA) .value("NVTX", ProfilerState::NVTX) .value("ITT", ProfilerState::ITT) + .value("PRIVATEUSE1", ProfilerState::PRIVATEUSE1) .value("KINETO", ProfilerState::KINETO) .value("KINETO_GPU_FALLBACK", ProfilerState::KINETO_GPU_FALLBACK) .value( @@ -316,7 +317,8 @@ void initPythonBindings(PyObject* module) { .value("LEGACY", ActiveProfilerType::LEGACY) .value("KINETO", ActiveProfilerType::KINETO) .value("NVTX", ActiveProfilerType::NVTX) - .value("ITT", ActiveProfilerType::ITT); + .value("ITT", ActiveProfilerType::ITT) + .value("PRIVATEUSE1", ActiveProfilerType::PRIVATEUSE1); py::enum_(m, "ProfilerActivity") .value("CPU", ActivityType::CPU) diff --git 
a/torch/csrc/profiler/standalone/execution_trace_observer.cpp b/torch/csrc/profiler/standalone/execution_trace_observer.cpp index d100d8090c07..9e8a995ec977 100644 --- a/torch/csrc/profiler/standalone/execution_trace_observer.cpp +++ b/torch/csrc/profiler/standalone/execution_trace_observer.cpp @@ -28,8 +28,27 @@ #include #include +#ifdef USE_DISTRIBUTED +#include +#endif // USE_DISTRIBUTED + using namespace at; +// Collective property attributes +// https://github.com/pytorch/pytorch/issues/124674 +#ifdef USE_DISTRIBUTED +constexpr auto kETCommsName = "collective_name"; +constexpr auto kETInMsgNelems = "in_msg_nelems"; +constexpr auto kETOutMsgNelems = "out_msg_nelems"; +constexpr auto kETInSplit = "in_split_size"; +constexpr auto kETOutSplit = "out_split_size"; +constexpr auto kETGlobalRankStart = "global_rank_start"; +constexpr auto kETGlobalRankStride = "global_rank_stride"; +constexpr auto kETGroupSize = "pg_size"; +constexpr auto kETProcessGroupName = "pg_name"; +constexpr auto kETProcessGroupDesc = "pg_desc"; +#endif // USE_DISTRIBUTED + namespace torch { namespace profiler { namespace impl { @@ -258,6 +277,19 @@ static std::ofstream openOutputFile(const std::string& name) { return stream; } +static inline std::string getAttrJson( + const std::string& name, + const std::string& type, + const std::string& value) { + // note name and type are not quoted but value should be if it is a string. + return fmt::format( + R"JSON( + {{"name": "{}", "type": "{}", "value": {}}})JSON", + name, + type, + value); +} + static void writeJsonNode( std::ofstream& out, const std::string& name, @@ -277,14 +309,15 @@ static void writeJsonNode( const std::string& output_types = "[]", const std::string& operator_schema = "", const std::string& kernel_backend = "", - const std::string& kernel_file = "") { + const std::string& kernel_file = "", + const std::string& additiona_attrs = "") { out << fmt::format( R"JSON( {{ "id": {}, "name": "{}", "ctrl_deps": {}, "inputs": {{"values": {}, "shapes": {}, "types": {}}}, "outputs": {{"values": {}, "shapes": {}, "types": {}}}, - "attrs": [{{"name": "rf_id", "type": "uint64", "value": {}}},{{"name": "fw_parent", "type": "uint64", "value": {}}},{{"name": "seq_id", "type": "int64", "value": {}}},{{"name": "scope", "type": "uint64", "value": {}}},{{"name": "tid", "type": "uint64", "value": {}}},{{"name": "fw_tid", "type": "uint64", "value": {}}},{{"name": "op_schema", "type": "string", "value": "{}"}},{{"name": "kernel_backend", "type": "string", "value": "{}"}},{{"name": "kernel_file", "type": "string", "value": "{}"}}] + "attrs": [{{"name": "rf_id", "type": "uint64", "value": {}}},{{"name": "fw_parent", "type": "uint64", "value": {}}},{{"name": "seq_id", "type": "int64", "value": {}}},{{"name": "scope", "type": "uint64", "value": {}}},{{"name": "tid", "type": "uint64", "value": {}}},{{"name": "fw_tid", "type": "uint64", "value": {}}},{{"name": "op_schema", "type": "string", "value": "{}"}},{{"name": "kernel_backend", "type": "string", "value": "{}"}},{{"name": "kernel_file", "type": "string", "value": "{}"}}{}] }})JSON", id, name, @@ -303,7 +336,8 @@ static void writeJsonNode( fw_tid, operator_schema, kernel_backend, - kernel_file); + kernel_file, + additiona_attrs); } inline std::string timeString(const std::time_t timepoint) { @@ -332,7 +366,7 @@ static bool initExecutionTraceStart(ExecutionTraceObserver& ob) { ob.out << fmt::format( R"JSON({{ - "schema": "1.0.4-chakra.0.0.4", "pid": {}, "time": "{}", "start_ts": {}, + "schema": "1.1.0-chakra.0.0.4", "pid": {}, 
"time": "{}", "start_ts": {}, "nodes": [)JSON", ob.pid, ob.record_time, @@ -486,6 +520,56 @@ inline void handleKernelBackendInfo( } } +// Additional attributes for commounication collectives +inline std::string getCommsNodeAttrs(const RecordFunction& fn) { + std::vector attrs; + +#ifdef USE_DISTRIBUTED + // We rely on paramcommsdebug object that is available in thread local info + auto debugInfo = dynamic_cast( + c10::ThreadLocalDebugInfo::get(c10::DebugInfoKind::PARAM_COMMS_INFO)); + if (debugInfo == nullptr) { + LOG(WARNING) << "ParamCommsDebugInfo not available for function: " + << fn.name(); + return ", " + getAttrJson("debug", "string", "\"missing comms info\""); + } + + // get NcclMeta from record function, this used ParamCommsDebugInfo above + auto meta = saveNcclMeta(fn, false /*truncate*/); + + auto addAttr = + [&](const char* commsMetaName, const char* etMetaName, const char* type) { + auto it = meta.find(commsMetaName); + if (it != meta.end()) { + attrs.push_back(getAttrJson(etMetaName, type, it->second)); + } + }; + + addAttr(kCommsName, kETCommsName, "string"); + addAttr(kDtype, kDtype, "string"); + + addAttr(kInMsgNelems, kETInMsgNelems, "uint64"); + addAttr(kOutMsgNelems, kETOutMsgNelems, "uint64"); + + // following two metadata are lists. + addAttr(kInSplit, kETInSplit, "string"); + addAttr(kOutSplit, kETOutSplit, "string"); + + addAttr(kGlobalRankStart, kETGlobalRankStart, "uint64"); + addAttr(kGlobalRankStride, kETGlobalRankStride, "uint64"); + + // pg_name is a string. + addAttr(kProcessGroupName, kETProcessGroupName, "string"); + addAttr(kProcessGroupDesc, kETProcessGroupDesc, "string"); + + addAttr(kGroupSize, kETGroupSize, "uint64"); + +#endif // USE_DISTRIBUTED + + // XXX consider using as string stream? + return attrs.size() == 0 ? "" : fmt::format(", {}", fmt::join(attrs, ", ")); +} + static void recordOperatorStart( ExecutionTraceObserver& ob, FunctionCallContext& fc, @@ -645,6 +729,9 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) { op_schema_str = json_str_escape(c10::toString(op_schema.value())); } + const std::string additiona_attrs = + fn.isNcclMeta() ? 
getCommsNodeAttrs(fn) : ""; + writeJsonNode( ob->out, fc.name, @@ -664,7 +751,8 @@ static void onFunctionExit(const RecordFunction& fn, ObserverContext* ctx_ptr) { vectorToString(output_types), op_schema_str, fc.kernel_backend, - fc.kernel_file); + fc.kernel_file, + additiona_attrs); ob->out << ","; } catch (const std::exception& e) { LOG(WARNING) << "Exception in execution trace observer: [" << fc.name diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.cpp b/torch/csrc/profiler/standalone/privateuse1_observer.cpp new file mode 100644 index 000000000000..81eb3074fb3a --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.cpp @@ -0,0 +1,11 @@ +#include + +namespace torch { +namespace profiler { +namespace impl { + +PushPRIVATEUSE1CallbacksStub pushPRIVATEUSE1CallbacksStub; + +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/standalone/privateuse1_observer.h b/torch/csrc/profiler/standalone/privateuse1_observer.h new file mode 100644 index 000000000000..39259b7444cf --- /dev/null +++ b/torch/csrc/profiler/standalone/privateuse1_observer.h @@ -0,0 +1,46 @@ +#pragma once +#include + +namespace torch { +namespace profiler { +namespace impl { + +using CallBackFnPtr = void (*)( + const ProfilerConfig& config, + const std::unordered_set& scopes); + +struct PushPRIVATEUSE1CallbacksStub { + PushPRIVATEUSE1CallbacksStub() = default; + PushPRIVATEUSE1CallbacksStub(const PushPRIVATEUSE1CallbacksStub&) = delete; + PushPRIVATEUSE1CallbacksStub& operator=(const PushPRIVATEUSE1CallbacksStub&) = + delete; + + template + void operator()(ArgTypes&&... args) { + return (*push_privateuse1_callbacks_fn)(std::forward(args)...); + } + + void set_privateuse1_dispatch_ptr(CallBackFnPtr fn_ptr) { + push_privateuse1_callbacks_fn = fn_ptr; + } + + private: + CallBackFnPtr push_privateuse1_callbacks_fn = nullptr; +}; + +extern TORCH_API struct PushPRIVATEUSE1CallbacksStub + pushPRIVATEUSE1CallbacksStub; + +struct RegisterPRIVATEUSE1Observer { + RegisterPRIVATEUSE1Observer( + PushPRIVATEUSE1CallbacksStub& stub, + CallBackFnPtr value) { + stub.set_privateuse1_dispatch_ptr(value); + } +}; + +#define REGISTER_PRIVATEUSE1_OBSERVER(name, fn) \ + static RegisterPRIVATEUSE1Observer name##__register(name, fn); +} // namespace impl +} // namespace profiler +} // namespace torch diff --git a/torch/csrc/profiler/util.cpp b/torch/csrc/profiler/util.cpp index f301596fca81..21e16a7e7eae 100644 --- a/torch/csrc/profiler/util.cpp +++ b/torch/csrc/profiler/util.cpp @@ -334,25 +334,22 @@ std::vector inputTypes(const at::RecordFunction& fn) { // ---------------------------------------------------------------------------- // -- NCCL Metadata ----------------------------------------------------------- // ---------------------------------------------------------------------------- -#ifdef USE_DISTRIBUTED -static constexpr auto kCommsName = "Collective name"; -static constexpr auto kDtype = "dtype"; -static constexpr auto kInMsgNelems = "In msg nelems"; -static constexpr auto kOutMsgNelems = "Out msg nelems"; -static constexpr auto kInSplit = "In split size"; -static constexpr auto kOutSplit = "Out split size"; -static constexpr auto kGlobalRankStart = "Global rank start"; -static constexpr auto kGlobalRankStride = "Global rank stride"; -static constexpr auto kGroupSize = "Group size"; -static constexpr auto kProcessGroupName = "Process Group Name"; -static constexpr auto kProcessGroupDesc = "Process Group Description"; -static constexpr auto kGroupRanks = 
"Process Group Ranks"; static constexpr int32_t kTruncatLength = 30; -#endif // USE_DISTRIBUTED + +template +inline std::string format_list(ListLikeType list, bool truncate) { + if (truncate && list.size() > kTruncatLength) { + return fmt::format( + "\"[{}, ...]\"", + fmt::join(list.begin(), list.begin() + kTruncatLength, ", ")); + } + return fmt::format("\"[{}]\"", fmt::join(list.begin(), list.end(), ", ")); +} std::unordered_map saveNcclMeta( - const at::RecordFunction& fn) { + const at::RecordFunction& fn, + bool truncate) { std::unordered_map map; #ifdef USE_DISTRIBUTED auto debugInfo = dynamic_cast( @@ -369,34 +366,13 @@ std::unordered_map saveNcclMeta( kDtype, fmt::format("\"{}\"", c10::toString(debugInfo->getDType()))); map.emplace(kInMsgNelems, std::to_string(debugInfo->getInMessageNelems())); map.emplace(kOutMsgNelems, std::to_string(debugInfo->getOutMessageNelems())); + auto& inSplitSizes = debugInfo->getInputSplitSizes(); - if (!inSplitSizes.empty() && inSplitSizes.size() <= kTruncatLength) { - map.emplace( - kInSplit, fmt::format("\"[{}]\"", fmt::join(inSplitSizes, ", "))); - } else if (inSplitSizes.size() > kTruncatLength) { - map.emplace( - kInSplit, - fmt::format( - "\"[{}, ...]\"", - fmt::join( - inSplitSizes.begin(), - inSplitSizes.begin() + kTruncatLength, - ", "))); - } + map.emplace(kInSplit, format_list(inSplitSizes, truncate)); + auto& outSplitSizes = debugInfo->getOutputSplitSizes(); - if (!outSplitSizes.empty() && outSplitSizes.size() <= kTruncatLength) { - map.emplace( - kOutSplit, fmt::format("\"[{}]\"", fmt::join(outSplitSizes, ", "))); - } else if (outSplitSizes.size() > kTruncatLength) { - map.emplace( - kOutSplit, - fmt::format( - "\"[{}, ...]\"", - fmt::join( - outSplitSizes.begin(), - outSplitSizes.begin() + kTruncatLength, - ", "))); - } + map.emplace(kOutSplit, format_list(outSplitSizes, truncate)); + auto globalRankStart = debugInfo->getGlobalRankStart(); if (globalRankStart >= 0) { map.emplace(kGlobalRankStart, std::to_string(globalRankStart)); @@ -415,20 +391,7 @@ std::unordered_map saveNcclMeta( map.emplace(kProcessGroupDesc, fmt::format("\"{}\"", group_desc)); } auto& groupRanks = debugInfo->getGroupRanks(); - if (!groupRanks.empty() && groupRanks.size() <= kTruncatLength) { - map.emplace( - kGroupRanks, fmt::format("\"[{}]\"", fmt::join(groupRanks, ", "))); - } else if (groupRanks.size() > kTruncatLength) { - map.emplace( - kGroupRanks, - fmt::format( - "\"[{}, ..., {}]\"", - fmt::join( - groupRanks.begin(), - groupRanks.begin() + kTruncatLength - 1, - ", "), - groupRanks.back())); - } + map.emplace(kGroupRanks, format_list(groupRanks, truncate)); #endif // USE_DISTRIBUTED return map; } diff --git a/torch/csrc/profiler/util.h b/torch/csrc/profiler/util.h index c8216c93f41c..3c995b49e602 100644 --- a/torch/csrc/profiler/util.h +++ b/torch/csrc/profiler/util.h @@ -100,7 +100,7 @@ TORCH_API std::vector inputTypes(const at::RecordFunction& fn); std::unordered_map TORCH_API saveExtraArgs(const at::RecordFunction& fn); std::unordered_map TORCH_API -saveNcclMeta(const at::RecordFunction& fn); +saveNcclMeta(const at::RecordFunction& fn, bool truncate = true); uint64_t TORCH_API computeFlops( const std::string& op_name, @@ -157,6 +157,21 @@ struct HashCombine { } }; +#ifdef USE_DISTRIBUTED +constexpr auto kCommsName = "Collective name"; +constexpr auto kDtype = "dtype"; +constexpr auto kInMsgNelems = "In msg nelems"; +constexpr auto kOutMsgNelems = "Out msg nelems"; +constexpr auto kInSplit = "In split size"; +constexpr auto kOutSplit = "Out split size"; 
+constexpr auto kGlobalRankStart = "Global rank start";
+constexpr auto kGlobalRankStride = "Global rank stride";
+constexpr auto kGroupSize = "Group size";
+constexpr auto kProcessGroupName = "Process Group Name";
+constexpr auto kProcessGroupDesc = "Process Group Description";
+constexpr auto kGroupRanks = "Process Group Ranks";
+#endif // USE_DISTRIBUTED
+
 } // namespace impl
 } // namespace profiler
 } // namespace torch
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
index 1344de8b9fde..1cc88e8adc57 100644
--- a/torch/cuda/__init__.py
+++ b/torch/cuda/__init__.py
@@ -154,6 +154,14 @@ def _sleep(cycles):
     torch._C._cuda_sleep(cycles)
 
 
+def _extract_arch_version(arch_string: str):
+    """Extract the architecture version number from an arch string such as "sm_86"."""
+    base = arch_string.split("_")[1]
+    if base.endswith("a"):
+        base = base[:-1]
+    return int(base)
+
+
 def _check_capability():
     incorrect_binary_warn = """
     Found GPU%d %s which requires CUDA_VERSION >= %d to
@@ -177,7 +185,7 @@ def _check_capability():
             name = get_device_name(d)
             current_arch = major * 10 + minor
             min_arch = min(
-                (int(arch.split("_")[1]) for arch in torch.cuda.get_arch_list()),
+                (_extract_arch_version(arch) for arch in torch.cuda.get_arch_list()),
                 default=35,
             )
             if current_arch < min_arch:
@@ -198,7 +206,7 @@ def _check_cubins():
     arch_list = get_arch_list()
     if len(arch_list) == 0:
         return
-    supported_sm = [int(arch.split("_")[1]) for arch in arch_list if "sm_" in arch]
+    supported_sm = [_extract_arch_version(arch) for arch in arch_list if "sm_" in arch]
     for idx in range(device_count()):
         cap_major, cap_minor = get_device_capability(idx)
         # NVIDIA GPU compute architectures are backward compatible within major version
@@ -1336,7 +1344,6 @@ def addmm_kernel_impl(*args, **kwargs):
     "DeferredCudaCallError",
     "Event",
     "ExternalStream",
-    "OutOfMemoryError",
     "Stream",
     "StreamContext",
     "amp",
diff --git a/torch/cuda/amp/autocast_mode.py b/torch/cuda/amp/autocast_mode.py
index 88ff04d86648..47f1f8e6eb00 100644
--- a/torch/cuda/amp/autocast_mode.py
+++ b/torch/cuda/amp/autocast_mode.py
@@ -1,5 +1,6 @@
 import collections
 import functools
+import warnings
 
 import torch
 
@@ -17,7 +18,7 @@ class autocast(torch.amp.autocast_mode.autocast):
     r"""See :class:`torch.autocast`.
 
-    ``torch.cuda.amp.autocast(args...)`` is equivalent to ``torch.autocast("cuda", args...)``
+    ``torch.cuda.amp.autocast(args...)`` is deprecated. Please use ``torch.amp.autocast("cuda", args...)`` instead.
     """
 
     def __init__(
@@ -31,6 +32,10 @@ def __init__(
             self.device = "cuda"
             self.fast_dtype = dtype
             return
+        warnings.warn(
+            "torch.cuda.amp.autocast(args...) is deprecated. Please use torch.amp.autocast('cuda', args...)
instead.", + DeprecationWarning, + ) super().__init__( "cuda", enabled=enabled, dtype=dtype, cache_enabled=cache_enabled ) diff --git a/torch/cuda/amp/grad_scaler.py b/torch/cuda/amp/grad_scaler.py index 0ebaa9bced2c..4defb9d3b160 100644 --- a/torch/cuda/amp/grad_scaler.py +++ b/torch/cuda/amp/grad_scaler.py @@ -1,7 +1,6 @@ import torch -from torch.amp.grad_scaler import OptState -__all__ = ["GradScaler", "OptState"] +__all__ = ["GradScaler"] class GradScaler(torch.amp.GradScaler): diff --git a/torch/cuda/graphs.py b/torch/cuda/graphs.py index b3bfbab6ad16..9d9df283ced6 100644 --- a/torch/cuda/graphs.py +++ b/torch/cuda/graphs.py @@ -1,5 +1,5 @@ import gc -from typing import Optional +import typing import torch from torch.utils import _pytree @@ -142,7 +142,7 @@ class graph: https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html#group__CUDART__STREAM_1g9d0535d93a214cbf126835257b16ba85 """ # noqa: B950 - default_capture_stream: Optional["torch.cuda.Stream"] = None + default_capture_stream: typing.Optional["torch.cuda.Stream"] = None def __init__( self, diff --git a/torch/cuda/streams.py b/torch/cuda/streams.py index 22d541f4e287..d36121381586 100644 --- a/torch/cuda/streams.py +++ b/torch/cuda/streams.py @@ -36,7 +36,7 @@ def __new__(cls, device=None, priority=0, **kwargs): with torch.cuda.device(device): return super().__new__(cls, priority=priority, **kwargs) - def wait_event(self, event): + def wait_event(self, event) -> None: r"""Make all future work submitted to the stream wait for an event. Args: @@ -53,7 +53,7 @@ def wait_event(self, event): """ event.wait(self) - def wait_stream(self, stream): + def wait_stream(self, stream) -> None: r"""Synchronize with another stream. All future work submitted to this stream will wait until all kernels @@ -82,7 +82,7 @@ def record_event(self, event=None): event.record(self) return event - def query(self): + def query(self) -> bool: r"""Check if all the work submitted has been completed. Returns: @@ -90,7 +90,7 @@ def query(self): """ return super().query() - def synchronize(self): + def synchronize(self) -> None: r"""Wait for all the kernels in this stream to complete. .. note:: This is a wrapper around ``cudaStreamSynchronize()``: see @@ -102,7 +102,7 @@ def synchronize(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_stream) - def __eq__(self, o): + def __eq__(self, o) -> bool: if isinstance(o, Stream): return super().__eq__(o) return False @@ -128,7 +128,7 @@ class ExternalStream(Stream): stream_ptr(int): Integer representation of the `cudaStream_t` value. allocated externally. device(torch.device or int, optional): the device where the stream - was originally allocated. if device is specified incorrectly, + was originally allocated. If device is specified incorrectly, subsequent launches using this stream may fail. """ @@ -183,7 +183,7 @@ def record(self, stream=None): stream = torch.cuda.current_stream() super().record(stream) - def wait(self, stream=None): + def wait(self, stream=None) -> None: r"""Make all future work submitted to the given stream wait for this event. Use ``torch.cuda.current_stream()`` if no stream is specified. @@ -212,7 +212,7 @@ def elapsed_time(self, end_event): """ return super().elapsed_time(end_event) - def synchronize(self): + def synchronize(self) -> None: r"""Wait for the event to complete. Waits until the completion of all work currently captured in this event. 
@@ -234,7 +234,7 @@ def ipc_handle(self): def _as_parameter_(self): return ctypes.c_void_p(self.cuda_event) - def __repr__(self): + def __repr__(self) -> str: if self.cuda_event: return f"<torch.cuda.Event {self._as_parameter_.value:#x}>" else: return "<torch.cuda.Event uninitialized>" diff --git a/torch/distributed/__init__.py b/torch/distributed/__init__.py index eb7a690fa958..3e7dce97b54c 100644 --- a/torch/distributed/__init__.py +++ b/torch/distributed/__init__.py @@ -54,6 +54,8 @@ def is_available() -> bool: set_debug_level, set_debug_level_from_env, _make_nccl_premul_sum, + _ControlCollectives, + _StoreCollectives, ) class _DistributedPdb(pdb.Pdb): diff --git a/torch/distributed/_composable/fsdp/_fsdp_collectives.py b/torch/distributed/_composable/fsdp/_fsdp_collectives.py index f27970315159..b7264cb34d6d 100644 --- a/torch/distributed/_composable/fsdp/_fsdp_collectives.py +++ b/torch/distributed/_composable/fsdp/_fsdp_collectives.py @@ -125,9 +125,11 @@ def foreach_reduce( orig_dtype: torch.dtype, reduce_dtype: Optional[torch.dtype], device: torch.device, - all_reduce_group: Optional[dist.ProcessGroup], + all_reduce_group: Optional[dist.ProcessGroup], # non-`None` iff HSDP all_reduce_stream: torch.cuda.Stream, -) -> torch.cuda.Event: + all_reduce_grads: bool, + partial_reduce_output: Optional[torch.Tensor], # only used for HSDP +) -> Tuple[torch.cuda.Event, Optional[torch.Tensor]]: """ ``unsharded_grads`` owns the references to the gradients computed by autograd, so clearing the list frees the gradients. """ @@ -163,36 +165,43 @@ def foreach_reduce( # computed in the default stream current_stream.wait_stream(reduce_scatter_stream) unsharded_grads.clear() - post_reduce_output = reduce_scatter_input.new_empty( - (reduce_scatter_output_numel,) - ) + reduce_output = reduce_scatter_input.new_empty((reduce_scatter_output_numel,)) _div_if_needed(reduce_scatter_input, predivide_factor) dist.reduce_scatter_tensor( - output=post_reduce_output, + output=reduce_output, input=reduce_scatter_input, group=reduce_scatter_group, op=ReduceOp.AVG if predivide_factor is None else ReduceOp.SUM, ) - view_out_stream = reduce_scatter_stream - if all_reduce_group is not None: - view_out_stream = all_reduce_stream - all_reduce_stream.wait_stream(reduce_scatter_stream) - with torch.cuda.stream(all_reduce_stream): - dist.all_reduce( - post_reduce_output, - group=all_reduce_group, - op=ReduceOp.AVG if predivide_factor is None else ReduceOp.SUM, - ) - with torch.cuda.stream(view_out_stream): - _div_if_needed(post_reduce_output, postdivide_factor) - post_reduce_output = _to_dtype_if_needed(post_reduce_output, orig_dtype) - # - View out and accumulate + post_reduce_stream = reduce_scatter_stream + if all_reduce_group is not None: # HSDP + # Accumulations must run in the reduce-scatter stream + if not all_reduce_grads: + if partial_reduce_output is not None: + partial_reduce_output += reduce_output + else: + partial_reduce_output = reduce_output + return post_reduce_stream.record_event(), partial_reduce_output + if partial_reduce_output is not None: + reduce_output += partial_reduce_output + post_reduce_stream = all_reduce_stream + all_reduce_stream.wait_stream(reduce_scatter_stream) + with torch.cuda.stream(all_reduce_stream): + dist.all_reduce( + reduce_output, + group=all_reduce_group, + op=ReduceOp.AVG if predivide_factor is None else ReduceOp.SUM, + ) + with torch.cuda.stream(post_reduce_stream): + _div_if_needed(reduce_output, postdivide_factor) + reduce_output = _to_dtype_if_needed(reduce_output, orig_dtype) + # View out and accumulate sharded gradients flat_grad_offset = 0 # [0, 
reduce_scatter_output_numel - 1] for padded_unsharded_size, fsdp_param in zip( padded_unsharded_sizes, fsdp_params ): new_sharded_grad = torch.as_strided( - post_reduce_output, + reduce_output, size=fsdp_param.sharded_size, stride=fsdp_param.contiguous_sharded_stride, storage_offset=flat_grad_offset, @@ -220,12 +229,12 @@ def foreach_reduce( fsdp_param.sharded_param.grad = new_sharded_dtensor_grad padded_sharded_numel = padded_unsharded_size.numel() // world_size flat_grad_offset += padded_sharded_numel - post_reduce_view_out_event = view_out_stream.record_event() + post_reduce_event = post_reduce_stream.record_event() # The RS output is allocated in the RS stream and used in the default # stream (for optimizer). To ensure its memory is not reused for later # RSs, we do not need extra synchronization since the sharded parameters # hold refs through the end of backward. - return post_reduce_view_out_event + return post_reduce_event, None def foreach_reduce_scatter_copy_in( diff --git a/torch/distributed/_composable/fsdp/_fsdp_common.py b/torch/distributed/_composable/fsdp/_fsdp_common.py index 94b024917769..1395e3487847 100644 --- a/torch/distributed/_composable/fsdp/_fsdp_common.py +++ b/torch/distributed/_composable/fsdp/_fsdp_common.py @@ -117,20 +117,29 @@ def _from_local_no_grad( global_stride: Tuple[int, ...], ) -> DTensor: """ - This method is similar to ``DTensor.from_local()`` except it avoids some - CPU overhead by avoiding default args and not being differentiable. + This method is similar to ``DTensor.from_local()`` except that in eager mode + it avoids some CPU overhead by avoiding default args and not being differentiable. """ - return DTensor( - # Use the local tensor directly instead of constructing a new tensor - # variable, e.g. with `view_as()`, since this is not differentiable - local_tensor, - device_mesh, - placements, - shape=global_size, - dtype=local_tensor.dtype, - requires_grad=local_tensor.requires_grad, - stride=global_stride, - ) + if not torch._dynamo.compiled_autograd.compiled_autograd_enabled: + return DTensor( + # Use the local tensor directly instead of constructing a new tensor + # variable, e.g. 
with `view_as()`, since this is not differentiable + local_tensor, + device_mesh, + placements, + shape=global_size, + dtype=local_tensor.dtype, + requires_grad=local_tensor.requires_grad, + stride=global_stride, + ) + else: + return DTensor.from_local( + local_tensor, + device_mesh, + placements, + shape=global_size, + stride=global_stride, + ) def _to_dtype_if_needed( diff --git a/torch/distributed/_composable/fsdp/_fsdp_param.py b/torch/distributed/_composable/fsdp/_fsdp_param.py index 736a3789e823..f0d64aa3e8f1 100644 --- a/torch/distributed/_composable/fsdp/_fsdp_param.py +++ b/torch/distributed/_composable/fsdp/_fsdp_param.py @@ -245,7 +245,7 @@ def _init_sharded_param(self, param: nn.Parameter, device: torch.device): self.padded_sharded_param_size = padded_sharded_param.size() if sharded_param.numel() > 0: padded_sharded_param[: sharded_param.size(0)].copy_(sharded_param) - if self.offload_to_cpu: + if self.offload_to_cpu and not padded_sharded_param.is_meta: padded_sharded_param = padded_sharded_param.cpu() if self.pin_memory: padded_sharded_param = padded_sharded_param.pin_memory() @@ -584,6 +584,8 @@ def reset_sharded_param(self): ) self.sharded_param = new_param local_tensor = new_param._local_tensor + if local_tensor.is_meta: + return padded_sharded_size = self.padded_sharded_param_size if local_tensor.size() != padded_sharded_size: padded_local_tensor = local_tensor.new_zeros(padded_sharded_size) diff --git a/torch/distributed/_composable/fsdp/_fsdp_param_group.py b/torch/distributed/_composable/fsdp/_fsdp_param_group.py index 9e9813102db3..ea2307222ce1 100644 --- a/torch/distributed/_composable/fsdp/_fsdp_param_group.py +++ b/torch/distributed/_composable/fsdp/_fsdp_param_group.py @@ -138,11 +138,15 @@ def __init__( # Holds the reduce-scatter/all-reduce view-out CUDA event that marks the end of # the group's post-backward (e.g. 
reduce-scatter, all-reduce and div), which # should be waited on at the end of backward - self._post_reduce_view_out_event: Optional[torch.cuda.Event] = None + self._post_reduce_event: Optional[torch.cuda.Event] = None # Holds the reshard-after-forward CUDA event when resharding to a # different world size, which should be waited on in the next unshard self._reshard_after_forward_event: Optional[torch.cuda.Event] = None + # Only for HSDP, if accumulating gradients without all-reduce, save the + # partial reduce output (only reduce-scattered but not all-reduced) + self._partial_reduce_output: Optional[torch.Tensor] = None + # Initialization # def _init_mp_dtypes(self) -> None: for fsdp_param in self.fsdp_params: @@ -273,6 +277,8 @@ def _record_post_forward(self) -> None: self._post_forward_indices.append(post_forward_index) def pre_backward(self, *unused: Any): + if self._training_state == TrainingState.PRE_BACKWARD: + return with torch.profiler.record_function("FSDP::pre_backward"): self._training_state = TrainingState.PRE_BACKWARD self.unshard() # no-op if prefetched @@ -311,7 +317,7 @@ def post_backward(self, *unused: Any): if len(fsdp_params_with_grad) == 0: return with torch.profiler.record_function("FSDP::post_backward_reduce"): - self._post_reduce_view_out_event = foreach_reduce( + self._post_reduce_event, self._partial_reduce_output = foreach_reduce( fsdp_params_with_grad, unsharded_grads, self._reduce_scatter_process_group, @@ -319,16 +325,16 @@ def post_backward(self, *unused: Any): self._orig_dtype, self._reduce_dtype, self.device, - self._all_reduce_process_group - if self._is_hsdp and self.all_reduce_grads - else None, + self._all_reduce_process_group if self._is_hsdp else None, self.comm_ctx.all_reduce_stream, + self.all_reduce_grads, + self._partial_reduce_output, ) def finalize_backward(self): - if self._post_reduce_view_out_event is not None: - torch.cuda.current_stream().wait_event(self._post_reduce_view_out_event) - self._post_reduce_view_out_event = None + if self._post_reduce_event is not None: + torch.cuda.current_stream().wait_event(self._post_reduce_event) + self._post_reduce_event = None for fsdp_param in self.fsdp_params: if fsdp_param.grad_offload_event is not None: fsdp_param.grad_offload_event.synchronize() diff --git a/torch/distributed/_composable/fsdp/_fsdp_state.py b/torch/distributed/_composable/fsdp/_fsdp_state.py index bab24c283063..15a00e83f086 100644 --- a/torch/distributed/_composable/fsdp/_fsdp_state.py +++ b/torch/distributed/_composable/fsdp/_fsdp_state.py @@ -5,7 +5,6 @@ import torch import torch.nn as nn from torch.autograd import Variable -from torch.autograd.graph import register_multi_grad_hook from torch.distributed._composable_state import ( _get_module_state, _insert_module_state, @@ -201,11 +200,12 @@ def _post_forward(self, module: nn.Module, input: Any, output: Any) -> Any: ) return output - def _pre_backward(self, *unused: Any) -> None: + def _pre_backward(self, grad: torch.Tensor) -> torch.Tensor: self._training_state = TrainingState.PRE_BACKWARD self._register_root_post_backward_final_callback() if self._fsdp_param_group: - self._fsdp_param_group.pre_backward(*unused) + self._fsdp_param_group.pre_backward() + return grad def _root_post_backward_final_callback(self) -> None: with torch.profiler.record_function("FSDP::root_post_backward_callback"): @@ -235,7 +235,8 @@ def _register_pre_backward_hook(self, output: Any) -> Any: t for t in flat_outputs if (torch.is_tensor(t) and t.requires_grad) ) if tensors: - 
register_multi_grad_hook(tensors, self._pre_backward, mode="any") + for tensor in tensors: + tensor.register_hook(self._pre_backward) return output def _register_root_post_backward_final_callback(self): diff --git a/torch/distributed/_composable/fsdp/fully_shard.py b/torch/distributed/_composable/fsdp/fully_shard.py index a5204701731c..981b82987462 100644 --- a/torch/distributed/_composable/fsdp/fully_shard.py +++ b/torch/distributed/_composable/fsdp/fully_shard.py @@ -208,7 +208,7 @@ def set_is_last_backward(self, is_last_backward: bool) -> None: state._state_ctx.is_last_backward = is_last_backward def set_requires_gradient_sync( - self, requires_gradient_sync: bool, recurse: bool = True + self, requires_gradient_sync: bool, *, recurse: bool = True ) -> None: """ Sets if the module should sync gradients. This can be used to implement @@ -231,16 +231,13 @@ def set_requires_gradient_sync( fsdp_param_group.all_reduce_grads = requires_gradient_sync def set_requires_all_reduce( - self, requires_all_reduce: bool, recurse: bool = True + self, requires_all_reduce: bool, *, recurse: bool = True ) -> None: """ Sets if the module should all-reduce gradients. This can be used to implement gradient accumulation with only reduce-scatter but not all-reduce for HSDP. """ - # TODO: post_reduce_output += fsdp_param.sharded_param.grad - # after reduce-scatter and before all-reduce - raise NotImplementedError("requires_all_reduce is not yet supported in HSDP") self_module = cast(nn.Module, self) modules = list(self_module.modules()) if recurse else [self_module] for module in modules: @@ -250,7 +247,7 @@ def set_requires_all_reduce( fsdp_param_group.all_reduce_grads = requires_all_reduce def set_reshard_after_backward( - self, reshard_after_backward: bool, recurse: bool = True + self, reshard_after_backward: bool, *, recurse: bool = True ) -> None: """ Sets if the module should reshard parameters after backward. 
This can diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py index b1250eddf037..8d598713cf50 100644 --- a/torch/distributed/_functional_collectives.py +++ b/torch/distributed/_functional_collectives.py @@ -894,6 +894,12 @@ def _all_to_all_single_meta( return input.new_empty(out_size) +def _all_gather_into_tensor_out_native_meta(input, group_size, group_name, *, out): + shape = list(input.size()) + shape[0] *= group_size + return input.new_empty(shape) + + def _all_gather_into_tensor_native_meta(input, group_size, group_name): shape = list(input.size()) shape[0] *= group_size @@ -932,6 +938,9 @@ def _reduce_scatter_tensor_coalesced_native_meta( lib_impl.impl("all_reduce_coalesced", _all_reduce_coalesced_meta, "Meta") lib_impl.impl("all_reduce_coalesced_", _all_reduce_coalesced__meta, "Meta") lib_impl.impl("wait_tensor", _wait_tensor_meta, "Meta") + lib_impl.impl( + "all_gather_into_tensor_out", _all_gather_into_tensor_out_native_meta, "Meta" + ) lib_impl.impl("all_gather_into_tensor", _all_gather_into_tensor_native_meta, "Meta") lib_impl.impl( "all_gather_into_tensor_coalesced", diff --git a/torch/distributed/_spmd/batch_dim_utils.py b/torch/distributed/_spmd/batch_dim_utils.py index afb9dd2e7d3b..6d36b2e38118 100644 --- a/torch/distributed/_spmd/batch_dim_utils.py +++ b/torch/distributed/_spmd/batch_dim_utils.py @@ -9,11 +9,7 @@ from torch import Tensor from torch.distributed._tensor import DeviceMesh, Replicate, Shard -from torch.distributed._tensor.ops.view_ops import ( - DimSpec, - InputDim, - ops as view_op_rules, -) +from torch.distributed._tensor.ops.view_ops import dim_maps, DimSpec, InputDim from torch.distributed._tensor.placement_types import _Partial, DTensorSpec aten = torch.ops.aten @@ -80,12 +76,12 @@ def compute_batch_dim(self, node: fx.Node, full_reduction=False) -> int: return self.batch_dim_map[node] if node.target in self.dim_rule_map: - view_op_rule = view_op_rules[self.dim_rule_map[node.target]] # type: ignore[index] + dim_map = dim_maps[self.dim_rule_map[node.target]] # type: ignore[index] args_val = pytree.tree_map_only(fx.Node, lambda n: n.meta["val"], node.args) kwargs_val = pytree.tree_map_only( fx.Node, lambda n: n.meta["val"], node.kwargs ) - output_dim_rules = view_op_rule.dim_map(*args_val, **kwargs_val) + output_dim_rules = dim_map(*args_val, **kwargs_val) def collect_input_dim(cmd: DimSpec, input_dims: Set[int]): if isinstance(cmd, InputDim): diff --git a/torch/distributed/_tensor/__init__.py b/torch/distributed/_tensor/__init__.py index 3e5e628b0522..7c391c4821aa 100644 --- a/torch/distributed/_tensor/__init__.py +++ b/torch/distributed/_tensor/__init__.py @@ -10,6 +10,13 @@ from torch.distributed._tensor.ops.utils import normalize_to_torch_size from torch.distributed._tensor.placement_types import Placement, Replicate, Shard from torch.distributed.device_mesh import _mesh_resources, DeviceMesh, init_device_mesh +from torch.optim.optimizer import ( + _foreach_supported_types as _optim_foreach_supported_types, +) +from torch.utils._foreach_utils import ( + _foreach_supported_types as _util_foreach_supported_types, +) + # All public APIs from dtensor package __all__ = [ @@ -23,6 +30,15 @@ ] +# Append DTensor to the list of supported types for foreach implementation for optimizer +# and clip_grad_norm_ so that we will try to use foreach over the for-loop implementation on CUDA. 
+if DTensor not in _optim_foreach_supported_types: + _optim_foreach_supported_types.append(DTensor) + +if DTensor not in _util_foreach_supported_types: + _util_foreach_supported_types.append(DTensor) + + def _dtensor_init_helper( init_op, size: torch.Size, diff --git a/torch/distributed/_tensor/op_schema.py b/torch/distributed/_tensor/op_schema.py index 7d5bd691395b..4918bffec621 100644 --- a/torch/distributed/_tensor/op_schema.py +++ b/torch/distributed/_tensor/op_schema.py @@ -161,6 +161,14 @@ def output_ndim(self): def output_shape(self): return self.strategies[0].output_spec.shape + @property + def ndim(self): + return self.output_ndim + + @property + def shape(self): + return self.output_shape + class TupleStrategy(StrategyType): """ diff --git a/torch/distributed/_tensor/ops/view_ops.py b/torch/distributed/_tensor/ops/view_ops.py index be72cc9509f5..598c973170f4 100644 --- a/torch/distributed/_tensor/ops/view_ops.py +++ b/torch/distributed/_tensor/ops/view_ops.py @@ -16,23 +16,24 @@ import torch from torch import Tensor -from torch._subclasses.fake_tensor import unset_fake_temporarily -from torch.distributed._tensor._utils import compute_local_shape from torch.distributed._tensor.api import Shard from torch.distributed._tensor.op_schema import ( OpSchema, - OutputSharding, + OpStrategy, + PlacementStrategy, RuntimeSchemaInfo, + StrategyType, ) from torch.distributed._tensor.ops.utils import ( + generate_redistribute_costs, normalize_dim, normalize_dims, prod, - register_prop_rule, + register_op_strategy, ) from torch.distributed._tensor.placement_types import DTensorSpec, Placement, Replicate -from torch.fx.experimental.proxy_tensor import disable_proxy_modes_tracing +from torch.distributed.device_mesh import DeviceMesh aten = torch.ops.aten @@ -454,68 +455,41 @@ def dim_reduction( ) -@dataclass -class Op: - dim_map: Callable[..., DimMap] - shape_argnum: Optional[int] = None - - -ops: Dict[Callable[..., torch.Tensor], Op] = { - torch.atleast_1d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 1)), - torch.atleast_2d: Op(dim_map=lambda x: dim_pad_left(x.ndim, 2)), - torch.atleast_3d: Op(dim_map=lambda x: dim_atleast_3d(x.ndim)), - torch.broadcast_to: Op( - dim_map=lambda input, shape: expand(input.shape, shape), shape_argnum=1 - ), - Tensor.expand: Op( - dim_map=lambda self, *sizes: expand(self.shape, normalize_sizes(sizes)), - shape_argnum=1, - ), - torch.flatten: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)), - torch.movedim: Op( - dim_map=lambda input, source, destination: dim_movedim( - input.ndim, source, destination - ) - ), - torch.permute: Op( - dim_map=lambda input, dims: tuple( - InputDim(i) for i in normalize_dims(dims, input.ndim) - ) - ), - torch.ravel: Op(dim_map=lambda tensor: dim_flatten(tensor.ndim)), - Tensor.repeat: Op(dim_map=lambda self, *sizes: dim_repeat(self.ndim, sizes)), - torch.reshape: Op( - dim_map=lambda input, shape: view_groups(input.shape, shape), - shape_argnum=1, +dim_maps: Dict[Callable[..., torch.Tensor], Callable[..., DimMap]] = { + torch.atleast_1d: lambda x: dim_pad_left(x.ndim, 1), + torch.atleast_2d: lambda x: dim_pad_left(x.ndim, 2), + torch.atleast_3d: lambda x: dim_atleast_3d(x.ndim), + torch.broadcast_to: lambda input, shape: expand(input.shape, shape), + Tensor.expand: lambda self, *sizes: expand(self.shape, normalize_sizes(sizes)), + torch.flatten: lambda tensor: dim_flatten(tensor.ndim), + torch.movedim: lambda input, source, destination: dim_movedim( + input.ndim, source, destination ), - torch.squeeze: Op(dim_map=lambda input, 
dim=None: dim_squeeze(input.shape, dim)), - torch.tile: Op(dim_map=lambda input, dims: dim_tile(input.ndim, dims)), - torch.transpose: Op( - dim_map=lambda input, dim0, dim1: dim_transpose(input.ndim, dim0, dim1) + torch.permute: lambda input, dims: tuple( + InputDim(i) for i in normalize_dims(dims, input.ndim) ), - torch.unsqueeze: Op(dim_map=lambda input, dim: dim_unsqueeze(input.ndim, dim)), - Tensor.view: Op( - dim_map=lambda input, *shape: view_groups(input.shape, shape), - shape_argnum=1, - ), - torch.view_as_complex: Op( - dim_map=lambda input: dim_flatten(input.ndim, input.ndim - 2) - ), - torch.view_as_real: Op(dim_map=lambda input: dim_view_as_real(input.shape)), + torch.ravel: lambda tensor: dim_flatten(tensor.ndim), + Tensor.repeat: lambda self, *sizes: dim_repeat(self.ndim, sizes), + torch.reshape: lambda input, shape: view_groups(input.shape, shape), + torch.squeeze: lambda input, dim=None: dim_squeeze(input.shape, dim), + torch.tile: lambda input, dims: dim_tile(input.ndim, dims), + torch.transpose: lambda input, dim0, dim1: dim_transpose(input.ndim, dim0, dim1), + torch.unsqueeze: lambda input, dim: dim_unsqueeze(input.ndim, dim), + Tensor.view: lambda input, *shape: view_groups(input.shape, shape), + torch.view_as_complex: lambda input: dim_flatten(input.ndim, input.ndim - 2), + torch.view_as_real: lambda input: dim_view_as_real(input.shape), } def propagate_shape_and_sharding( - in_shard: Sequence[Placement], + input_src_placements: Sequence[Placement], local_in_shape: Shape, rule: DimMap, mesh_sizes: Shape, -) -> Tuple[Shape, Optional[Sequence[Placement]], torch.Tensor]: +) -> Tuple[Sequence[Placement], Sequence[Placement]]: """ - Determine output sharding and tensor shape based on given global tensor shape and input sharding. - - Takes as input the global shape of the tensor, and the input sharding, - and produce corresponding output sharding and shape of the output tensor. + Determine input target sharding and output sharding based on + given global tensor shape and input source sharding. Sharding propagation follows mapped dimensions: - An output dimension that maps directly to an input dimension is sharded equally @@ -524,16 +498,13 @@ def propagate_shape_and_sharding( - An output dimension that is a split of the input dimension can only be sharded if the leftmost split size is divisible by the mesh dimension """ - assert len(in_shard) == len(mesh_sizes) - sharded_in_dims: Set[int] = {s.dim for s in in_shard if isinstance(s, Shard)} + assert len(input_src_placements) == len(mesh_sizes) # for each input dim, for each mesh dim, provides a list of possible shardable dimensions - shardable_dims: torch.Tensor = torch.ones( - (len(local_in_shape), len(mesh_sizes)), dtype=torch.bool - ) + mesh_ndim = len(mesh_sizes) + shardable_dims: Dict[int, List[bool]] = {} # in case an input dimension disappears (e.g. 
collapsing, reduction) # we cannot shard in that dimension (we need a replication fall-back rule) - seen_input_dims: Set[int] = set() def collect_used_inputs(cmd: DimSpec) -> None: @@ -545,28 +516,19 @@ def collect_used_inputs(cmd: DimSpec) -> None: for cmd in rule: collect_used_inputs(cmd) for dim in range(len(local_in_shape)): - shardable_dims[dim, :] = dim in seen_input_dims + shardable_dims[dim] = [dim in seen_input_dims] * mesh_ndim - def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]: + def get_in_dim_to_shard(cmd: DimSpec) -> Optional[InputDim]: if isinstance(cmd, InputDim): - seen_input_dims.add(cmd.input_dim) - return ( - local_in_shape[cmd.input_dim], - cmd if cmd.input_dim in sharded_in_dims else None, - ) + return cmd elif isinstance(cmd, Flatten): for dim in cmd.input_dims[1:]: if isinstance(dim, InputDim): - shardable_dims[dim.input_dim, :] = False + shardable_dims[dim.input_dim] = [False] * mesh_ndim dim0 = cmd.input_dims[0] - return ( - prod(get_dim_size(a)[0] for a in cmd.input_dims), - dim0 - if isinstance(dim0, InputDim) and dim0.input_dim in sharded_in_dims - else None, - ) + return dim0 if isinstance(dim0, InputDim) else None elif isinstance(cmd, Split): - _, in_dim = get_dim_size(cmd.input_dim) + in_dim = get_in_dim_to_shard(cmd.input_dim) out_size = cmd.group_shape[cmd.split_id] if cmd.split_id == 0 and in_dim is not None: # we need to check that the input dimension is divisible @@ -579,14 +541,13 @@ def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]: # but we will allow it if that's the input and it's compatible # 1. is this dimension shardable on each individual mesh dim? - for mesh_dim, mesh_dim_size in enumerate(mesh_sizes): - shardable_dims[in_dim.input_dim, mesh_dim] = ( - out_size % mesh_dim_size == 0 - ) + shardable_dims[in_dim.input_dim] = [ + out_size % mesh_dim_size == 0 for mesh_dim_size in mesh_sizes + ] # 2. here we special case things like [Shard(0), Shard(0)] submesh_size = 1 - for size, shard in zip(mesh_sizes, in_shard): + for size, shard in zip(mesh_sizes, input_src_placements): if isinstance(shard, Shard) and shard.dim == in_dim: submesh_size *= size assert ( @@ -594,158 +555,113 @@ def get_dim_size(cmd: DimSpec) -> Tuple[int, Optional[InputDim]]: ), f"Resulting dimension size {out_size} is not divisible by its mesh dimension {submesh_size}." 
# we will only shard our first component of the split - return out_size, in_dim if cmd.split_id == 0 else None - elif isinstance(cmd, Singleton): - return 1, None - elif isinstance(cmd, Broadcast): - return cmd.dim_size, None - elif isinstance(cmd, NewDim): - return cmd.size, None + return in_dim if cmd.split_id == 0 else None elif isinstance(cmd, Repeat): - size, in_dim = get_dim_size(cmd.input_dim) + in_dim = get_in_dim_to_shard(cmd.input_dim) if in_dim is not None: - shardable_dims[in_dim.input_dim, :] = False - return size * cmd.times, None + shardable_dims[in_dim.input_dim] = [False] * mesh_ndim + return None else: - raise RuntimeError(f"cmd not found: {cmd}, in rule: {rule}") + return None - dim_map = {} - out_shape = [] + # for each output dim, find the corresponding input dim in terms of sharding prop + shard_dim_map = {} for dim, cmd in enumerate(rule): - out_size, in_dim = get_dim_size(cmd) - out_shape.append(out_size) + in_dim = get_in_dim_to_shard(cmd) if in_dim is not None: - dim_map[in_dim.input_dim] = dim + shard_dim_map[in_dim.input_dim] = dim - needs_reshard = any( - isinstance(placement, Shard) and not shardable_dims[placement.dim][mesh_dim] - for mesh_dim, placement in enumerate(in_shard) - ) - - output_placements = ( - None - if needs_reshard - else [Shard(dim_map[s.dim]) if isinstance(s, Shard) else s for s in in_shard] - ) + input_tgt_placements = [ + Replicate() + if isinstance(p, Shard) and not shardable_dims[p.dim][mesh_dim] + else p + for mesh_dim, p in enumerate(input_src_placements) + ] + output_placements = [ + Shard(shard_dim_map[p.dim]) if isinstance(p, Shard) else p + for p in input_tgt_placements + ] - return (tuple(out_shape), output_placements, shardable_dims) + return input_tgt_placements, output_placements -def register_prop_rule_map( +def register_op_strategy_map( aten_op_overload: torch._ops.OpOverload, local_op_name: Callable[..., torch.Tensor], schema_info: Optional[RuntimeSchemaInfo] = None, ) -> None: - spec: Op = ops[local_op_name] - - @register_prop_rule(aten_op_overload, schema_info=schema_info) - def reshape_prop(op_schema: OpSchema) -> OutputSharding: - rules = spec.dim_map(*op_schema.args_schema, **op_schema.kwargs_schema) - input_dtensor_spec = cast(DTensorSpec, op_schema.args_schema[0]) - mesh = input_dtensor_spec.mesh - - assert isinstance( - input_dtensor_spec, DTensorSpec - ), "Expected first input to be a DTensorSpec" - global_in_shape = input_dtensor_spec.shape + dim_map: Callable[..., DimMap] = dim_maps[local_op_name] + + @register_op_strategy(aten_op_overload, schema_info=schema_info) + def reshape_strategy(mesh: DeviceMesh, op_schema: OpSchema) -> StrategyType: + rules = dim_map(*op_schema.args_schema, **op_schema.kwargs_schema) + input_strategy = cast(OpStrategy, op_schema.args_schema[0]) + global_in_shape = input_strategy.output_shape assert global_in_shape is not None, "Shape required." 
- with disable_proxy_modes_tracing(), unset_fake_temporarily(): - ( - global_out_shape, - shard_out, - shardable_dims, - ) = propagate_shape_and_sharding( - input_dtensor_spec.placements, + output_strategy = OpStrategy([]) + for input_placement_strategy in input_strategy.strategies: + input_src_spec = input_placement_strategy.output_spec + + input_tgt_placements, output_placements = propagate_shape_and_sharding( + input_src_spec.placements, tuple(global_in_shape), rules, mesh.shape, ) - if shard_out is not None: - # no reshard needed - output_dtensor_spec = DTensorSpec(mesh=mesh, placements=tuple(shard_out)) - - # We only need the local shape to lower the call into the local op - args = op_schema.args_schema - shape_argnum = spec.shape_argnum - if shape_argnum is not None: - # compute the local shape from the global shape, then return - # a resharding even if we don't really reshard, the only reason - # for this type of resharding is to lower the global shape to - # local shape - local_out_shape = compute_local_shape( - list(global_out_shape), mesh, shard_out - ) - - suggested_schema = OpSchema( - op=op_schema.op, - args_schema=args[:shape_argnum] - + (tuple(local_out_shape),) - + args[shape_argnum + 1 :], - kwargs_schema=op_schema.kwargs_schema, - ) - return OutputSharding( - output_spec=output_dtensor_spec, - redistribute_schema=suggested_schema, - needs_redistribute=True, - ) - - return OutputSharding(output_spec=output_dtensor_spec) - - else: # TODO: optimize this. we shouldn't simply blindly replicate # unshardable dims ... # FIXME: this can be wrong for situations where we have # [Shard(0), Shard(0)] - suggested_placements = [ - p - if not isinstance(p, Shard) or shardable_dims[p.dim][mesh_dim] - else Replicate() - for mesh_dim, p in enumerate(input_dtensor_spec.placements) + input_tgt_spec = DTensorSpec( + placements=tuple(input_tgt_placements), + mesh=input_src_spec.mesh, + tensor_meta=input_src_spec.tensor_meta, + ) + redistribute_costs = [ + generate_redistribute_costs(input_strategy, input_tgt_spec) ] - return OutputSharding( - output_spec=None, - redistribute_schema=OpSchema( - op=op_schema.op, - args_schema=( - DTensorSpec( - placements=tuple(suggested_placements), - mesh=input_dtensor_spec.mesh, - tensor_meta=input_dtensor_spec.tensor_meta, - ), - ) - + op_schema.args_schema[1:], - kwargs_schema=op_schema.kwargs_schema, - ), + + output_spec = DTensorSpec(mesh=mesh, placements=tuple(output_placements)) + output_strategy.strategies.append( + PlacementStrategy( + output_specs=output_spec, + input_specs=(input_tgt_spec,), + redistribute_cost=redistribute_costs, + ) ) + return output_strategy -register_prop_rule_map(aten.squeeze.default, torch.squeeze) -register_prop_rule_map( + +register_op_strategy_map(aten.squeeze.default, torch.squeeze) +register_op_strategy_map( aten.squeeze.dim, torch.squeeze, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map(aten.view.default, Tensor.view, schema_info=RuntimeSchemaInfo(1)) -register_prop_rule_map( +register_op_strategy_map( + aten.view.default, Tensor.view, schema_info=RuntimeSchemaInfo(1) +) +register_op_strategy_map( aten.reshape.default, torch.reshape, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten._unsafe_view.default, Tensor.view, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten.unsqueeze.default, torch.unsqueeze, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten.expand.default, Tensor.expand, 
schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten.permute.default, torch.permute, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten.repeat.default, Tensor.repeat, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map( +register_op_strategy_map( aten.transpose.int, torch.transpose, schema_info=RuntimeSchemaInfo(1) ) -register_prop_rule_map(aten.view_as_complex.default, torch.view_as_complex) -register_prop_rule_map(aten.view_as_real.default, torch.view_as_real) +register_op_strategy_map(aten.view_as_complex.default, torch.view_as_complex) +register_op_strategy_map(aten.view_as_real.default, torch.view_as_real) diff --git a/torch/distributed/_tensor/sharding_prop.py b/torch/distributed/_tensor/sharding_prop.py index 9acf6aa0c919..d173a91a771c 100644 --- a/torch/distributed/_tensor/sharding_prop.py +++ b/torch/distributed/_tensor/sharding_prop.py @@ -45,15 +45,21 @@ def __init__(self) -> None: # op map to save static argnum to decide to reuse sharding prop cache or re-run sharding prop self.op_to_schema_info: Dict[OpOverload, RuntimeSchemaInfo] = {} self.propagate_op_sharding = lru_cache(None)(self.propagate_op_sharding_non_cached) # type: ignore[method-assign] - # op map to save indices of size (and stride) args which may need to be modified in sharding prop - self.op_to_size_and_stride_idx: Dict[ + # op map to save indices of shape (and stride) args which may need to be modified in sharding prop + self.op_to_shape_and_stride_idx: Dict[ OpOverload, Union[int, Tuple[int, int]] ] = { + # new factory ops aten.new_empty.default: 1, aten.new_full.default: 1, aten.new_ones.default: 1, aten.new_zeros.default: 1, aten.new_empty_strided.default: (1, 2), + # view ops + aten.expand.default: 1, + aten.reshape.default: 1, + aten.view.default: 1, + aten._unsafe_view.default: 1, } def register_sharding_prop_rule( @@ -260,16 +266,19 @@ def spec_to_strategy(spec: object) -> object: ) suggestion_schema._inplace_rewrap_schema_suggestion(op_schema) - # size and stride args need to be modified for new factory ops, potentially - if op_schema.op in self.op_to_size_and_stride_idx: + # shape and stride args need to be modified for + # view ops and new factory ops, potentially + if op_schema.op in self.op_to_shape_and_stride_idx: assert isinstance(output_strategy.output_spec, DTensorSpec) # It happens when the output has the same shape as the input # and the input placements are not all Replicate(). 
if output_strategy.output_spec.is_sharded(): - needs_redistribute = True - suggestion_schema = self._adjust_size_and_stride_args( - op_schema, output_strategy.output_spec, mesh + schema = suggestion_schema or op_schema + assert isinstance(out_tensor_meta, TensorMeta) + suggestion_schema = self._adjust_shape_and_stride_args( + out_tensor_meta, schema, output_strategy.output_spec, mesh ) + needs_redistribute = True # construct output spec for the op if op_schema.return_type_tuple_tensor_like(): @@ -442,29 +451,31 @@ def _select_strategy(self, strategy: OpStrategy) -> PlacementStrategy: # for eager execution, we just select the one with the minimal redistribute cost return strategy.strategies[strategy_costs.index(min(strategy_costs))] - def _adjust_size_and_stride_args( - self, op_schema: OpSchema, spec: DTensorSpec, mesh: DeviceMesh + def _adjust_shape_and_stride_args( + self, + out_tensor_meta: TensorMeta, + schema: OpSchema, + spec: DTensorSpec, + mesh: DeviceMesh, ) -> OpSchema: - size_stride_idx = self.op_to_size_and_stride_idx[op_schema.op] - if isinstance(size_stride_idx, tuple): - size_idx, stride_idx = size_stride_idx + shape_stride_idx = self.op_to_shape_and_stride_idx[schema.op] + if isinstance(shape_stride_idx, tuple): + shape_idx, stride_idx = shape_stride_idx else: - size_idx = size_stride_idx + shape_idx = shape_stride_idx stride_idx = None - expected_input_schema = list(op_schema.args_schema) - size = cast(list, expected_input_schema[size_idx]) - # # adjust size to be the same as that of the _local_tensor - # # of the DTensor input arg at index 0, which is inferred - expected_input_schema[size_idx] = compute_local_shape( - size, mesh, spec.placements + expected_input_schema = list(schema.args_schema) + # adjust shape to be the same as that of the _local_tensor + # of the DTensor input arg at index 0, which is inferred + expected_input_schema[shape_idx] = compute_local_shape( + out_tensor_meta.shape, mesh, spec.placements ) # adjust the stride arg for aten.new_empty_strided.default if stride_idx: - stride = cast(list, expected_input_schema[stride_idx]) expected_input_schema[stride_idx] = compute_local_stride( - stride, mesh, spec.placements + out_tensor_meta.stride, mesh, spec.placements ) - return OpSchema(op_schema.op, tuple(expected_input_schema), {}) + return OpSchema(schema.op, tuple(expected_input_schema), schema.kwargs_schema) diff --git a/torch/distributed/checkpoint/_fsspec_filesystem.py b/torch/distributed/checkpoint/_fsspec_filesystem.py index 98bb637dacd2..7fdd04dff311 100644 --- a/torch/distributed/checkpoint/_fsspec_filesystem.py +++ b/torch/distributed/checkpoint/_fsspec_filesystem.py @@ -60,12 +60,19 @@ def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool: try: url_to_fs(checkpoint_id) - except ValueError as e: + except ValueError: return False return True + def exists(self, path: Union[str, os.PathLike]) -> bool: + return self.fs.exists(path) + def rm_file(self, path: Union[str, os.PathLike]) -> None: + self.fs.rm(path) + + +# TODO: add the dcp.async_save mixin class FsspecWriter(FileSystemWriter): """ Basic implementation of StorageWriter using fsspec. @@ -87,6 +94,7 @@ def __init__( sync_files: bool = True, thread_count: int = 1, per_thread_copy_ahead: int = 10_000_000, + overwrite: bool = True, ) -> None: """ Initialize the writer pointing to `path`. @@ -97,11 +105,17 @@ sync_files : force files to be synced to permanent storage. Default to True. thread_count: Number of IO threads to use to write. Default to 1. 
per_thread_copy_ahead: How many bytes to copy from the GPU ahead of saving them. Default 10Mb. + overwrite: Whether to allow overwriting existing checkpoints. Defaults to True. N. B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure. """ super().__init__( - path, single_file_per_rank, sync_files, thread_count, per_thread_copy_ahead + path, + single_file_per_rank, + sync_files, + thread_count, + per_thread_copy_ahead, + overwrite=overwrite, ) self.fs = FileSystem() self.path = self.fs.init_path(path) diff --git a/torch/distributed/checkpoint/filesystem.py b/torch/distributed/checkpoint/filesystem.py index 605a2cbc1215..aa25d1fb5369 100644 --- a/torch/distributed/checkpoint/filesystem.py +++ b/torch/distributed/checkpoint/filesystem.py @@ -7,6 +7,7 @@ import queue import threading import uuid +import warnings from abc import ABC, abstractmethod from contextlib import contextmanager from dataclasses import dataclass @@ -56,7 +57,9 @@ from torch.distributed.checkpoint.utils import _create_file_view from torch.futures import Future -__all__ = ["FileSystemWriter", "FileSystemReader"] +__all__ = ["FileSystemWriter", "FileSystemReader", "FileSystem", "FileSystemBase"] + +_metadata_fn: str = ".metadata" @dataclass @@ -367,6 +370,14 @@ def mkdir(self, path: Union[str, os.PathLike]) -> None: def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool: ... + @abstractmethod + def exists(self, path: Union[str, os.PathLike]) -> bool: + ... + + @abstractmethod + def rm_file(self, path: Union[str, os.PathLike]) -> None: + ... + class FileSystem(FileSystemBase): @contextmanager @@ -408,6 +419,12 @@ def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool: return False + def exists(self, path: Union[str, os.PathLike]) -> bool: + return cast(Path, path).exists() + + def rm_file(self, path: Union[str, os.PathLike]) -> None: + cast(Path, path).unlink() + class _FileSystemWriter(StorageWriter): """ @@ -430,6 +447,7 @@ def __init__( sync_files: bool = True, thread_count: int = 1, per_thread_copy_ahead: int = 10_000_000, + overwrite: bool = True, *args: Any, **kwargs: Any, ) -> None: @@ -442,6 +460,7 @@ def __init__( sync_files : force files to be synced to permanent storage. Default to True. thread_count: Number of IO threads to use to write. Default to 1. per_thread_copy_ahead: How many bytes to copy from the GPU ahead of saving them. Default 10Mb. + overwrite: Whether to allow overwriting existing checkpoints. Defaults to True. N. B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure. """ @@ -453,6 +472,7 @@ def __init__( self.thread_count = thread_count self.per_thread_copy_ahead = per_thread_copy_ahead self.save_id = _generate_uuid() + self.overwrite = overwrite def reset(self, checkpoint_id: Union[str, os.PathLike, None] = None) -> None: if checkpoint_id: @@ -464,6 +484,16 @@ def set_up_storage_writer(self, is_coordinator: bool) -> None: def prepare_local_plan(self, plan: SavePlan) -> SavePlan: self.fs.mkdir(self.path) + if self.fs.exists(self.metadata_path): + if self.overwrite: + warnings.warn( + f"Detected an existing checkpoint in {self.metadata_path}, overwriting since {self.overwrite=}." + " Past version 2.5 of PyTorch, `overwrite` will default to False. Set this variable to True to" + " maintain this functionality or False to raise when an existing checkpoint is found." 
+ ) + else: + raise RuntimeError(f"Checkpoint already exists and {self.overwrite=}.") + return plan def prepare_global_plan(self, plans: List[SavePlan]) -> List[SavePlan]: @@ -550,8 +580,7 @@ def finish(self, metadata: Metadata, results: List[List[WriteResult]]) -> None: metadata.storage_meta = self.storage_meta() - tmp_path = cast(Path, self.fs.concat_path(self.path, ".metadata.tmp")) - meta_path = cast(Path, self.fs.concat_path(self.path, ".metadata")) + tmp_path = cast(Path, self.fs.concat_path(self.path, f"{_metadata_fn}.tmp")) with self.fs.create_stream(tmp_path, "wb") as metadata_file: pickle.dump(metadata, metadata_file) if self.sync_files: @@ -560,11 +589,19 @@ except AttributeError: os.sync() - self.fs.rename(tmp_path, meta_path) + # delete in case other checkpoints were present. + if self.fs.exists(self.metadata_path): + self.fs.rm_file(self.metadata_path) + + self.fs.rename(tmp_path, self.metadata_path) def storage_meta(self) -> Optional[StorageMeta]: return StorageMeta(checkpoint_id=self.checkpoint_id, save_id=self.save_id) + @property + def metadata_path(self) -> Union[str, os.PathLike]: + return cast(Path, self.fs.concat_path(self.path, _metadata_fn)) + @property def checkpoint_id(self) -> Union[str, os.PathLike]: """ @@ -689,6 +726,7 @@ def __init__( thread_count: int = 1, per_thread_copy_ahead: int = 10_000_000, cache_staged_state_dict: bool = False, + overwrite: bool = True, ) -> None: """ Initialize the writer pointing to `path`. @@ -702,6 +740,7 @@ def __init__( cache_staged_state_dict: Whether to cache the staged state_dict. This option decreases staging latency at the cost of increased memory usage. Additionally, if this parameter is set to True, it's the expectation that the stager is maintained and re-used for multiple dcp.async_save calls. Default to False. + overwrite: Whether to allow overwriting existing checkpoints. Defaults to True. N. B. If sync_files is disabled, there's no guarantee that the checkpoint will be consistent in the case of a failure. 
""" @@ -712,6 +751,7 @@ def __init__( thread_count=thread_count, per_thread_copy_ahead=per_thread_copy_ahead, cache_staged_state_dict=cache_staged_state_dict, + overwrite=overwrite, ) def stage(self, state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE: diff --git a/torch/distributed/checkpoint/logger.py b/torch/distributed/checkpoint/logger.py index 99030db8647e..c5670a33a023 100644 --- a/torch/distributed/checkpoint/logger.py +++ b/torch/distributed/checkpoint/logger.py @@ -29,7 +29,9 @@ def _msg_dict_from_dcp_method_args(*args, **kwargs) -> Dict[str, Any]: if not checkpoint_id and (serializer := storage_writer or storage_reader): checkpoint_id = getattr(serializer, "checkpoint_id", None) - msg_dict["checkpoint_id"] = str(checkpoint_id) + msg_dict["checkpoint_id"] = ( + str(checkpoint_id) if checkpoint_id is not None else checkpoint_id + ) return msg_dict diff --git a/torch/distributed/checkpoint/metadata.py b/torch/distributed/checkpoint/metadata.py index 2172ff6abc02..bbcfcbc01e17 100644 --- a/torch/distributed/checkpoint/metadata.py +++ b/torch/distributed/checkpoint/metadata.py @@ -13,6 +13,7 @@ "Metadata", "MetadataIndex", "TensorProperties", + "StorageMeta", ] diff --git a/torch/distributed/checkpoint/state_dict.py b/torch/distributed/checkpoint/state_dict.py index e3c213a37188..e7072d623012 100644 --- a/torch/distributed/checkpoint/state_dict.py +++ b/torch/distributed/checkpoint/state_dict.py @@ -51,6 +51,28 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.utils._pytree import tree_map_only +__all__ = [ + "FLAT_PARAM", + "PG", + "PG_PREFIX", + "STATE", + "STATE_PREFIX", + "PARAMS", + "FQNS_T", + "PrimitiveType", + "ValueType", + "DictValueType", + "ListDictValueType", + "OptimizerStateType", + "gc_context", + "StateDictOptions", + "get_model_state_dict", + "get_optimizer_state_dict", + "get_state_dict", + "set_model_state_dict", + "set_optimizer_state_dict", + "set_state_dict", +] FLAT_PARAM = "_flat_param" PG = "param_groups" @@ -113,7 +135,6 @@ class StateDictOptions: - ``strict``: the ``strict`` option when ``set_state_dict`` calls model.load_state_dict(). - The default value is False. - ``broadcast_from_rank0``: when the option is True, rank0 should receive a full state_dict and will broadcast the tensors in the state_dict/ diff --git a/torch/distributed/device_mesh.py b/torch/distributed/device_mesh.py index b199e82a1757..c0981a549c6b 100644 --- a/torch/distributed/device_mesh.py +++ b/torch/distributed/device_mesh.py @@ -89,7 +89,9 @@ def create_child_mesh( res_sub_mesh = sub_mesh res_sub_mesh._dim_group_infos = [device_mesh._dim_group_infos[mesh_dim]] # type: ignore[possibly-undefined] + res_sub_mesh._parent_mesh = device_mesh # Assign the current DeviceMesh as the parent of the child DeviceMesh. + # We need to update the mappings after the child mesh hash update. 
self.child_to_parent_mapping[res_sub_mesh] = device_mesh return res_sub_mesh @@ -207,13 +209,14 @@ def __init__( self.mesh = ( mesh.detach().to(dtype=torch.int) if isinstance(mesh, torch.Tensor) - else torch.tensor(mesh, dtype=torch.int) + else torch.tensor(mesh, device="cpu", dtype=torch.int) ) - self.mesh_dim_names = mesh_dim_names + self.mesh_dim_names = tuple(mesh_dim_names) if mesh_dim_names else None # private field to pre-generate DeviceMesh's hash self._flatten_mesh_list = tuple(self.mesh.flatten().tolist()) - self._hash = hash((self._flatten_mesh_list, self.mesh.shape, id(self))) + self._parent_mesh: Optional["DeviceMesh"] = None + self._thread_id = threading.get_ident() # Skip process group initialization if xla device or init backend is False # TODO(yeounoh) implement DeviceMesh backend and register XLA backend. @@ -334,17 +337,35 @@ def __repr__(self) -> str: return device_mesh_repr def __hash__(self): + # lazily compute hash + self._hash = getattr(self, "_hash", None) + if not self._hash: + self._hash = hash( + ( + self._flatten_mesh_list, + self.mesh.shape, + self.device_type, + self.mesh_dim_names, + self._parent_mesh, + self._thread_id, + ) + ) return self._hash def __eq__(self, other: object) -> bool: if not isinstance(other, DeviceMesh): return False - if id(self.mesh) == id(other.mesh): + if id(self) == id(other): return True - return ( - self.mesh.shape == other.mesh.shape - and self._flatten_mesh_list == other._flatten_mesh_list - ) + else: + return ( + self._flatten_mesh_list == other._flatten_mesh_list + and self.mesh.shape == other.mesh.shape + and self.device_type == other.device_type + and self.mesh_dim_names == other.mesh_dim_names + and self._parent_mesh == other._parent_mesh + and self._thread_id == other._thread_id + ) def __getitem__(self, mesh_dim_name: str) -> "DeviceMesh": """ @@ -430,21 +451,67 @@ def get_group( return dim_groups @staticmethod - def from_group(group: ProcessGroup, device_type: str) -> "DeviceMesh": + def from_group( + group: Union[ProcessGroup, List[ProcessGroup]], + device_type: str, + mesh: Optional[Union[torch.Tensor, "ArrayLike"]] = None, + *, + mesh_dim_names: Optional[Tuple[str, ...]] = None, + ) -> "DeviceMesh": """ Constructs a :class:`DeviceMesh` with ``device_type`` from an existing :class:`ProcessGroup`. - The constructed device mesh is assumed to be 1D. + The constructed device mesh has a number of dimensions equal to the + number of groups passed. If more than one group is passed, then the + ``mesh`` argument is required. 
""" - # Manually define `_dim_group_infos` instead of relying on the - # normal logic since we already have the PG - group_ranks = get_process_group_ranks(group) - mesh = DeviceMesh(device_type, group_ranks, _init_backend=False) - mesh._dim_group_infos = [ - (_get_group_tag(group), group_ranks, group.group_name) + if isinstance(group, ProcessGroup): + group_ranks = get_process_group_ranks(group) + if ( + isinstance(mesh, torch.Tensor) and mesh.tolist() != group_ranks + ) or (mesh is not None and mesh != group_ranks): + raise ValueError( + f"Invalid mesh {str(mesh)} for ProcessGroup with ranks {group_ranks}" + ) + mesh = torch.tensor(group_ranks, device="cpu", dtype=torch.int) + device_mesh = DeviceMesh( + device_type, + mesh, + mesh_dim_names=mesh_dim_names, + _init_backend=False, + ) + device_mesh._dim_group_infos = [ + (_get_group_tag(group), group_ranks, group.group_name) + ] + return device_mesh + groups = list(group) + if len(groups) == 0: + raise ValueError("Expects at least one ProcessGroup to be passed") + if mesh is None: + raise ValueError("Must pass mesh if passing multiple ProcessGroups") + mesh = ( + mesh.detach().to(dtype=torch.int, device="cpu") + if isinstance(mesh, torch.Tensor) + else torch.tensor(mesh, device="cpu", dtype=torch.int) + ) + if mesh.ndim != len(groups): + raise ValueError( + "Expects mesh with ndim equal to number of ProcessGroups but got " + f"mesh {mesh.tolist()} and {len(groups)} ProcessGroups" + ) + device_mesh = DeviceMesh( + device_type, mesh, mesh_dim_names=mesh_dim_names, _init_backend=False + ) + device_mesh._dim_group_infos = [ + ( + _get_group_tag(group), + get_process_group_ranks(group), + group.group_name, + ) + for group in groups ] - return mesh + return device_mesh def size(self, mesh_dim: Optional[int] = None) -> int: return self.mesh.numel() if mesh_dim is None else self.mesh.size(mesh_dim) diff --git a/torch/distributed/distributed_c10d.py b/torch/distributed/distributed_c10d.py index 2925e2fb36b2..70283cada928 100644 --- a/torch/distributed/distributed_c10d.py +++ b/torch/distributed/distributed_c10d.py @@ -66,7 +66,7 @@ 'ProcessGroup', 'ReduceOp', 'ReduceOptions', 'ReduceScatterOptions', 'ScatterOptions', 'Store', 'DebugLevel', 'get_debug_level', 'Work', 'default_pg_timeout', 'get_group_rank', 'get_global_rank', 'get_process_group_ranks', - 'reduce_op', 'all_gather_into_tensor', 'reduce_scatter_tensor', + 'reduce_op', 'all_gather_into_tensor', 'reduce_scatter_tensor', 'get_node_local_rank', ] _MPI_AVAILABLE = True @@ -408,6 +408,21 @@ def __new__(cls, op: Callable, tensor: torch.Tensor, peer: int, _check_single_tensor(tensor, "tensor") return object.__new__(cls) + def __repr__(self): + my_group_rank = get_rank(self.group) + peer_group_rank = get_group_rank(self.group, self.peer) if self.group else self.peer + op_name = self.op.__name__ + group_name = self.group.group_name if self.group else "default_pg" + if "send" in op_name: + s = my_group_rank + d = peer_group_rank + elif "recv" in op_name: + s = peer_group_rank + d = my_group_rank + else: + return super().__repr__() + + return f"P2POp({op_name} pg={group_name}, s={s}, d={d}, {self.tensor.shape}, {self.tensor.dtype})" class _CollOp: """ @@ -737,7 +752,7 @@ def _store_based_barrier(rank, store, group_name, rendezvous_count, timeout, log ) if timedelta(seconds=(time.time() - start)) > timeout: - raise DistStoreError( # noqa: TRY200 + raise DistStoreError( # noqa: B904 "Timed out initializing process group in store based barrier on " f"rank {rank}, for key: {store_key} 
(world_size={world_size}, " f"num_workers_joined={worker_count}, timeout={timeout} error={e})" diff --git a/torch/distributed/elastic/agent/server/health_check_server.py b/torch/distributed/elastic/agent/server/health_check_server.py index 0c2dea63a221..001607305515 100644 --- a/torch/distributed/elastic/agent/server/health_check_server.py +++ b/torch/distributed/elastic/agent/server/health_check_server.py @@ -12,6 +12,8 @@ log = get_logger(__name__) +__all__ = ["HealthCheckServer", "create_healthcheck_server"] + class HealthCheckServer: """ diff --git a/torch/distributed/elastic/multiprocessing/api.py b/torch/distributed/elastic/multiprocessing/api.py index 72c3955e7d1e..eb0b110f25ee 100644 --- a/torch/distributed/elastic/multiprocessing/api.py +++ b/torch/distributed/elastic/multiprocessing/api.py @@ -50,6 +50,8 @@ "get_std_cm", "MultiprocessContext", "SubprocessContext", + "LogsDest", + "LogsSpecs", ] class SignalException(Exception): @@ -244,7 +246,7 @@ def __init__( if not log_dir: log_dir = tempfile.mkdtemp(prefix="torchelastic_") elif not os.path.exists(log_dir): - os.makedirs(log_dir) + os.makedirs(log_dir, exist_ok=True) else: if os.path.isfile(log_dir): raise NotADirectoryError(f"log_dir: {log_dir} is a file") @@ -670,9 +672,13 @@ def _poll(self) -> Optional[RunProcsResult]: if self._is_done(): # we should ALWAYS have ALL the return values when all the processes are done self._worker_finished_event.set() - # Wait untill all processes are finished. At this point workers finished executing - # user function - self._pc.join() + + # At this point the workers have finished running the user function, + # but the child processes might not have exited yet. Wait for them. + # pc.join() blocks [forever] until "a" proc exits. Loop until all of them exit. + while not self._pc.join(): + logger.debug("entrypoint fn finished, waiting for all child procs to exit...") + _validate_full_rank( self._return_values, self.nprocs, "return_value queue" ) diff --git a/torch/distributed/elastic/timer/debug_info_logging.py b/torch/distributed/elastic/timer/debug_info_logging.py index 87af84e281dc..2ac2dc5318be 100644 --- a/torch/distributed/elastic/timer/debug_info_logging.py +++ b/torch/distributed/elastic/timer/debug_info_logging.py @@ -12,6 +12,8 @@ logger = get_logger(__name__) +__all__ = ["log_debug_info_for_expired_timers"] + def log_debug_info_for_expired_timers( run_id: str, diff --git a/torch/distributed/elastic/utils/distributed.py b/torch/distributed/elastic/utils/distributed.py index bf4a537bbf0e..1dc4680abc16 100644 --- a/torch/distributed/elastic/utils/distributed.py +++ b/torch/distributed/elastic/utils/distributed.py @@ -15,6 +15,7 @@ from torch.distributed.elastic.utils.logging import get_logger from torch.distributed.elastic.utils.store import barrier +__all__ = ["create_c10d_store", "get_free_port", "get_socket_with_port"] logger = get_logger(__name__) diff --git a/torch/distributed/elastic/utils/store.py b/torch/distributed/elastic/utils/store.py index 719c83b8265d..080e92eae91e 100644 --- a/torch/distributed/elastic/utils/store.py +++ b/torch/distributed/elastic/utils/store.py @@ -13,6 +13,8 @@ _NUM_MEMBERS = "/num_members" _LAST_MEMBER_CHECKIN = "/last_member" +__all__ = ["store_timeout", "get_all", "synchronize", "barrier"] + @contextmanager def store_timeout(store, timeout: float): """ diff --git a/torch/distributed/fsdp/_common_utils.py b/torch/distributed/fsdp/_common_utils.py index 7d9394ef1fbd..c1d77bf410b5 100644 --- a/torch/distributed/fsdp/_common_utils.py +++ 
b/torch/distributed/fsdp/_common_utils.py @@ -421,29 +421,14 @@ def f(module: torch.nn.Module, prefix: str, tree_level: int, *args, **kwargs): # ``named_children`` + `named_parameter(recurse=False)``. # This hack is a must to make the traversal work. # TODO: Remove this hack once DMP + FSDP is not supported. + # It turns out that recursive wrapping may trigger this as + # well. if ( submodule_name == "_fsdp_wrapped_module" or submodule_name == "_dmp_wrapped_module" ): - if ( - not torch.distributed._functional_collectives.is_torchdynamo_compiling() - ): - # TODO(voz): Don't graph break on this - warnings.warn( - "An unexpected prefix is detected. This case " - " should only happen when using DMP with FSDP. " - f"prefix = {prefix}, " - f"submodule_name = {submodule_name}" - ) new_prefix = prefix elif submodule_name == "module": - warnings.warn( - "An unexpected prefix is detected. This case " - " should only happen when DDP wraps the outer " - " modules while FSDP wraps the inner ones." - f"prefix = {prefix}, " - f"submodule_name = {submodule_name}" - ) new_prefix = prefix f(submodule, new_prefix, new_tree_level, *args, **kwargs) diff --git a/torch/distributed/fsdp/_debug_utils.py b/torch/distributed/fsdp/_debug_utils.py index 4ed76476e56b..a41a817724e5 100644 --- a/torch/distributed/fsdp/_debug_utils.py +++ b/torch/distributed/fsdp/_debug_utils.py @@ -57,7 +57,7 @@ def dump_and_reset(cls, msg: str) -> None: # This cannot be combined with DETAIL distributed log # as the profiling will be very incorrect. if dist.get_rank() == 0 and dist.get_debug_level() == dist.DebugLevel.INFO: - logger.warning("%s %s", msg, cls.results) + logger.info("%s %s", msg, cls.results) cls.reset() diff --git a/torch/distributed/fsdp/_init_utils.py b/torch/distributed/fsdp/_init_utils.py index fe39a5dc111e..d083a0702bd4 100644 --- a/torch/distributed/fsdp/_init_utils.py +++ b/torch/distributed/fsdp/_init_utils.py @@ -216,11 +216,11 @@ def _init_intra_node_process_group(num_devices_per_node: int) -> dist.ProcessGro Return a process group across the current node. For example, given each row is a distinct node: - 0 1 2 3 4 5 6 7 8 - 9 10 11 12 13 14 15 + 0 1 2 3 4 5 6 7 + 8 9 10 11 12 13 14 15 This API would return an intra-node subgroup across - [0, 7] or [8, 15] depending on the process's rank. - For example, rank 3 would get [0, 7]. + [0, 1, ..., 7] or [8, 9, ..., 15] depending on the process's rank. + For example, rank 3 would get [0, 1, ..., 7]. """ intra_node_subgroup, _ = dist.new_subgroups(num_devices_per_node) return intra_node_subgroup @@ -235,11 +235,11 @@ def _init_inter_node_process_group( Return an inter-node process group where each contained rank has the same local rank. For example, given each row is a distinct node: - 0 1 2 3 4 5 6 7 8 - 9 10 11 12 13 14 15 - This API would return inter-node process group {0, 8}, {1, 9}, {2, 10}, and so forth - depending on the process's rank. For example, rank 1 would get {1, 9}, rank 5 - would get {5, 13}. + 0 1 2 3 4 5 6 7 + 8 9 10 11 12 13 14 15 + This API would return inter-node process group [0, 8], [1, 9], [2, 10], and so forth + depending on the process's rank. For example, rank 1 would get [1, 9], rank 5 + would get [5, 13]. 
""" # the inter-node pg that is returned inter_node_pg = None diff --git a/torch/distributed/fsdp/_optim_utils.py b/torch/distributed/fsdp/_optim_utils.py index 163cde70b3f9..b066f930ebaf 100644 --- a/torch/distributed/fsdp/_optim_utils.py +++ b/torch/distributed/fsdp/_optim_utils.py @@ -1511,7 +1511,7 @@ def _allgather_orig_param_states( """ fsdp_state = fsdp_param_info.state if fsdp_state.rank == 0 and dist.get_debug_level() == dist.DebugLevel.DETAIL: - logger.warning( + logger.info( "Memory Summary before calling to _allgather_orig_param_states %s", fsdp_state._device_handle.memory_summary(), ) diff --git a/torch/distributed/optim/__init__.py b/torch/distributed/optim/__init__.py index 969699c2a5ac..0b576c65afea 100644 --- a/torch/distributed/optim/__init__.py +++ b/torch/distributed/optim/__init__.py @@ -35,3 +35,5 @@ from .post_localSGD_optimizer import PostLocalSGDOptimizer from .zero_redundancy_optimizer import ZeroRedundancyOptimizer + +__all__ = ["as_functional_optim", "DistributedOptimizer", "PostLocalSGDOptimizer", "ZeroRedundancyOptimizer"] diff --git a/torch/distributed/optim/functional_adagrad.py b/torch/distributed/optim/functional_adagrad.py index 96e075c8216c..dfd50db17591 100644 --- a/torch/distributed/optim/functional_adagrad.py +++ b/torch/distributed/optim/functional_adagrad.py @@ -30,6 +30,7 @@ def __init__( eps: float = 1e-10, coalesce_grad: bool = True, foreach: bool = False, + fused: bool = False, maximize: bool = False, _allow_empty_param_list: bool = False, ): @@ -44,6 +45,7 @@ def __init__( } self.coalesce_grad = coalesce_grad self.foreach = foreach + self.fused = fused self.maximize = maximize self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {}) @@ -101,4 +103,7 @@ def step(self, gradients: List[Optional[Tensor]]): foreach=self.foreach, maximize=self.maximize, has_complex=has_complex, + fused=self.fused, + grad_scale=None, + found_inf=None, ) diff --git a/torch/distributed/pipelining/PipelineSchedule.py b/torch/distributed/pipelining/PipelineSchedule.py index bf4116688bff..c8a256299bd5 100644 --- a/torch/distributed/pipelining/PipelineSchedule.py +++ b/torch/distributed/pipelining/PipelineSchedule.py @@ -411,34 +411,71 @@ def _step_microbatches( fwd_sends_to_wait: List[dist.Work] = [] bwd_sends_to_wait: List[dist.Work] = [] + def is_forward_step(i): + assert i >= 0, i + return i < self._n_microbatches + + def is_backward_step(i): + assert i < total_steps, i + return i >= warmup_steps and self._has_backward + + def is_1f1b_step(i): + return is_forward_step(i) and is_backward_step(i) + + def is_warmup_step(i): + return is_forward_step(i) and not is_backward_step(i) + + def is_cooldown_step(i): + return not is_forward_step(i) and is_backward_step(i) + + def should_coalesce_fwd_send_bwd_recv(fwd_send_i): + return ( + is_1f1b_step(fwd_send_i) + or (is_warmup_step(fwd_send_i) and is_cooldown_step(fwd_send_i + 1)) + or ( + fwd_send_i >= 1 + and is_warmup_step(fwd_send_i - 1) + and is_cooldown_step(fwd_send_i) + ) + ) + + def should_coalesce_bwd_send_fwd_recv(bwd_send_i): + # The backward send to prev stage should be coalesced with the fwd recv from the previous stage + return bwd_send_i >= warmup_steps and is_1f1b_step(bwd_send_i + 1) + # bwd chunk counter bwd_mb_index = 0 self._stage._configure_data_parallel_mode(last_backward=False) for i in range(total_steps): - if i < self._n_microbatches: - # forward + if is_forward_step(i): with record_function(f"Forward {i}"): ops = self._stage.get_fwd_recv_ops() + if 
should_coalesce_bwd_send_fwd_recv(i - 1): + ops.extend(self._stage.get_bwd_send_ops()) + works = sorted_batch_isend_irecv(ops) for work in works.values(): work.wait() output = self._stage.forward_one_chunk(arg_mbs[i], kwarg_mbs[i]) # type: ignore[index] - ops = self._stage.get_fwd_send_ops() - works = sorted_batch_isend_irecv(ops) - fwd_sends_to_wait.extend(works.values()) + if not should_coalesce_fwd_send_bwd_recv(i): + ops = self._stage.get_fwd_send_ops() + works = sorted_batch_isend_irecv(ops) + fwd_sends_to_wait.extend(works.values()) self._maybe_compute_loss(self._stage, output, target_mbs, i) - if i >= warmup_steps and self._has_backward: + if is_backward_step(i): self._stage._configure_data_parallel_mode( last_backward=(i == total_steps - 1) ) - - # backward with record_function(f"Backward {bwd_mb_index}"): ops = self._stage.get_bwd_recv_ops() + + if should_coalesce_fwd_send_bwd_recv(i): + ops.extend(self._stage.get_fwd_send_ops()) + works = sorted_batch_isend_irecv(ops) for work in works.values(): work.wait() @@ -446,9 +483,12 @@ def _step_microbatches( loss = self._maybe_get_loss(self._stage, bwd_mb_index) self._stage.backward_one_chunk(loss=loss) - ops = self._stage.get_bwd_send_ops() - works = sorted_batch_isend_irecv(ops) - bwd_sends_to_wait.extend(works.values()) + if not should_coalesce_bwd_send_fwd_recv(i): + # see Note: coalesced bwd-send/fwd-recv + ops = self._stage.get_bwd_send_ops() + works = sorted_batch_isend_irecv(ops) + bwd_sends_to_wait.extend(works.values()) + bwd_mb_index += 1 # Wait for all forward sends to finish diff --git a/torch/distributed/pipelining/_IR.py b/torch/distributed/pipelining/_IR.py index 799ec6d5e0d1..204a60a34022 100644 --- a/torch/distributed/pipelining/_IR.py +++ b/torch/distributed/pipelining/_IR.py @@ -11,12 +11,13 @@ import torch import torch.fx as fx from torch.export import ExportedProgram +from torch.export.unflatten import _assign_attr, _AttrKind, _sink_params from torch.fx.node import map_aggregate from torch.fx.passes.split_module import split_module from ._backward import _null_coalesce_accumulate, stage_backward from ._debug import PIPPY_VERBOSITY -from ._unflatten import _assign_attr, _AttrKind, _outline_submodules, _sink_params +from ._unflatten import _outline_submodules from ._utils import QualnameMapMixin from .microbatch import split_args_kwargs_into_chunks, TensorChunkSpec @@ -303,7 +304,7 @@ def _pipe_split(): return None -@torch.library.impl_abstract("pippy::_pipe_split") # type: ignore[no-redef] +@torch.library.register_fake("pippy::_pipe_split") # type: ignore[no-redef] def _pipe_split(): # noqa: F811 return None @@ -869,8 +870,8 @@ def move_param_to_callee( # After moving the params to their corresponding hierarchies, we also # need to move the `get_attr` nodes from the root of the graph to those # hierarchies. 
- inputs_to_state: Dict[str, str] = { - attr.name: attr.target for attr in attr_nodes + inputs_to_state: Dict[str, List[str]] = { + attr.name: [attr.target] for attr in attr_nodes } # This is done by (1) `_sind_params` at each submodule; for name, submod in split.named_children(): @@ -1281,7 +1282,7 @@ def annotate_split_points(mod: torch.nn.Module, spec: Dict[str, SplitPoint]): except AttributeError as e: raise AttributeError( f'Specified target {qualname} referenced nonexistent module {".".join(atoms[:i+1])}' - ) + ) from e mod_to_wrap = getattr(predecessor_module, atoms[-1]) mod_to_wrap._orig_forward = mod_to_wrap.forward diff --git a/torch/distributed/pipelining/_PipelineStage.py b/torch/distributed/pipelining/_PipelineStage.py index b30d99366caf..db0340677b17 100644 --- a/torch/distributed/pipelining/_PipelineStage.py +++ b/torch/distributed/pipelining/_PipelineStage.py @@ -7,6 +7,7 @@ import torch import torch.distributed as dist import torch.fx as fx +import torch.nn as nn from torch._subclasses.fake_tensor import FakeTensor from torch.distributed._composable.fsdp.fully_shard import FSDPModule from torch.fx.node import map_aggregate @@ -55,11 +56,11 @@ def __repr__(self): def _make_tensor_from_meta( - example: FakeTensor, + example: Union[torch.Tensor, FakeTensor], device: torch.device, ) -> torch.Tensor: """ - Create a real tensor from a fake tensor. + Create a real tensor from a tensor. """ return torch.empty( example.size(), @@ -142,7 +143,7 @@ def __init__( self.log_prefix = f"[Stage {self.stage_index}]" # Forward infra - self.args_recv_info: Dict[int, Tuple[InputInfo]] = {} + self.args_recv_info: Dict[int, Tuple[InputInfo, ...]] = {} self.set_requires_grad: Dict[int, bool] = {} self.act_send_info: Dict[int, List] = {} @@ -211,7 +212,7 @@ def _create_grad_recv_info( def _get_recv_ops( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ) -> List[dist.P2POp]: """ Helper function shared by `get_fwd_recv_ops` and `get_bwd_recv_ops`. @@ -239,7 +240,7 @@ def get_fwd_recv_ops(self) -> List[dist.P2POp]: Returns a list of ops that are needed to receive the input arguments for this stage. """ - recv_infos: Tuple[InputInfo] = self.args_recv_info[self.fwd_chunk_id] + recv_infos: Tuple[InputInfo, ...] = self.args_recv_info[self.fwd_chunk_id] # In case there is backward pass, set requires_grad for receive buffers # before first forward @@ -360,7 +361,7 @@ def clear_runtime_states(self) -> None: def _map_tensor_from_recv_info( self, - recv_infos: Tuple[InputInfo], + recv_infos: Tuple[InputInfo, ...], ): """ Map tensors from recv infos to a list. @@ -819,3 +820,399 @@ def __init__( # Get my pipe info pipe_info = pipe.info() super().__init__(stage_module, stage_index, pipe_info, device, group) + + +# Manual PipelineStage functions and definition + +METADATA_TENSOR_LEN = 100 +PLACEHOLDER_VAL = -1 + + +def create_empty_tensors( + tensor: Union[torch.Tensor, List[torch.Tensor]], device: torch.device +) -> List[torch.Tensor]: + """ + Creates a list of empty tensors with the same properties (like shape and dtype) as the input tensor(s), + and places them on the specified device. + Args: + tensor (Union[torch.Tensor, List[torch.tensor]]): The input tensor(s). + device (torch.device): The device where the new tensors will be placed. + Returns: + List[torch.Tensor]: A list of empty tensors with the same properties as the input tensor(s). 
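+ Example (illustrative): ``create_empty_tensors(torch.randn(2, 3), torch.device("cpu"))`` returns a one-element list holding an uninitialized 2x3 tensor on the CPU.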
+ """ + if isinstance(tensor, torch.Tensor): + return [torch.empty_like(tensor, device=device)] + elif isinstance(tensor, (list, tuple)): + return [torch.empty_like(t, device=device) for t in tensor] + raise TypeError(f"Unsupported type {type(tensor)} cannot create empty tensors") + + +def create_metadata_tensor( + tensors: Optional[List[torch.Tensor]] = None, + device: Optional[torch.device] = torch.device("cpu"), +) -> torch.Tensor: + """ + Create a metadata tensor that can be sent over the wire. + This tensor contains the number of dimensions and the shape of each tensor being sent. + + The data is of format [num_dims, dim1, dim2, ...]. + If the tensor is None, a tensor of only placeholder values will be returned. + + Inputs: + tensors: A list of tensors, the tensors will converted into its shape dimensions and + these dimensions will be concatenated. + device: The device where the metadata tensor will be created. + If the tensor is None, then this tensor will contain PLACEHOLDER_VALs. + + """ + metadata_tensor = torch.full( + (METADATA_TENSOR_LEN,), + PLACEHOLDER_VAL, + dtype=torch.int32, + device=device, + ) + if tensors: + # Create a list of tensors containing the number of dimensions and the shape of each tensor + data = [ + # data is of format [num_dims, dim1, dim2, ...] + torch.tensor( + [len(tensor.shape)] + list(tensor.shape), + dtype=torch.int32, + device=device, + ) + for tensor in tensors + ] + # Concatenate the data into a single tensor + data_tensor = torch.cat(data) + dt_shape = data_tensor.shape[0] + if dt_shape > METADATA_TENSOR_LEN: + raise ValueError( + f"Metadata tensor size ({dt_shape}) exceeds maximum allowed length ({METADATA_TENSOR_LEN})." + ) + metadata_tensor[:dt_shape] = data_tensor + return metadata_tensor + + +def extract_metadata_from_tensor(tensor: torch.Tensor) -> List[torch.Size]: + """ + Extract the number of dimensions and the shape of each tensor from a metadata tensor. + """ + metadata: List[torch.Size] = [] + i = 0 + while i < len(tensor) and tensor[i] != PLACEHOLDER_VAL: + num_dims = int(tensor[i].item()) + shape = torch.Size(tensor[i + 1 : i + 1 + num_dims].tolist()) + metadata.append(shape) + i += num_dims + 1 + return metadata + + +def get_stage_shapes( + stage_modules: List[nn.Module], + stage_ids: List[int], + num_stages: int, + rank: int, + world_size: int, + device: torch.device, + microbatch: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, +): + """ + Performs a dry run through all the pipeline stages (a rank can have multiple pipeline stages in the case of + virtual pipelining) and returns the shape of the inputs and outputs of the module. + Only the first stage must pass in a microbatch. + + Each rank must call get_stage_shapes or the program will hang. + + Args: + stage_modules: The chunks assigned to this rank. Rhe length should be 1 for any + non-interleaved schedules and >1 for any interleaved schedules. + stage_ids: The id of the stages assigned to this rank. + num_stages: Total number of stages. + rank: Rank of the current process. + world_size: Number of processes participating in the pipeline. + device: Device where the tensors are allocated. 
+ + Returns a dictionary containing the following keys: + "inputs": Shape of the inputs to the module + "outputs": Shape of the outputs of the module + """ + + stage_id_to_shapes: Dict[int, Dict[str, list[torch.Size]]] = {} + for stage_id, model in zip(stage_ids, stage_modules): + input_shape_metadata_tensor = create_metadata_tensor(device=device) + # TODO: Assumes prev_stage == rank - 1 and next_stage == rank + 1 + prev_rank = (rank - 1) % world_size + next_rank = (rank + 1) % world_size + shapes = {} + + # first stage doesn't receive anything and uses a microbatch + if stage_id == 0: + if microbatch is None: + raise RuntimeError("Microbatch is required for first stage") + example_fwd_inputs = microbatch + if isinstance(example_fwd_inputs, torch.Tensor): + example_fwd_inputs = [example_fwd_inputs] + else: + # other stages must receive shape information + # TODO: send/recv should take a group, rather than use the default group + dist.recv(input_shape_metadata_tensor, prev_rank) + metadata = extract_metadata_from_tensor(input_shape_metadata_tensor) + example_fwd_inputs = [ + torch.empty(shape_list, device=device) for shape_list in metadata + ] + shapes["inputs"] = [fwd_input.shape for fwd_input in example_fwd_inputs] + + # perform forward + # TODO: if forward fails raise a more descriptive error explaining which stage failed + fwd_outputs = model(*example_fwd_inputs) + fwd_outputs = create_empty_tensors(fwd_outputs, device) + shapes["outputs"] = [fwd_output.shape for fwd_output in fwd_outputs] + + # send shape dims + if stage_id != num_stages - 1: + output_shape_metadata_tensor = create_metadata_tensor( + fwd_outputs, device=device + ) + dist.send(output_shape_metadata_tensor, next_rank) + stage_id_to_shapes[stage_id] = shapes + logger.info(stage_id_to_shapes) + return stage_id_to_shapes + + + class ManualPipelineStage(PipelineStageBase): + """ + A class representing a pipeline stage in a pipeline parallelism setup. + This class is created manually by providing an example input (and optionally output) + as opposed to the PipelineStage class that is output from pipeline(). + This class extends the `PipelineStageBase` class and can similarly be used + in `PipelineSchedule`. + Args: + submodule (nn.Module): The PyTorch module wrapped by this stage. + stage_index (int): The ID of this stage. + num_stages (int): The total number of stages. + device (torch.device): The device where this stage is located. + num_microbatches (int): The number of microbatches to use. + input_args (Union[torch.Tensor, List[torch.Tensor]], optional): The input arguments for the submodule. + output_args (Union[torch.Tensor, List[torch.Tensor]], optional): The output arguments for the submodule. + group (dist.ProcessGroup, optional): The process group for distributed training. If None, default group.
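+ + Example (illustrative; assumes ``rank``, ``world_size``, ``device`` and an ``example_microbatch`` tensor are already defined, with one stage per rank): + >>> stage = ManualPipelineStage( + ... submodule, stage_index=rank, num_stages=world_size, device=device, + ... num_microbatches=4, input_args=example_microbatch, + ... )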
+ """ + + def __init__( + self, + submodule: nn.Module, + stage_index: int, + num_stages: int, + device: torch.device, + num_microbatches: int, + input_args: Union[torch.Tensor, List[torch.Tensor]], + output_args: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None, + group: Optional[dist.ProcessGroup] = None, + ): + super().__init__( + submodule, stage_index, num_stages, device, num_microbatches, group + ) + self.submod.to(self.device) + # When we materialize the model partition on cuda, we call reset_parameters() if it is available + # logger.info(f"input args {input_args=}") + self.inputs: List[torch.Tensor] = [] + self.outputs: List[torch.Tensor] = [] + + self.inputs = create_empty_tensors(input_args, device) + + if output_args is None: + logger.info("output_args not provided, performing forward using input_args") + self.outputs = self.submod(*self.inputs) + # create buffers for the output so that the data is in the correct + # shape in order to use in p2p op (send) + self.outputs = create_empty_tensors(self.outputs, device) + else: + self.outputs = create_empty_tensors(output_args, device) + + # these are the buffers used in backwards send/recv, they are allocated later + self.outputs_grad: List[torch.Tensor] = [] + + def stage_global_rank(peer_rank): + return ( + peer_rank + if self.group is None + else dist.get_global_rank(self.group, peer_rank) + ) + + self.prev_stage = stage_global_rank((self.group_rank - 1) % self.group_size) + self.next_stage = stage_global_rank((self.group_rank + 1) % self.group_size) + + # Receive info during forward + # TODO: create args_recv_info lazily? (same needed for PipelineStage) + for chunk_id in range(self.chunks): + self.set_requires_grad[chunk_id] = False + if not self.is_first: + # We assume that we always receive from stage - 1 + recv_infos = tuple( + [ + RecvInfo( + f"recv_for_{self.stage_index}_from_{self.stage_index - 1}", + self.stage_index - 1, + _make_tensor_from_meta(inp, self.device), + ) + for inp in self.inputs + ] + ) + + self.args_recv_info[chunk_id] = recv_infos + else: + self.args_recv_info[chunk_id] = tuple( + [RootArgPlaceholder() for _ in self.inputs] + ) + + # Send info during forward for each activation + # only need the rank that is being sent to + self.act_send_info: Dict[int, List] = {} + for idx in range(len(self.outputs)): + # We assume we always send to stage + 1 + if not self.is_last: + self.act_send_info[idx] = [self.stage_index + 1] + else: + self.act_send_info[idx] = [] + + logger.debug( + f"finished pipeline stage init, {self.stage_index=}, {self.is_first=}, " # noqa: G004 + f"{self.is_last=}, {self.num_stages=}, " + f"inputs: {[inp.shape for inp in self.inputs]}, " + f"output: {[output.shape for output in self.outputs]}" + ) + + def _create_grad_recv_info( + self, + act_send_info: Dict, + ) -> Tuple[RecvInfo, ...]: + grad_recv_info: Tuple[RecvInfo, ...] = () + if not self.is_last: + # Receiving gradients from multiple sources is not supported + # hence we only take the first destination + grad_recv_info = tuple( + [ + RecvInfo( + f"recv_grad_for_{self.stage_index}_from_{dst_list[0]}", + dst_list[0], + _make_tensor_from_meta(self.outputs[idx], self.device), + ) + for idx, dst_list in act_send_info.items() + ] + ) + return grad_recv_info + + def init_p2p_neighbors(self): + """ + Set up p2p communitors between previous and next stages + by sending a dummy tensor. + + If this is used, must be called for all pipeline stages. 
+ """ + ops = [] + recv_tensor = torch.zeros(1, device="cuda") + send_tensor = torch.ones(1, device="cuda") + # forward + if not self.is_first: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.isend, send_tensor, self.next_stage, self.group)) + + # backward + if not self.is_first: + ops.append(dist.P2POp(dist.isend, send_tensor, self.prev_stage, self.group)) + if not self.is_last: + ops.append(dist.P2POp(dist.irecv, recv_tensor, self.next_stage, self.group)) + + return True + + +def validate_stage_shapes(pipeline_stages: List[ManualPipelineStage]): + """ + Check that the buffer shapes match between stages was expected by performing an all_gather between + all stages. + """ + if len(pipeline_stages) == 0: + raise ValueError("No pipeline stages provided.") + + virtual_pipeline_size = len(pipeline_stages) + all_inputs = [] + all_outputs = [] + world_size = pipeline_stages[0].group_size + num_stages = pipeline_stages[0].num_stages + + # perform all gathers between all stages + for virtual_id, stage in enumerate(pipeline_stages): + world_size = stage.group_size + stage_id: int = stage.stage_index + rank = stage.group_rank + # check that world_size and num_stages are consistent across all stages + if stage.group_size != world_size: + raise ValueError( + f"Stage id {stage_id} has world size ({stage.group_size}) \ + which does not match world size ({world_size}) of other stages." + ) + if stage.num_stages != num_stages: + raise ValueError( + f"Stage id {stage_id} has num stages ({stage.num_stages}) \ + which does not match num stages ({num_stages}) of other stages." + ) + + pg_rank = dist.get_rank(stage.group) + if rank != pg_rank: + raise ValueError( + f"Rank {rank} is not equal to process group rank {pg_rank}" + ) + + if (num_stages := stage.num_stages) % world_size != 0: + raise ValueError( + f"Number of stages ({num_stages}) must be a multiple of the world_size ({world_size})" + ) + + # all gather each ranks inputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_inputs = stage.inputs + stage_input = create_metadata_tensor(expected_inputs, device=stage.device) + dist.all_gather(tensor_list, stage_input) + stage_input_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + # all gather each ranks outputs + tensor_list = [ + create_metadata_tensor(device=stage.device) for _ in range(stage.group_size) + ] + expected_outputs = stage.outputs + stage_output = create_metadata_tensor(expected_outputs, device=stage.device) + dist.all_gather(tensor_list, stage_output) + stage_output_shapes = [ + extract_metadata_from_tensor(tensor) for tensor in tensor_list + ] + + logger.debug( + f"Rank: {pg_rank}" # noqa: G004 + f"Stage id: {stage_id}" + f"Stage num stages: {stage.num_stages}" + f"Stage rank: {rank}" + f"Stage world size: {world_size}" + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} input shapes: {stage_input_shapes}" # noqa: G003 + f"Stage {virtual_id * world_size}-{(virtual_id + 1) * world_size - 1} output shapes: {stage_output_shapes}" # noqa: G003 + ) + + all_inputs.extend(stage_input_shapes) + all_outputs.extend(stage_output_shapes) + + # log only rank 0's view, they will all be equivalent + if pg_rank == 0: + logger.info( + f"all stage inputs: {all_inputs}" # noqa: G004 + f"all stage outputs: {all_outputs}" + ) + + # Check if the output for stage 0 matches the input at stage 1, and so forth + for i in 
range(virtual_pipeline_size * world_size - 1): + if (out := all_outputs[i]) != (inp := all_inputs[i + 1]): + raise ValueError( + f"Stage_id {i} output shape {out} does not match stage_id {i + 1} input shape {inp}." + ) diff --git a/torch/distributed/pipelining/_unflatten.py b/torch/distributed/pipelining/_unflatten.py index 684fcfbc1d6d..27241d17874c 100644 --- a/torch/distributed/pipelining/_unflatten.py +++ b/torch/distributed/pipelining/_unflatten.py @@ -1,453 +1,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates -# This file is a copy of private utilities in pytorch/torch/export/unflatten.py -# pylint: skip-file - -import copy -import operator -from enum import Enum -from typing import cast, Dict, List, Optional, Union + from typing import Dict import torch -import torch.fx._pytree as fx_pytree -import torch.utils._pytree as pytree -from torch.export.exported_program import ( - ConstantArgument, - ModuleCallSignature, - SymIntArgument, - TensorArgument, -) -from torch.export.unflatten import InterpreterModule - - -class _AttrKind(Enum): - PARAMETER = "parameter" - BUFFER = "buffer" - CONSTANT = "constant" - - -# Assign attribute 'from_obj' to the qualified name 'target' on 'to_module -# This installs empty Modules where none exist yet if they are subpaths of target -def _assign_attr( - from_obj: Union[torch.Tensor, torch.ScriptObject], - to_module: torch.nn.Module, - target: str, - attr_kind: _AttrKind, - persistent: bool = True, -): - *prefix, field = target.split(".") - for item in prefix: - t = getattr(to_module, item, None) - - if t is None: - t = torch.nn.Module() - setattr(to_module, item, t) - to_module = t - - if attr_kind == _AttrKind.PARAMETER: - assert isinstance(from_obj, torch.nn.Parameter) - to_module.register_parameter(field, from_obj) - elif attr_kind == _AttrKind.BUFFER: - assert isinstance(from_obj, torch.Tensor) - to_module.register_buffer(field, from_obj, persistent=persistent) - elif attr_kind == _AttrKind.CONSTANT: - assert isinstance(from_obj, (torch.Tensor, torch.ScriptObject)) - setattr(to_module, field, from_obj) - - -def _is_prefix(candidate, target): - """Check whether `candidate` is a prefix of `target`.""" - return len(candidate) < len(target) and target[: len(candidate)] == candidate - - -def _compute_accessor(parent_fqn: str, child_fqn: str) -> str: - if parent_fqn == "": - # Handle the root module correctly.
- return child_fqn - - parent_split = parent_fqn.split(".") - child_split = child_fqn.split(".") - - assert ( - child_split[: len(parent_split)] == parent_split - ), f"Child module '{child_fqn}' is not a descendant of parent module '{parent_fqn}'" - return ".".join(child_split[len(parent_split) :]) - - -def _verify_graph_equivalence(x: torch.nn.Module, y: torch.nn.Module): - def graph_dump(graph: torch.fx.Graph) -> str: - ret = [] - nodes_idx: Dict[int, int] = {} - - def arg_dump(arg) -> str: - if isinstance(arg, torch.fx.Node): - return "%" + str(nodes_idx[id(arg)]) - return str(arg) - - for i, node in enumerate(graph.nodes): - args_dump = [str(arg) for arg in pytree.tree_map(arg_dump, node.args)] - args_dump += [ - f"{key}={value}" - for key, value in pytree.tree_map(arg_dump, node.kwargs).items() - ] - target = node.target if node.op == "call_function" else "" - ret.append(f"{i}: {node.op}[{target}]({', '.join(args_dump)})") - nodes_idx[id(node)] = i - return "\n".join(ret) - - assert graph_dump(x.graph) == graph_dump(y.graph) - - -def _add_spec(gm: torch.nn.Module, spec) -> str: - i = 0 - while hasattr(gm, f"_spec_{i}"): - i += 1 - name = f"_spec_{i}" - setattr(gm, name, spec) - return name - - -def _generate_flatten(gm: torch.nn.Module, node, spec) -> torch.fx.Node: - name = _add_spec(gm, spec) - spec_node = gm.graph.get_attr(name) - return gm.graph.call_function(fx_pytree.tree_flatten_spec, (node, spec_node)) - - -def _generate_unflatten(gm: torch.nn.Module, nodes, spec) -> torch.fx.Node: - name = _add_spec(gm, spec) - spec_node = gm.graph.get_attr(name) - return gm.graph.call_function(pytree.tree_unflatten, (nodes, spec_node)) - - -def _add_submodule(mod: torch.nn.Module, target: str, module_to_add: torch.nn.Module): - *prefix, field = target.split(".") - - for item in prefix: - submod = getattr(mod, item, None) - - if submod is None: - submod = torch.nn.Module() - setattr(mod, item, submod) - - if not isinstance(submod, torch.nn.Module): - return False - - mod = submod - - mod.add_module(field, module_to_add) - - -class _ModuleFrame: - def __init__( - self, - flat_graph, - nodes, - seen_nodes, - seen_modules, - parent, - module_stack, - module_id, - module_call_graph: Optional[Dict[str, ModuleCallSignature]] = None, - module: Optional[torch.nn.Module] = None, - ): - self.flat_graph = flat_graph - self.nodes = nodes - self.seen_nodes = seen_nodes - self.seen_modules = seen_modules - self.parent = parent - self.module_stack = module_stack - self.module_id = module_id - - self.module_call_graph = module_call_graph - self.verbose = False - - self.fqn = self.module_stack[-1] - if module is not None: - self.module = module - else: - self.module = InterpreterModule(torch.fx.Graph()) - if self.module_id in self.seen_modules: - self.cached_graph_module = self.seen_modules[self.module_id] - else: - self.cached_graph_module = None - self.seen_modules[self.module_id] = self.module - - self.graph = self.module.graph - - # Mapping of nodes in the flat graph to nodes in this graph. 
- self.node_map: Dict[torch.fx.Node, torch.fx.Node] = {} - self.node_to_placeholder = {} - - self.parent_call_module: Optional[torch.fx.Node] = None - if parent is not None: - accessor = _compute_accessor(parent.fqn, self.fqn) - _add_submodule( - parent.module, - accessor, - self.module - if self.cached_graph_module is None - else self.cached_graph_module, - ) - self.parent_call_module = parent.graph.call_module(accessor) - - signature = self.get_signature() - - if signature is not None and self.parent is not None: - assert signature.in_spec.num_children == 2 - args_spec = signature.in_spec.children_specs[0] - kwargs_spec = signature.in_spec.children_specs[1] - assert args_spec.context is None - assert kwargs_spec.context is not None - - with self.graph.inserting_after(None): - arg_nodes = [] - for idx in range(args_spec.num_children): - arg_nodes.append(self.graph.placeholder(f"_positional_arg_{idx}")) - kwarg_nodes = {} - for name in kwargs_spec.context: - kwarg_nodes[name] = self.graph.placeholder(name) - flat_args = _generate_flatten( - self.module, - (tuple(arg_nodes), kwarg_nodes), - signature.in_spec, - ) - for idx, arg in enumerate(signature.inputs): - flat_arg_node = self.graph.create_node( - op="call_function", - target=operator.getitem, - args=(flat_args, idx), - name=arg.name - if not isinstance(arg, ConstantArgument) - else f"_constant_{idx}", - ) - if isinstance(arg, ConstantArgument): - continue - flat_arg_node.meta = copy.copy(self.seen_nodes[arg.name].meta) - self.node_to_placeholder[self.seen_nodes[arg.name]] = flat_arg_node - - with self.parent.graph.inserting_before(self.parent_call_module): - input_nodes: List[Optional[torch.fx.Node]] = [] - for input in signature.inputs: - if isinstance(input, ConstantArgument) and input.value is None: - input_nodes.append(None) - else: - assert isinstance(input, (TensorArgument, SymIntArgument)) - input_nodes.append( - self.parent.remap_input(self.seen_nodes[input.name]) - ) - - inputs_node = _generate_unflatten( - self.parent.module, - input_nodes, - signature.in_spec, - ) - - args_node = self.parent.graph.call_function( - operator.getitem, (inputs_node, 0) - ) - kwargs_node = self.parent.graph.call_function( - operator.getitem, (inputs_node, 1) - ) - arg_nodes = [ - self.parent.graph.call_function(operator.getitem, (args_node, i)) - for i in range(args_spec.num_children) - ] - kwarg_nodes = { - k: self.parent.graph.call_function( - operator.getitem, (kwargs_node, k) - ) - for k in kwargs_spec.context - } - assert self.parent_call_module is not None - self.parent_call_module.args = tuple(arg_nodes) - self.parent_call_module.kwargs = kwarg_nodes - - def add_placeholder(self, x): - assert x.graph is self.flat_graph - # x is not in subgraph, create a new placeholder for subgraph - with self.graph.inserting_before(None): - placeholder_node = self.graph.placeholder(x.name, type_expr=x.type) - # copy all meta fields, even if some fields might be irrelvant for - # the placeholder node - placeholder_node.meta = copy.copy(x.meta) - self.node_to_placeholder[x] = placeholder_node - - def remap_input(self, x): - assert x.graph is self.flat_graph - if x in self.node_map: - return self.node_map[x] - if x not in self.node_to_placeholder: - self.add_placeholder(x) - if self.parent_call_module is not None: - # Important to *prepend* the output to match how we are - # inserting placeholder nodes. 
- self.parent_call_module.insert_arg(0, self.parent.remap_input(x)) - return self.node_to_placeholder[x] - - def get_signature(self): - if self.module_call_graph is not None: - return self.module_call_graph.get(self.fqn) - return None - - def finalize_outputs(self): - orig_outputs = [] - signature = self.get_signature() - - if signature is not None and self.parent is not None: - for output in signature.outputs: - if isinstance(output, (TensorArgument, SymIntArgument)): - orig_outputs.append(self.seen_nodes[output.name]) - else: - raise RuntimeError( - f"Unsupported data type for output node: {output}" - ) - - tree_out_node = _generate_unflatten( - self.module, - tuple( - self.node_map[self.seen_nodes[output.name]] - for output in orig_outputs - ), - signature.out_spec, - ) - parent_out: Optional[torch.fx.Node] = _generate_flatten( - self.parent.module, self.parent_call_module, signature.out_spec - ) - graph_outputs: Union[torch.fx.Node, List[torch.fx.Node]] = tree_out_node - else: - graph_outputs = [] - # Iterate through nodes we have copied into self.graph. - for orig_node in self.node_map.keys(): - for user_node in orig_node.users: - if user_node.name not in self.seen_nodes: - # external user node, need to expose as an output - orig_outputs.append(orig_node) - graph_outputs.append(self.node_map[orig_node]) - break - - parent_out = self.parent_call_module - if len(graph_outputs) == 1: - graph_outputs = graph_outputs[0] - - assert isinstance(graph_outputs, (list, torch.fx.Node)) - - self.graph.output(graph_outputs) - - # Rewrite outputs in parent module - if parent_out is None: - return - - parent_out.meta["val"] = ( - graph_outputs.meta.get("val") - if isinstance(graph_outputs, torch.fx.Node) - else [o.meta.get("val") for o in graph_outputs] - ) - - if len(orig_outputs) == 1 and signature is None: - self.parent.node_map[orig_outputs[0]] = parent_out - else: - for i, orig_output in enumerate(orig_outputs): - # Use Proxy to record getitem access. - proxy_out = torch.fx.Proxy(parent_out)[i].node # type: ignore[index] - proxy_out.meta["val"] = orig_output.meta.get("val") - self.parent.node_map[orig_output] = proxy_out - - if self.cached_graph_module is not None: - _verify_graph_equivalence(self.cached_graph_module, self.module) - - def copy_node(self, node): - self.print("copying", node.format_node()) - self.node_map[node] = self.graph.node_copy(node, self.remap_input) - self.seen_nodes[node.name] = node - - def run_outer(self): - i = 0 - for node in self.flat_graph.nodes: - self.print(i, node.meta.get("nn_module_stack"), node.format_node()) - i += 1 - - # Copy all graph inputs - node_idx: int = 0 - node = self.nodes[node_idx] - while node.op == "placeholder": - self.copy_node(node) - node_idx += 1 - node = self.nodes[node_idx] - - self.run_from(node_idx) - - # Copy graph outputs - for node in self.flat_graph.nodes: - if node.op == "output": - self.copy_node(node) - - def print(self, *args, **kwargs): - if self.verbose: - print(*args, **kwargs) - - def run_from(self, node_idx): - module_idx = 0 - # Walk through the graph, building up a new graph with the right submodules - while node_idx < len(self.nodes): - node = self.nodes[node_idx] - assert node.op != "placeholder" - - self.print() - self.print("STEP", node_idx, node.format_node()) - self.print(self.module_stack) - if node.op == "output": - if len(self.module_stack) == 1: - # We want the output node of the original graph to be handled - # specially by the outermost stack frame (in run_outer). So - # skip finalization here. 
- return node_idx - - # We've reached the end of the graph. Wrap up all the existing stack frames. - self.finalize_outputs() - return node_idx - - node_module_stack = ( - [path for path, ty in node.meta["nn_module_stack"].values()] - if "nn_module_stack" in node.meta - else self.module_stack - ) - if node_module_stack[: len(self.module_stack)] != self.module_stack: - # This means that the current module is done executing and the - # current node is the beginning of a new module. - # - # In this case, we should finalize this module and return without - # incrementing the node counter. - self.finalize_outputs() - self.print("outlining", self.fqn) - self.print(self.graph) - return node_idx - - assert node_module_stack is not None - - if _is_prefix(self.module_stack, node_module_stack): - # This means that the current node represents the execution of a new - # module. - next_module = node_module_stack[len(self.module_stack)] - self.print("Creating new stack frame for", next_module) - # Run a nested version of module outliner from the current node - # counter. Once it is complete, continue from that point. - node_idx = _ModuleFrame( - self.flat_graph, - self.nodes, - self.seen_nodes, - self.seen_modules, - self, - self.module_stack + [next_module], - list(node.meta["nn_module_stack"].keys())[len(self.module_stack)], - self.module_call_graph, - ).run_from(node_idx) - module_idx += 1 - continue - - # The only remaining possibility is that we are in the right stack - # frame. Copy the node into this frame's graph and increment the node counter. - assert node_module_stack == self.module_stack - self.copy_node(node) - node_idx += 1 +from torch.export.unflatten import _ModuleFrame def _outline_submodules(orig_graph: torch.fx.Graph): @@ -463,80 +18,9 @@ def _outline_submodules(orig_graph: torch.fx.Graph): None, [""], "", + {}, module=new_module, ).run_outer() new_module.graph.lint() new_module.recompile() return new_module - - -def _sink_params( - module: torch.nn.Module, - inputs_to_state: Dict[str, str], - scope: List[str], -): - """Sink params, buffers, and constants from graph inputs into get_attr nodes. - - Exported modules are purely functional, so they pass their parameters and - buffers in as inputs to the graph. - - To replicate eager's semantics, we need to get them from the module state - via get_attr instead. - - module: GraphModule, potentially containining nested submodules. - inputs_to_state: mapping graph input names to the corresponding key in the state_dict. - scope: tracks where we are in the module hierarchy, so that we can emit the - right `getattr(self, "foo.bar")` calls, etc. - """ - # We need to use _modules here instead of named_children(), because we - # explicitly want duplicate modules to show up in the traversal. 
- for name, submodule in module._modules.items(): - _sink_params(cast(torch.nn.Module, submodule), inputs_to_state, scope + [name]) - - if not hasattr(module, "graph"): - # Not all modules have graphs defined, if they are empty modules with no operations (like ParameterList) - return - - graph = module.graph - inputs = list(filter(lambda n: n.op == "placeholder", graph.nodes)) - the_last_input = inputs[-1] - - # Also remove from call_module nodes - call_module_nodes = filter(lambda n: n.op == "call_module", graph.nodes) - for node in call_module_nodes: - node.args = tuple(filter(lambda n: n.name not in inputs_to_state, node.args)) - - for node in inputs: - if node.name not in inputs_to_state: - continue - - if len(node.users) > 0: - state_name = inputs_to_state[node.name].split(".") - # If there's a mismatch beteewn scope name and state name, then there must be multuple scopes - # pointing to the same state name, meaning some modules are shared. In such case, we can simply - # skip updating the current node because another later iteration will take care of this input - # node when the unique match between scope and state name occurs. - # To make sure this always happen, we should enforce the invariant that no placeholder node - # in the unflattened graph appears in inputs_to_state dict, which means all the extra input - # nodes have been handled. - if state_name[: len(scope)] != scope: - continue - attr_path = state_name[len(scope) :] - state_attr = _recursive_getattr(module, attr_path) - assert isinstance(state_attr, (torch.Tensor, torch.ScriptObject)) - - # Make sure the newly created get_attr node is placed after the last placeholder node - with graph.inserting_after(the_last_input): - new_node = graph.create_node("get_attr", ".".join(attr_path)) - - node.replace_all_uses_with(new_node, propagate_meta=True) - graph.erase_node(node) - if isinstance(module, InterpreterModule): - module.finalize() - - -def _recursive_getattr(obj, attr_path): - for attr in attr_path: - obj = getattr(obj, attr) - - return obj diff --git a/torch/distributed/rendezvous.py b/torch/distributed/rendezvous.py index 9e701067d880..19936f910b8a 100644 --- a/torch/distributed/rendezvous.py +++ b/torch/distributed/rendezvous.py @@ -18,6 +18,7 @@ _rendezvous_handlers: Dict[str, Callable[..., Iterator[Tuple[Store, int, int]]]] = {} +__all__ = ["register_rendezvous_handler", "rendezvous"] def register_rendezvous_handler(scheme, handler): """ diff --git a/torch/export/_trace.py b/torch/export/_trace.py index 1ac7fd6b5e9e..c85a82c8c4c5 100644 --- a/torch/export/_trace.py +++ b/torch/export/_trace.py @@ -246,6 +246,12 @@ def _get_param_buffer_mapping( for name, buffer in original_module.named_buffers(remove_duplicate=False): buffer_lookup.setdefault(id(buffer), []).append(name) + # reverse lists so FQN assignment is FIFO wrt model structure + for name, fqns in param_lookup.items(): + param_lookup[name] = fqns[::-1] + for name, fqns in buffer_lookup.items(): + buffer_lookup[name] = fqns[::-1] + param_buffer_table: Dict[str, str] = {} for dynamo_name, dynamo_param in traced_module.named_parameters( remove_duplicate=False @@ -409,6 +415,7 @@ def _export_to_torch_ir( disable_constraint_solver: bool = False, restore_fqn: bool = True, _log_export_usage: bool = True, + same_signature: bool = True, ) -> torch.fx.GraphModule: """ Traces either an nn.Module's forward function or just a callable with PyTorch @@ -439,14 +446,15 @@ def _export_to_torch_ir( tracing_mode="symbolic", disable_constraint_solver=disable_constraint_solver, 
_log_export_usage=_log_export_usage, + same_signature=same_signature, )( *args, **kwargs, ) except (ConstraintViolationError, ValueRangeError) as e: - raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e)) # noqa: TRY200 + raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e)) # noqa: B904 except GuardOnDataDependentSymNode as e: - raise UserError( # noqa: TRY200 + raise UserError( # noqa: B904 UserErrorType.ANTI_PATTERN, f"Consider annotating your code using torch._check*(). {str(e)}", case_name="constrain_as_size_example", @@ -460,7 +468,7 @@ def _export_to_torch_ir( return gm_torch_level -def _export_non_strict( +def _export_to_aten_ir( mod: torch.nn.Module, fake_args, fake_kwargs, @@ -470,6 +478,7 @@ def _export_non_strict( transform=lambda x: x, # TODO(zhxchen17) Revisit if this is needed later. pre_dispatch=False, should_insert_runtime_assertion=False, + _is_torch_jit_trace=False, ): # [NOTE] If the user is exporting under training mode, we want to detect if there is any # state change in the autograd global state and error. If the user is exporting under inference @@ -624,19 +633,21 @@ def make_argument_spec(i, node) -> ArgumentSpec: constants = rewrite_script_object_meta(gm) constants.update(lift_constants_pass(gm, export_graph_signature, constant_attrs)) - # prettify names for placeholder nodes - placeholder_naming_pass( - gm, - export_graph_signature, - mod, - fake_args, - fake_kwargs, - fake_params_buffers, - constants, - ) + # FIXME: Skipping this because traced modules do not have signature yet + if not _is_torch_jit_trace: + # prettify names for placeholder nodes + placeholder_naming_pass( + gm, + export_graph_signature, + mod, + fake_args, + fake_kwargs, + fake_params_buffers, + constants, + ) @dataclasses.dataclass - class _ExportedProgramNonStrict: + class _ExportedArtifact: gm: torch.fx.GraphModule sig: ExportGraphSignature constants: Dict[ @@ -648,7 +659,7 @@ class _ExportedProgramNonStrict: ], ] - return _ExportedProgramNonStrict( + return _ExportedArtifact( gm, export_graph_signature, constants, @@ -881,6 +892,48 @@ def wrapper(*args, **kwargs): return wrapper +def _convert_ts_to_export_experimental(traced_callable, args, kwargs=None): + torch._C._jit_set_texpr_fuser_enabled(False) + + def process_trace_inputs_for_export(example_inputs, example_kwarg_inputs): + if not isinstance(example_inputs, tuple): + example_inputs = (example_inputs,) + + if example_kwarg_inputs is None: + example_kwarg_inputs = {} + return example_inputs, example_kwarg_inputs + + class _WrapperModule(torch.nn.Module): + def __init__(self, f): + super().__init__() + self.f = f + + def forward(self, *args, **kwargs): + return self.f(*args, **kwargs) + + from torch.jit._trace import TopLevelTracedModule + + export_args, export_kwargs = process_trace_inputs_for_export(args, kwargs) + + if isinstance(traced_callable, TopLevelTracedModule): + return _export( + traced_callable, + export_args, + export_kwargs, + strict=False, + _is_torch_jit_trace=True, + ).module() + + else: + return _export( + _WrapperModule(traced_callable), + export_args, + export_kwargs, + strict=False, + _is_torch_jit_trace=True, + ).module() + + @_log_export_wrapper @_disable_prexisiting_fake_mode def _export( @@ -893,6 +946,7 @@ def _export( preserve_module_call_signature: Tuple[str, ...] 
= (), pre_dispatch: bool = False, _disable_forced_specializations: Optional[bool] = False, + _is_torch_jit_trace: bool = False, ) -> ExportedProgram: """ Traces either an nn.Module's forward function or just a callable with PyTorch @@ -961,7 +1015,10 @@ def _export( flat_args, orig_in_spec = pytree.tree_flatten((args, kwargs)) original_state_dict = mod.state_dict(keep_vars=True) - forward_arg_names = _get_forward_arg_names(mod, args, kwargs) + if not _is_torch_jit_trace: + forward_arg_names = _get_forward_arg_names(mod, args, kwargs) + else: + forward_arg_names = None if not strict: out_spec = None @@ -1040,7 +1097,9 @@ def forward(self, *args, **kwargs): fake_kwargs, equalities_inputs, original_signature, - ) = make_fake_inputs(mod, args, kwargs, dynamic_shapes) + ) = make_fake_inputs( + mod, args, kwargs, dynamic_shapes, _is_torch_jit_trace=_is_torch_jit_trace + ) fake_params_buffers = make_fake_params_buffers( fake_mode, _get_params_buffers(mod) @@ -1054,7 +1113,7 @@ def forward(self, *args, **kwargs): new_fake_constant_attrs, map_fake_to_real, ): - ep_non_strict = _export_non_strict( + aten_export_artifact = _export_to_aten_ir( patched_mod, new_fake_args, new_fake_kwargs, @@ -1063,16 +1122,17 @@ def forward(self, *args, **kwargs): pre_dispatch=pre_dispatch, transform=_tuplify_outputs, should_insert_runtime_assertion=not strict, + _is_torch_jit_trace=_is_torch_jit_trace, ) - # ep_non_strict.constants contains only fake script objects, we need to map them back - ep_non_strict.constants = { + # aten_export_artifact.constants contains only fake script objects, we need to map them back + aten_export_artifact.constants = { fqn: map_fake_to_real[obj] if isinstance(obj, FakeScriptObject) else obj - for fqn, obj in ep_non_strict.constants.items() + for fqn, obj in aten_export_artifact.constants.items() } - ep_non_strict.gm.meta["inline_constraints"] = { + aten_export_artifact.gm.meta["inline_constraints"] = { k: v for k, v in fake_mode.shape_env.var_to_range.items() if free_unbacked_symbols(k) @@ -1080,25 +1140,26 @@ def forward(self, *args, **kwargs): num_lifted = len( [ spec - for spec in ep_non_strict.sig.input_specs + for spec in aten_export_artifact.sig.input_specs if spec.kind != InputKind.USER_INPUT ] ) try: produce_guards_and_solve_constraints( fake_mode, - ep_non_strict.gm, + aten_export_artifact.gm, equalities_inputs, original_signature, _disable_forced_specializations=_disable_forced_specializations, + _is_torch_jit_trace=_is_torch_jit_trace, ) except (ConstraintViolationError, ValueRangeError) as e: - raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e)) # noqa: TRY200 + raise UserError(UserErrorType.CONSTRAINT_VIOLATION, str(e)) # noqa: B904 combined_args = _combine_args(mod, args, kwargs) range_constraints = make_constraints( fake_mode, - ep_non_strict.gm, + aten_export_artifact.gm, combined_args, dynamic_shapes, num_lifted, @@ -1106,7 +1167,7 @@ def forward(self, *args, **kwargs): assert out_spec is not None - gm = ep_non_strict.gm + gm = aten_export_artifact.gm gm.meta["forward_arg_names"] = forward_arg_names module_call_signatures = { @@ -1133,25 +1194,30 @@ def forward(self, *args, **kwargs): node.replace_all_uses_with(new_node) gm.graph.erase_node(node) - res = CollectTracepointsPass(module_call_signatures, ep_non_strict.sig)(gm) + res = CollectTracepointsPass( + module_call_signatures, aten_export_artifact.sig + )(gm) assert res is not None gm = res.graph_module - _rewrite_non_persistent_buffers(mod, ep_non_strict.sig, ep_non_strict.constants) + 
_rewrite_non_persistent_buffers( + mod, aten_export_artifact.sig, aten_export_artifact.constants + ) _verify_nn_module_stack(gm) _verify_stack_trace(gm) - _verify_placeholder_names(gm, ep_non_strict.sig) + if not _is_torch_jit_trace: + _verify_placeholder_names(gm, aten_export_artifact.sig) exported_program = ExportedProgram( root=gm, graph=gm.graph, - graph_signature=ep_non_strict.sig, + graph_signature=aten_export_artifact.sig, state_dict=original_state_dict, range_constraints=range_constraints, module_call_graph=_make_module_call_graph( _EXPORT_MODULE_HIERARCHY, orig_in_spec, out_spec, module_call_signatures ), example_inputs=(args, kwargs), - constants=ep_non_strict.constants, + constants=aten_export_artifact.constants, ) return exported_program @@ -1248,7 +1314,7 @@ def forward(self, *args, **kwargs): # NOTE: graph module expects only positional args constant_attrs = _gather_constant_attrs(mod) - ep_non_strict = _export_non_strict( + aten_export_artifact = _export_to_aten_ir( gm_torch_level, _convert_to_positional_args(orig_arg_names, fake_args, fake_kwargs), {}, @@ -1258,9 +1324,9 @@ def forward(self, *args, **kwargs): should_insert_runtime_assertion=not strict, ) - gm = ep_non_strict.gm - export_graph_signature = ep_non_strict.sig - constants = ep_non_strict.constants + gm = aten_export_artifact.gm + export_graph_signature = aten_export_artifact.sig + constants = aten_export_artifact.constants # Don't copy over nn_module_stack, stack_trace metadata for params/buffers nodes for metadata in params_buffers_to_node_meta.values(): @@ -1316,15 +1382,17 @@ def forward(self, *args, **kwargs): _rewrite_dynamo_tensor_constants( orig_mod_buffers=set(mod.buffers()), traced_mod_buffers=dict(gm_torch_level.named_buffers()), - graph_signature=ep_non_strict.sig, - constants=ep_non_strict.constants, + graph_signature=aten_export_artifact.sig, + constants=aten_export_artifact.constants, ) # 2. Restore FQN of param/buffers param_buffer_table: Dict[str, str] = _get_param_buffer_mapping(mod, gm_torch_level) _replace_param_buffer_names(param_buffer_table, export_graph_signature) # 3. Remove non-persistent buffers from the graph signature - _rewrite_non_persistent_buffers(mod, ep_non_strict.sig, ep_non_strict.constants) + _rewrite_non_persistent_buffers( + mod, aten_export_artifact.sig, aten_export_artifact.constants + ) # 4. Rewrite constants to have the same FQN as the original module. 
_remap_constants(constant_attrs, export_graph_signature, constants) diff --git a/torch/export/_unlift.py b/torch/export/_unlift.py index 52ce64e4dcad..2fdb7916eeeb 100644 --- a/torch/export/_unlift.py +++ b/torch/export/_unlift.py @@ -22,7 +22,7 @@ def _check_input_constraints_pre_hook(self, *args, **kwargs): flat_args_with_path, received_spec = pytree.tree_flatten_with_path(args) if received_spec != self._in_spec: - raise ValueError( # noqa: TRY200 + raise ValueError( # noqa: B904 "Trying to flatten user inputs with exported input tree spec: \n" f"{self._in_spec}\n" "but actually got inputs with tree spec of: \n" diff --git a/torch/export/dynamic_shapes.py b/torch/export/dynamic_shapes.py index eba833332344..a4ed16e975b8 100644 --- a/torch/export/dynamic_shapes.py +++ b/torch/export/dynamic_shapes.py @@ -602,18 +602,20 @@ def f(t, *dynamic_shapes): return tree_map(f, tree, *dynamic_shapes, is_leaf=is_leaf) -def _combine_args(f, args, kwargs): +def _combine_args(f, args, kwargs, _is_torch_jit_trace=False): # combine args and kwargs following the signature of f, as it happens # in the body of f when called with *args, **kwargs if isinstance(f, ExportedProgram): f = f.module() - signature = ( - inspect.signature(f.forward) - if isinstance(f, torch.nn.Module) - else inspect.signature(f) - ) - kwargs = kwargs if kwargs is not None else {} - return signature.bind(*args, **kwargs).arguments + if not _is_torch_jit_trace: + signature = ( + inspect.signature(f.forward) + if isinstance(f, torch.nn.Module) + else inspect.signature(f) + ) + kwargs = kwargs if kwargs is not None else {} + return signature.bind(*args, **kwargs).arguments + return args class ShapesCollection: @@ -692,6 +694,7 @@ def _process_dynamic_shapes( args: Tuple[Any, ...], kwargs: Optional[Dict[str, Any]] = None, dynamic_shapes: Optional[Union[Dict[str, Any], Tuple[Any], List[Any]]] = None, + _is_torch_jit_trace=False, ) -> Optional[List[Constraint]]: from torch._dynamo.exc import UserError, UserErrorType @@ -720,7 +723,7 @@ def root_value(): if solution is not None: return int(solution[1]) # type: ignore[call-overload] else: - raise UserError( # noqa: TRY200 + raise UserError( # noqa: B904 UserErrorType.CONSTRAINT_VIOLATION, f"Expected shape[{i}] = {tensor.shape[i]} of input Tensor to be " f"of the form {expr}, where {symbol} is an integer", @@ -858,7 +861,9 @@ def assoc_shape(t, dynamic_shape): _tree_map(assoc_shape, combined_args, dynamic_shapes) - combined_args = _combine_args(f, args, kwargs) + combined_args = _combine_args( + f, args, kwargs, _is_torch_jit_trace=_is_torch_jit_trace + ) if not isinstance(dynamic_shapes, dict): assert isinstance(dynamic_shapes, (tuple, list)) combined_args = type(dynamic_shapes)(combined_args.values()) # type: ignore[assignment, misc] diff --git a/torch/export/exported_program.py b/torch/export/exported_program.py index ea69b5540be3..ffb3467055b3 100644 --- a/torch/export/exported_program.py +++ b/torch/export/exported_program.py @@ -448,7 +448,7 @@ def _postprocess_graph_module_outputs(self, res, orig_args, orig_kwargs): res = pytree.tree_unflatten(res, self.call_spec.out_spec) except Exception: _, received_spec = pytree.tree_flatten(res) - raise error.InternalError( # noqa: TRY200 + raise error.InternalError( # noqa: B904 "Trying to flatten user outputs with exported output tree spec: \n" f"{self.call_spec.out_spec}\n" "but actually got outputs with tree spec of: \n" @@ -547,7 +547,8 @@ def _get_placeholders(gm): placeholders.append(node) return placeholders - decomp_table = decomp_table or 
core_aten_decompositions() + if decomp_table is None: + decomp_table = core_aten_decompositions() old_placeholders = _get_placeholders(self.graph_module) fake_args = [node.meta["val"] for node in old_placeholders] diff --git a/torch/export/unflatten.py b/torch/export/unflatten.py index 14f91ee64679..31701d9fb685 100644 --- a/torch/export/unflatten.py +++ b/torch/export/unflatten.py @@ -172,32 +172,93 @@ def __init__( self.range_constraints = export_module.range_constraints self.equality_constraints: List = [] + # aliasing/unused param or buffer issues: + # in strict-mode export, dynamo export will deduplicate aliased tensors, + # and ignore unused tensors. For aliasing, this causes issues when some aliases + # are unused, and we're unable to match the placeholder node to the correct FQN. + # This leads to the graph signature potentially having the wrong target FQN, + # and downstream issues where parameters are assigned to the wrong target attribute, + # mismatching the relevant placeholder node in the unflattened module. + # To resolve this we restore (_assign_attr) all aliased/unused tensors in + # the state_dict as module attributes, but only keep the used tensors in the + # graph's forward pass (_sink_params). state_dict = export_module.state_dict - for name in self.graph_signature.parameters: - cloned = torch.nn.Parameter(state_dict[name].clone()) + assigned_params: Set[str] = set() # tracking unused params + id_to_param: Dict[int, torch.nn.Parameter] = {} # handling weight-sharing + for name in self.graph_signature.parameters: # this loop adds used params + param = state_dict[name] + if id(param) not in id_to_param: + id_to_param[id(param)] = torch.nn.Parameter(param.clone()) + _assign_attr( - cloned, + id_to_param[id(param)], self, name, attr_kind=_AttrKind.PARAMETER, ) + assigned_params.add(name) non_persistent_buffers = set(self.graph_signature.non_persistent_buffers) - for name in self.graph_signature.buffers: + assigned_buffers: Set[str] = set() # tracking unused buffers + id_to_buffer: Dict[ + int, Tuple[torch.nn.Parameter, bool] + ] = {} # handle weight-sharing + for name in self.graph_signature.buffers: # this loop adds used buffers if name in non_persistent_buffers: persistent = False - cloned = export_module.constants[name].clone() + buffer = export_module.constants[name] else: persistent = True - cloned = state_dict[name].clone() + buffer = state_dict[name] + + if id(buffer) not in id_to_buffer: + id_to_buffer[id(buffer)] = (buffer.clone(), persistent) _assign_attr( - cloned, + id_to_buffer[id(buffer)][0], self, name, attr_kind=_AttrKind.BUFFER, persistent=persistent, ) + assigned_buffers.add(name) + + # restore aliased/unused params and buffers + # these appear in state dict but not graph signature + for name, tensor in state_dict.items(): + if name in assigned_params or name in assigned_buffers: # already assigned + continue + + is_buffer = False + if id(tensor) in id_to_buffer or not isinstance( + tensor, torch.nn.Parameter + ): # aliased buffer + is_buffer = True + + if is_buffer: + if ( + id(tensor) not in id_to_buffer + ): # this is completely unused (not weight-sharing) + id_to_buffer[id(tensor)] = ( + tensor, + True, + ) # assign to respect original model + _assign_attr( + id_to_buffer[id(tensor)][0], + self, + name, + attr_kind=_AttrKind.BUFFER, + persistent=True, + ) + else: + if id(tensor) not in id_to_param: # this is unused + id_to_param[id(tensor)] = tensor + _assign_attr( + id_to_param[id(tensor)], + self, + name, + attr_kind=_AttrKind.PARAMETER, + ) # use 
id map so we don't double-clone aliased constants id_to_const: Dict[int, Union[torch.Tensor, torch._C.ScriptObject]] = {} @@ -223,6 +284,7 @@ def add_to_consts_map(obj_id, node_name, target_name): name_list = consts_map[obj_id] name_list.append((node_name, target_name)) + added_params_buffers: Set[str] = set() # track aliased/unused params, buffers for s in self.graph_signature.input_specs: if s.kind == InputKind.PARAMETER or ( s.kind == InputKind.BUFFER and s.persistent @@ -233,6 +295,7 @@ def add_to_consts_map(obj_id, node_name, target_name): id(export_module.state_dict[s.target]), s.arg.name, s.target ) consts_targets.add(s.target) + added_params_buffers.add(s.target) elif ( (s.kind == InputKind.BUFFER and not s.persistent) or s.kind == InputKind.CONSTANT_TENSOR @@ -253,6 +316,18 @@ def add_to_consts_map(obj_id, node_name, target_name): ), "Constants should be either aliased or appear in graph signature" ph_name, _ = consts_map[id(const)][0] add_to_consts_map(id(const), ph_name, const_name) + added_params_buffers.add(s.target) + + # add aliased/unused params and buffers that don't appear in graph signature + for fqn, tensor in export_module.state_dict.items(): + if fqn not in added_params_buffers: + if id(tensor) not in consts_map: + # completely unused (no weight-sharing), ignore. + # this weight doesn't appear in graph module, + # so won't cause FQN assignment issues + continue + ph_name, _ = consts_map[id(tensor)][0] + add_to_consts_map(id(tensor), ph_name, fqn) # node name -> list of possible targets inputs_to_state: Dict[str, List[str]] = {} diff --git a/torch/fx/experimental/proxy_tensor.py b/torch/fx/experimental/proxy_tensor.py index e8e50993e2e9..9976c4e9beca 100644 --- a/torch/fx/experimental/proxy_tensor.py +++ b/torch/fx/experimental/proxy_tensor.py @@ -59,6 +59,7 @@ CONSTANT_NUMEL_LIMIT = 1 +null_ctx_type = type(nullcontext) # We currently convert all SymInt to proxies before we use them. # This could plausibly be handled at the Dynamo level. pytree.register_pytree_node( @@ -1125,145 +1126,283 @@ def create_node(self, *args, **kwargs): return node +class _MakefxTracer: + + def __init__( + self, + decomposition_table: Optional[Dict[Callable, Callable]], + tracing_mode: str, + _allow_non_fake_inputs: bool, + pre_dispatch: bool, + record_module_stack: bool, + _allow_fake_constant: bool, + _error_on_data_dependent_ops: bool + ): + # Configurations that are used to initialize the context managers and their states. + # They should not be modified during tracing. + self.decomposition_table: Dict[Callable, Callable] = decomposition_table or {} + self.decomposition_table.setdefault(torch.ops.aten.sym_numel.default, torch._decomp.decompositions.sym_numel) + self.tracing_mode: str = tracing_mode + self._allow_non_fake_inputs: bool = _allow_non_fake_inputs + self.pre_dispatch: bool = pre_dispatch + self.record_module_stack: bool = record_module_stack + self._allow_fake_constant: bool = _allow_fake_constant + self._error_on_data_dependent_ops: bool = _error_on_data_dependent_ops + + # All context managers and their states should be initialized before tracing based on the inputs + # and configurations. After tracing, their states should be cleaned except for shape_env. + # Remember to specify how to initialize them from user inputs and from the parent tracer whenever + # adding new modes in _MakefxTracer.
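+        # For example, a hypothetical new mode `my_mode` would be declared below as + #     self.my_mode: Union[null_ctx_type, MyMode] = nullcontext() + # then built in _construct_modes_with_fx_tracer or _init_modes_from_inputs, + # and checkpointed/restored via _checkpoint_modes and _restore_modes.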
+ self.fake_tensor_mode: Union[null_ctx_type, FakeTensorMode] = nullcontext() + self.proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode] = nullcontext() + self.proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode] = nullcontext() + self.fx_tracer: Union[null_ctx_type, Tracer] = nullcontext() + self.python_dispatcher_mode: Union[null_ctx_type, Any] = nullcontext() + self.torch_fn_metadata_mode: Union[null_ctx_type, TorchFunctionMetadataMode] = nullcontext() + + def _checkpoint_modes(self) -> List[Any]: + return [ + self.fake_tensor_mode, + self.proxy_mode, + self.proxy_function_mode, + self.fx_tracer, + self.python_dispatcher_mode, + self.torch_fn_metadata_mode + ] + + def _restore_modes( + self, + prev_fake_tensor_mode: Union[null_ctx_type, FakeTensorMode], + prev_proxy_mode: Union[null_ctx_type, ProxyTorchDispatchMode], + prev_proxy_function_mode: Union[null_ctx_type, PreDispatchTorchFunctionMode], + prev_fx_tracer: Union[null_ctx_type, Tracer], + prev_python_dispatcher_mode: Union[null_ctx_type, Any], + prev_torch_fn_metadata_mode : Union[null_ctx_type, TorchFunctionMetadataMode], + ) -> None: + self.fake_tensor_mode = prev_fake_tensor_mode + self.proxy_mode = prev_proxy_mode + self.proxy_function_mode = prev_proxy_function_mode + self.fx_tracer = prev_fx_tracer + self.python_dispatcher_mode = prev_python_dispatcher_mode + self.torch_fn_metadata_mode = prev_torch_fn_metadata_mode -def make_fx(f, - decomposition_table=None, - tracing_mode="real", - _allow_non_fake_inputs=False, - *, - pre_dispatch=False, - record_module_stack=False, - _allow_fake_constant=False, - _error_on_data_dependent_ops=True): - assert tracing_mode in ["real", "fake", "symbolic"] + @contextmanager + def _init_modes_from_inputs(self, f, args): + prev_modes = self._checkpoint_modes() + try: + # Avoid importing sympy at a module level + from .symbolic_shapes import ShapeEnv + if hasattr(f, "_orig_mod") and self.record_module_stack: + scope_root = f._orig_mod + self.fx_tracer = _ModuleStackTracer(scope_root) + else: + self.fx_tracer = PythonKeyTracer() + + if self.tracing_mode == "fake": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=True, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=ShapeEnv(), + static_shapes=True, + ) + self.fake_tensor_mode = fake_tensor_mode + elif self.tracing_mode == "symbolic": + import torch._dynamo + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) + if fake_tensor_mode is None: + shape_env = ShapeEnv() + import torch._functorch.config as _config + with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): + fake_tensor_mode = FakeTensorMode( + allow_fallback_kernels=False, + allow_non_fake_inputs=self._allow_non_fake_inputs, + shape_env=shape_env) + assert fake_tensor_mode.shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + self.fake_tensor_mode = fake_tensor_mode + else: + if not self.tracing_mode == "real": + raise AssertionError(f"Unexpected tracing type: {self.tracing_mode}") - if decomposition_table is None: - decomposition_table = {} + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) + + def _construct_modes_with_fx_tracer(self, fx_tracer): + self.proxy_mode = ProxyTorchDispatchMode( + fx_tracer, + 
self.tracing_mode, + pre_dispatch=self.pre_dispatch, + _allow_fake_constant=self._allow_fake_constant, + _error_on_data_dependent_ops=self._error_on_data_dependent_ops + ) - if torch.ops.aten.sym_numel.default not in decomposition_table: - decomposition_table = { - **decomposition_table, - torch.ops.aten.sym_numel.default: torch._decomp.decompositions.sym_numel - } + if self.pre_dispatch: + self.proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - @functools.wraps(f) - def wrapped(*args): - # Avoid importing sympy at a module level - from .symbolic_shapes import ShapeEnv + # pre-autograd tracing uses per-dispatch-key modes, + # which requires the python dispatcher + if self.tracing_mode == "symbolic" or self.pre_dispatch: + self.python_dispatcher_mode = enable_python_dispatcher() - phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + self.torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) - if hasattr(f, "_orig_mod") and record_module_stack: - scope_root = f._orig_mod - fx_tracer = _ModuleStackTracer(scope_root) - else: - fx_tracer = PythonKeyTracer() - fake_tensor_mode: Any = nullcontext() - if tracing_mode == "real": - fake_tensor_mode = nullcontext() - elif tracing_mode == "fake": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=True, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=ShapeEnv(), - static_shapes=True, - ) - elif tracing_mode == "symbolic": - import torch._dynamo - fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(args) - if fake_tensor_mode is None: - shape_env = ShapeEnv() - import torch._functorch.config as _config - with _config.patch(fake_tensor_allow_unsafe_data_ptr_access=False): - fake_tensor_mode = FakeTensorMode( - allow_fallback_kernels=False, - allow_non_fake_inputs=_allow_non_fake_inputs, - shape_env=shape_env) - else: - shape_env = fake_tensor_mode.shape_env - assert shape_env is not None, "shape_env should be set if tracing with 'symbolic'" + @contextmanager + def _init_modes_from_parent(self, parent_tracer): + # By default, the subtracer creates new modes based on the parent tracer's config. + # However, there are cases where we want to share the same modes with the parent tracer. + # For example, for fake_tensor_mode we want the fake_mode of example values in the parent graph and in subgraphs to be the same.
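+        # (e.g. a subgraph traced via trace_subgraph should reuse the parent's FakeTensorMode, + # so fake tensors crossing the parent/subgraph boundary keep a single consistent ShapeEnv).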
+ prev_modes = self._checkpoint_modes() + try: + self.fake_tensor_mode = parent_tracer.fake_tensor_mode - else: - raise AssertionError(f"Unexpected tracing type: {tracing_mode}") + def _create_sub_fx_tracer(parent_tracer): + if type(parent_tracer) == PythonKeyTracer: + sub_tracer = PythonKeyTracer() + elif type(parent_tracer) == _ModuleStackTracer: + sub_tracer = _ModuleStackTracer(parent_tracer.scope_root) + else: + raise RuntimeError(f"Unexpected tracer type: {type(parent_tracer)}.") - python_dispatcher_mode: Any = nullcontext() - # pre-autograd tracing uses per-dispatch-key modes, - # which requires the python dispatcher - if tracing_mode == "symbolic" or pre_dispatch: - python_dispatcher_mode = enable_python_dispatcher() - - proxy_function_mode: Any = nullcontext() - if pre_dispatch: - proxy_function_mode = PreDispatchTorchFunctionMode(fx_tracer) - - proxy_mode = ProxyTorchDispatchMode(fx_tracer, - tracing_mode, - pre_dispatch=pre_dispatch, - _allow_fake_constant=_allow_fake_constant, - _error_on_data_dependent_ops=_error_on_data_dependent_ops) - - arg_count = 0 - - def wrap_fake(x): - nonlocal arg_count - # TODO: it would be nice to line these up with the names - # FX will choose for the placeholders, but we don't - # actually know what the names will be at this point yet - # NB: the Source here is actually meaningless - from torch._dynamo.source import ConstantSource - source = ConstantSource(f"input{arg_count}") - if isinstance(x, torch.Tensor): - arg_count += 1 - return fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] - # NB: don't match on bools - elif type(x) is int and tracing_mode == "symbolic": - return shape_env.create_symintnode(shape_env.create_symbol(x, source, positive=None), hint=x, source=source) - elif isinstance(x, torch.ScriptObject): - return torch._library.fake_class_registry.to_fake_obj(fake_tensor_mode, x) - - assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." - return x - - sym_mode = proxy_mode.sym_mode - - wrap_fn_map = { - "real": lambda x: x, - "fake": wrap_fake, - "symbolic": wrap_fake, - } - args = pytree.tree_map(wrap_fn_map[tracing_mode], args) - - if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: - # FX doesn't support varargs, so we gotta fake up a wrapper - # TODO: Would be nice to fix this at the source... 
- func = fake_signature(f, len(phs)) - else: - func = f + return sub_tracer - torch_fn_metadata_mode = TorchFunctionMetadataMode(fx_tracer) + self.fx_tracer = _create_sub_fx_tracer(parent_tracer.fx_tracer) + self._construct_modes_with_fx_tracer(self.fx_tracer) + yield + finally: + self._restore_modes(*prev_modes) + + + def _trace_inner(self, f, *args): + phs = pytree.tree_map(lambda _: fx.PH, args) # type: ignore[attr-defined] + def _wrap_fake(args: Tuple[Any]) -> Tuple[Any]: + arg_count = 0 + + def inner_wrap_fake(x): + nonlocal arg_count + # TODO: it would be nice to line these up with the names + # FX will choose for the placeholders, but we don't + # actually know what the names will be at this point yet + # NB: the Source here is actually meaningless + from torch._dynamo.source import ConstantSource + source = ConstantSource(f"input{arg_count}") + if isinstance(x, torch.Tensor): + arg_count += 1 + return self.fake_tensor_mode.from_tensor(x, source=source) # type: ignore[attr-defined] + # NB: don't match on bools + elif type(x) is int and self.tracing_mode == "symbolic": + return self.fake_tensor_mode.shape_env.create_symintnode( + self.fake_tensor_mode.shape_env.create_symbol(x, source, positive=None), + hint=x, + source=source + ) + elif isinstance(x, torch.ScriptObject): + return torch._library.fake_class_registry.to_fake_obj(self.fake_tensor_mode, x) + + assert not isinstance(x, FakeScriptObject), f"ScriptObject {x} has been fakified. Cannot wrap_fake it again." + return x + + wrap_fn_map = { + "real": lambda x: x, + "fake": inner_wrap_fake, + "symbolic": inner_wrap_fake, + } + return pytree.tree_map(wrap_fn_map[self.tracing_mode], args) + + def _wrap_func(f, phs): + if not hasattr(inspect.unwrap(f), '__code__') or inspect.unwrap(f).__code__.co_flags & inspect.CO_VARARGS: + # FX doesn't support varargs, so we gotta fake up a wrapper + # TODO: Would be nice to fix this at the source... + return fake_signature(f, len(phs)) + return f + + args = _wrap_fake(args) + func = _wrap_func(f, phs) # We disable the autocast cache as the autocast cache causes type conversions on parameters to # check a cache, which introduces untracked tensors into the graph # # We also disable tracing by any other tensor proxy-based tracers except the current. The # purpose of `make_fx` is to produce graphmodules as a side effect; its internal execution is # thus irrelevant to any external functional trace. 
- with decompose(decomposition_table), fake_tensor_mode, python_dispatcher_mode, proxy_function_mode, \ - sym_mode, torch_fn_metadata_mode, proxy_mode, disable_autocast_cache(): - t = dispatch_trace(wrap_key(func, args, fx_tracer, pre_dispatch), tracer=fx_tracer, concrete_args=tuple(phs)) + with decompose(self.decomposition_table), self.fake_tensor_mode, self.python_dispatcher_mode, self.proxy_function_mode, \ + self.proxy_mode.sym_mode, self.torch_fn_metadata_mode, \ + self.proxy_mode, disable_autocast_cache(), _set_make_fx_tracer(self): + t = dispatch_trace( + wrap_key(func, args, self.fx_tracer, self.pre_dispatch), + tracer=self.fx_tracer, + concrete_args=tuple(phs) + ) # TODO: kind of a bad way to do it, should maybe figure out a better way - if tracing_mode == "symbolic": - t.shape_env = shape_env # type: ignore[assignment] + if self.tracing_mode == "symbolic": + t.shape_env = self.fake_tensor_mode.shape_env # type: ignore[assignment] return t - return wrapped + def trace(self, f, *args) -> torch.fx.GraphModule: + with self._init_modes_from_inputs(f, args): + return self._trace_inner(f, *args) + + def trace_subgraph(self, f, *args): + # Create a new tracer based on parent's config + sub_tracer = _MakefxTracer( + self.decomposition_table, + self.tracing_mode, + self._allow_non_fake_inputs, + self.pre_dispatch, + self.record_module_stack, + self._allow_fake_constant, + self._error_on_data_dependent_ops + ) + with sub_tracer._init_modes_from_parent(self): + return sub_tracer._trace_inner(f, *args) + +_CURRENT_MAKE_FX_TRACER : Optional[_MakefxTracer] = None +@contextmanager +def _set_make_fx_tracer(tracer: _MakefxTracer) -> None: + global _CURRENT_MAKE_FX_TRACER + prev_tracer = _CURRENT_MAKE_FX_TRACER + try: + _CURRENT_MAKE_FX_TRACER = tracer + yield + finally: + _CURRENT_MAKE_FX_TRACER = prev_tracer + +def make_fx( + f, + decomposition_table=None, + tracing_mode="real", + _allow_non_fake_inputs=False, + *, + pre_dispatch=False, + record_module_stack=False, + _allow_fake_constant=False, + _error_on_data_dependent_ops=True): + + assert tracing_mode in ["real", "fake", "symbolic"] + + + make_fx_tracer = _MakefxTracer( + decomposition_table, + tracing_mode, + _allow_non_fake_inputs, + pre_dispatch, + record_module_stack, + _allow_fake_constant, + _error_on_data_dependent_ops + ) + + @functools.wraps(f) + def wrapped(*args): + return make_fx_tracer.trace(f, *args) + + return wrapped def get_torch_dispatch_modes(): return torch.utils._python_dispatch._get_current_dispatch_mode_stack() diff --git a/torch/fx/experimental/symbolic_shapes.py b/torch/fx/experimental/symbolic_shapes.py index cffe5e8a92ec..e310d490b77c 100644 --- a/torch/fx/experimental/symbolic_shapes.py +++ b/torch/fx/experimental/symbolic_shapes.py @@ -55,6 +55,7 @@ shape_env_check_state_equal ) from torch.fx.experimental.sym_node import SymNode, SymTypes +from torch._logging import trace_structured, structured # NB: The sym_* functions are used via getattr() and must be imported here. 
from torch import SymBool, SymFloat, SymInt @@ -83,6 +84,9 @@ class GuardOnDataDependentSymNode(RuntimeError): pass +class PendingUnbackedSymbolNotFound(RuntimeError): + pass + import sympy from sympy.printing.str import StrPrinter from sympy.printing.precedence import precedence, PRECEDENCE @@ -602,14 +606,19 @@ def free_unbacked_symbols_with_path( return r symbol_to_path = free_unbacked_symbols_with_path(example_value, ()) - assert not pending, ( - f"pending {pending} not in {example_value} " + - ( + if not peek and pending: + extra = ( repr((example_value.stride(), example_value.storage_offset())) if isinstance(example_value, torch.Tensor) else "" ) - ) + raise PendingUnbackedSymbolNotFound( + f"Pending unbacked symbols {pending} not in returned outputs {example_value} {extra}.\n" + "Did you accidentally call new_dynamic_size() or item() more times " + "than you needed to in your fake implementation?\n" + "For more help, see https://docs.google.com/document/d/1RWrH-3wLEpzR9kCS6gGBNen_-Fs-8PVbWWFE5AcgeWE/edit" + ) + # Why do we have to do some rebinding here? If the original FX node # wasn't a binding site because you had a memo hit, but post # translation you aren't a memo hit anymore, there's now a new binding @@ -2498,7 +2507,7 @@ def _eliminate_unbacked(self, orig_s: sympy.Symbol, new_s: sympy.Expr): def set_unbacked_var_to_val(self, k: sympy.Symbol, v: int) -> None: """Used only when propagate_real_tensors; registers a value for an unbacked symbol, which can be used last resort to resolve hints.""" - self.unbacked_var_to_val[k] = v + self.unbacked_var_to_val[k] = sympy.sympify(v) # Unlike set_replacement, this records a shapeenv event @record_shapeenv_event() @@ -3787,12 +3796,6 @@ def issue_guard(guard: ShapeGuard) -> None: if expr in issued: return - # When propagate_real_tensors is on, we may end up with guards on - # data dependent variables. These guards are unissuable, so just ignore them - if free_unbacked_symbols(expr): - log.warning("propagate_real_tensors: ignoring guard %s", expr) - return - issued.add(expr) try: @@ -4309,6 +4312,18 @@ def size_hint(self, expr: "sympy.Expr", *, allow_none=False): unsound_expr = result_expr.xreplace(self.unbacked_var_to_val) if not unsound_expr.free_symbols: log.warning("propagate_real_tensors size_hint(%s) -> %s", expr, unsound_expr) + trace_structured( + "propagate_real_tensors", + metadata_fn=lambda: { + "expr": repr(expr), + "result": repr(unsound_expr), + "stack": structured.from_traceback(CapturedTraceback.extract(skip=1).summary()), + }, + ) + self.defer_runtime_assert( + sympy.Eq(result_expr, unsound_expr), + f"propagate_real_tensors: {result_expr} == {unsound_expr}" + ) return unsound_expr raise self._make_data_dependent_error(result_expr, expr) @@ -4382,6 +4397,9 @@ def _set_replacement(self, a: "sympy.Symbol", tgt: "sympy.Expr", msg: str) -> No Use this instead of `self.replacements[a] = tgt`. 
""" + if tgt == self.replacements.get(a, None): + return + # Precondition: a == tgt assert isinstance(a, sympy.Symbol) @@ -4472,14 +4490,24 @@ def issubset(x, y): "[%s not subset of %s (size-oblivious conditions)]", a, tgt, msg, tgt_bound_so, src_bound_so) return - if config.print_specializations and isinstance(tgt, (sympy.Integer, sympy.Float)): - # specializing to a constant, which is likely unexpected + if isinstance(tgt, (sympy.Integer, sympy.Float)): + # specializing to a constant, which is likely unexpected (unless + # you specified dynamic=True) + + user_tb = TracingContext.extract_stack() + trace_structured( + "symbolic_shape_specialization", + metadata_fn=lambda: { + "symbol": repr(a), + "sources": [s.name() for s in self.var_to_sources[a]], + "value": repr(tgt), + "reason": msg, + "stack": structured.from_traceback(CapturedTraceback.extract(skip=1).summary()), + "user_stack": structured.from_traceback(user_tb) if user_tb else None, + } + ) - # NOTE(avik): It is possible that we try logging the same specialization multiple times, e.g., - # when adding a to self.replacements, and again when simplifying an expression containing a. - # Thus to avoid duplication, checking whether a is in self.replacements isn't enough; if it is, - # it must not already map to `tgt`. Fortunately this check is cheap because `tgt` is a constant. - if a not in self.replacements or tgt != self.replacements[a]: + if config.print_specializations: self.log.warning("Specializing %s to %s", self.var_to_sources[a][0].name(), tgt) self.log.debug("SPECIALIZATION", stack_info=True) log.info("set_replacement %s = %s (%s) %s", a, tgt, msg, tgt_bound) @@ -4823,6 +4851,8 @@ def compute_concrete_val(): assert static_expr == hint, f"{static_expr} != {hint}" return static_expr + transmute_into_runtime_assert = False + concrete_val = None if not (expr.free_symbols <= self.var_to_val.keys()): # TODO: dedupe this with _maybe_evaluate_static @@ -4843,6 +4873,15 @@ def compute_concrete_val(): not (unsound_result := orig_expr.xreplace(self.unbacked_var_to_val)).free_symbols ): log.warning("propagate_real_tensors evaluate_expr(%s) -> %s", orig_expr, unsound_result) + trace_structured( + "propagate_real_tensors", + metadata_fn=lambda: { + "expr": repr(orig_expr), + "result": repr(unsound_result), + "stack": structured.from_traceback(CapturedTraceback.extract(skip=1).summary()), + }, + ) + transmute_into_runtime_assert = True concrete_val = unsound_result else: raise self._make_data_dependent_error( @@ -4873,6 +4912,13 @@ def compute_concrete_val(): else: g = sympy.Eq(expr, concrete_val) # type: ignore[arg-type] + if transmute_into_runtime_assert: + self.defer_runtime_assert( + g, + f"propagate_real_tensors: {orig_expr} == {unsound_result}" + ) + return concrete_val + if isinstance(g, sympy.Rel): # TODO: If we successfully eliminate a symbol via equality, it # is not actually necessary to save a guard for the equality, diff --git a/torch/fx/graph_module.py b/torch/fx/graph_module.py index 95a9f568443e..fa44b6306786 100644 --- a/torch/fx/graph_module.py +++ b/torch/fx/graph_module.py @@ -310,7 +310,7 @@ def __call__(self, obj, *args, **kwargs): _WrappedCall._generate_error_message(topmost_framesummary), file=sys.stderr, ) - raise e.with_traceback(None) # noqa: TRY200 + raise e.with_traceback(None) # noqa: B904 else: raise e diff --git a/torch/fx/passes/infra/partitioner.py b/torch/fx/passes/infra/partitioner.py index 7b36918928d3..3952bb652517 100644 --- a/torch/fx/passes/infra/partitioner.py +++ 
b/torch/fx/passes/infra/partitioner.py @@ -262,10 +262,14 @@ def _update_partition_map(node: Node, id: int): return [partition for partition in partitions_by_id.values() if partition.size() > 0] - def fuse_partitions(self, partitions: List[Partition]) -> GraphModule: + def fuse_partitions(self, partitions: List[Partition], prefix: str = "fused_") -> GraphModule: logger.debug("Fusing partitions...") # fuse_by_partitions expects partitions in List[List[Node]]: [ [node0, node1], [node2, node3] ] - return fuse_by_partitions(self.graph_module, [list(partition.nodes) for partition in partitions]) + return fuse_by_partitions( + self.graph_module, + [list(partition.nodes) for partition in partitions], + prefix=prefix, + ) # remove non-compute-ops that sits at the boundary of a partition. def remove_bookend_non_compute_ops(self, partitions: List[Partition]): @@ -323,7 +327,7 @@ def is_transparent_output_node(node: Node, partition: Set[Node], removed_nodes: if len(remove_node) != 0: partition.nodes = partition.nodes - remove_node - def partition_and_fuse(self) -> GraphModule: + def partition_and_fuse(self, prefix: str = "fused_") -> GraphModule: partitions = self.propose_partitions() - fused_gm = self.fuse_partitions(partitions) + fused_gm = self.fuse_partitions(partitions, prefix=prefix) return fused_gm diff --git a/torch/fx/passes/net_min_base.py b/torch/fx/passes/net_min_base.py index 9d24162500ac..6d050c78f754 100644 --- a/torch/fx/passes/net_min_base.py +++ b/torch/fx/passes/net_min_base.py @@ -373,7 +373,7 @@ def _run_and_compare( self._store_outputs(a_result, b_result, submodule) except Exception as e: report.append(f"Exception raised when running {submod_name}: {e}") - raise FxNetMinimizerRunFuncError( # noqa: TRY200 + raise FxNetMinimizerRunFuncError( # noqa: B904 f"Exception raised when running {submod_name}: {e}" ) diff --git a/torch/fx/passes/utils/fuser_utils.py b/torch/fx/passes/utils/fuser_utils.py index 8976690ed73a..3423ea3dad5a 100644 --- a/torch/fx/passes/utils/fuser_utils.py +++ b/torch/fx/passes/utils/fuser_utils.py @@ -218,11 +218,11 @@ def erase_nodes(gm: GraphModule, nodes: NodeList): @compatibility(is_backward_compatible=False) -def fuse_by_partitions(gm: GraphModule, partitions: List[NodeList]) -> GraphModule: +def fuse_by_partitions(gm: GraphModule, partitions: List[NodeList], prefix: str = "fused_") -> GraphModule: for partition_id, nodes in enumerate(partitions): sorted_nodes = topo_sort(nodes) - submodule_name = "fused_" + str(partition_id) + submodule_name = prefix + str(partition_id) sub_gm, orig_inputs, orig_outputs = fuse_as_graphmodule(gm, sorted_nodes, submodule_name) insert_subgm(gm, sub_gm, orig_inputs, orig_outputs) diff --git a/torch/nn/attention/_flex_attention.py b/torch/nn/attention/_flex_attention.py index ee131dfac852..c56374fcbc40 100644 --- a/torch/nn/attention/_flex_attention.py +++ b/torch/nn/attention/_flex_attention.py @@ -83,6 +83,9 @@ def score_mod( """ if torch.compiler.is_dynamo_compiling(): + # always mark head_dim as static + for x in [query, key, value]: + torch._dynamo.mark_static(x, -1) out, _ = flex_attention_hop(query, key, value, score_mod) return out @@ -93,6 +96,8 @@ def score_mod( raise ValueError( "NYI: The target sequence length (L) of the query tensor must match the source sequence length (S) of the key tensor."
) + if query.size(-2) % 128 != 0: + raise ValueError("NYI: S and L must be a multiple of 128") if not torch._dynamo.is_dynamo_supported(): raise RuntimeError("flex_attention requires dynamo support.") @@ -146,7 +151,7 @@ def _rel_causal( token_q: torch.Tensor, token_kv: torch.Tensor, ) -> torch.Tensor: - return torch.where(token_q <= token_kv, score + (token_q - token_kv), float("-inf")) + return torch.where(token_q >= token_kv, score + (token_q - token_kv), float("-inf")) def _generate_alibi_bias(num_heads: int): diff --git a/torch/nn/functional.py b/torch/nn/functional.py index 57dd5905e6fe..a1d2a846e75e 100644 --- a/torch/nn/functional.py +++ b/torch/nn/functional.py @@ -5016,6 +5016,24 @@ def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0. .. warning:: This function is beta and subject to change. +.. warning:: + + This function always applies dropout according to the specified ``dropout_p`` argument. + To disable dropout during evaluation, be sure to pass a value of ``0.0`` when the module + that makes the function call is not in training mode. + + For example: + + .. code-block:: python + + class MyModel(nn.Module): + def __init__(self, p=0.5): + super().__init__() + self.p = p + + def forward(self, ...): + return F.scaled_dot_product_attention(..., dropout_p=(self.p if self.training else 0.0)) + Note: There are currently three supported implementations of scaled dot product attention: diff --git a/torch/nn/utils/_named_member_accessor.py b/torch/nn/utils/_named_member_accessor.py index 3a82b2b426aa..e46318b0d3ac 100644 --- a/torch/nn/utils/_named_member_accessor.py +++ b/torch/nn/utils/_named_member_accessor.py @@ -147,7 +147,7 @@ def get_submodule(self, name: str) -> "torch.nn.Module": f"{module._get_name()} has no attribute `{attr}`" ) from ex if not isinstance(submodule, torch.nn.Module): - raise TypeError( # noqa: TRY200 + raise TypeError( # noqa: B904 f"submodule `{name}`: {submodule} is not an instance of torch.nn.Module" ) self.memo[name] = submodule diff --git a/torch/onnx/_internal/fx/decomposition_skip.py b/torch/onnx/_internal/fx/decomposition_skip.py index 425e8604468b..7fb971a3307a 100644 --- a/torch/onnx/_internal/fx/decomposition_skip.py +++ b/torch/onnx/_internal/fx/decomposition_skip.py @@ -71,7 +71,7 @@ def register_custom_op(cls): new_op_qualname = f"{_NEW_OP_NAMESPACE}::{cls.new_op_name}" torch.library.define(new_op_qualname, cls.new_op_schema) torch.library.impl(new_op_qualname, "default", cls.replacement) - torch.library.impl_abstract(new_op_qualname, cls.abstract) + torch.library.register_fake(new_op_qualname, cls.abstract) @classmethod def replacement(cls, *args, **kwargs): diff --git a/torch/onnx/_internal/onnx_proto_utils.py b/torch/onnx/_internal/onnx_proto_utils.py index b33b4a77f469..278af3feacc6 100644 --- a/torch/onnx/_internal/onnx_proto_utils.py +++ b/torch/onnx/_internal/onnx_proto_utils.py @@ -98,7 +98,7 @@ def load_test_case(dir: str) -> Tuple[bytes, Any, Any]: """ try: import onnx - from onnx import numpy_helper + from onnx import numpy_helper # type: ignore[attr-defined] except ImportError as exc: raise ImportError( "Load test case from ONNX format failed: Please install ONNX." @@ -134,7 +134,7 @@ def export_data(data, value_info_proto, f: str) -> None: f: The file to write the data to. """ try: - from onnx import numpy_helper + from onnx import numpy_helper # type: ignore[attr-defined] except ImportError as exc: raise ImportError( "Export data to ONNX format failed: Please install ONNX." 
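The `torch.library.impl_abstract` call sites in this diff all move to the renamed `torch.library.register_fake` API. For reference, a minimal sketch of the pattern, using a hypothetical `mylib::add_one` operator (the namespace and op are illustrative, not part of this change):

import torch

lib = torch.library.Library("mylib", "DEF")  # hypothetical namespace for the sketch
lib.define("add_one(Tensor x) -> Tensor")
torch.library.impl("mylib::add_one", "default", lambda x: x + 1)

@torch.library.register_fake("mylib::add_one", lib=lib)
def _(x):
    # the fake kernel only describes output metadata (shape/dtype); no real compute
    return torch.empty_like(x)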
diff --git a/torch/optim/adagrad.py b/torch/optim/adagrad.py index 0ed8acfac1c6..6ce7d850a530 100644 --- a/torch/optim/adagrad.py +++ b/torch/optim/adagrad.py @@ -2,7 +2,7 @@ import torch from torch import Tensor - +from torch.utils._foreach_utils import _get_fused_kernels_supported_devices from .optimizer import ( _default_to_fused_or_foreach, _differentiable_doc, @@ -32,6 +32,7 @@ def __init__( *, maximize: bool = False, differentiable: bool = False, + fused: Optional[bool] = None, ): if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") @@ -55,13 +56,41 @@ def __init__( foreach=foreach, maximize=maximize, differentiable=differentiable, + fused=fused, ) super().__init__(params, defaults) + if fused: + if differentiable: + raise RuntimeError("`fused` does not support `differentiable`") + self._step_supports_amp_scaling = True + fused_supported_devices = _get_fused_kernels_supported_devices() + # CUDA is not supported yet + fused_supported_devices.remove("cuda") + if not all( + p.device.type in fused_supported_devices and torch.is_floating_point(p) + for pg in self.param_groups + for p in pg["params"] + ): + raise RuntimeError( + "`fused=True` requires all the params to be floating point Tensors of " + f"supported devices: {fused_supported_devices}." + ) + if foreach: + raise RuntimeError("`fused` and `foreach` cannot be `True` together.") + for group in self.param_groups: for p in group["params"]: state = self.state[p] - state["step"] = torch.tensor(0.0, dtype=_get_scalar_dtype()) + state["step"] = ( + torch.zeros( + (), + dtype=_get_scalar_dtype(is_fused=group["fused"]), + device=p.device, + ) + if group["fused"] + else torch.tensor(0.0, dtype=_get_scalar_dtype()) + ) init_value = ( complex(initial_accumulator_value, initial_accumulator_value) if torch.is_complex(p) @@ -73,10 +102,14 @@ def __init__( def __setstate__(self, state): super().__setstate__(state) + # define "fused" here to avoid the + # MYPY error: Name "fused" may be undefined + fused = None for group in self.param_groups: group.setdefault("foreach", None) group.setdefault("maximize", False) group.setdefault("differentiable", False) + fused = group.setdefault("fused", None) state_values = list(self.state.values()) step_is_tensor = (len(state_values) != 0) and torch.is_tensor( @@ -84,7 +117,9 @@ def __setstate__(self, state): ) if not step_is_tensor: for s in state_values: - s["step"] = torch.tensor(float(s["step"]), dtype=_get_scalar_dtype()) + s["step"] = torch.tensor( + float(s["step"]), dtype=_get_scalar_dtype(is_fused=fused) + ) def share_memory(self): for group in self.param_groups: @@ -144,6 +179,9 @@ def step(self, closure=None): maximize=group["maximize"], differentiable=group["differentiable"], has_complex=has_complex, + fused=group["fused"], + grad_scale=getattr(self, "grad_scale", None), + found_inf=getattr(self, "found_inf", None), ) return loss @@ -190,7 +228,10 @@ def step(self, closure=None): {_foreach_doc} {_maximize_doc} {_differentiable_doc} - + fused (bool, optional): whether the fused implementation (CPU only) is used. + Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` + are supported. (default: None). Please note that the fused implementation does not + support sparse or complex gradients. ..
_Adaptive Subgradient Methods for Online Learning and Stochastic Optimization: http://jmlr.org/papers/v12/duchi11a.html @@ -203,6 +244,9 @@ def adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], + fused: Optional[bool] = None, + grad_scale: Optional[Tensor] = None, + found_inf: Optional[Tensor] = None, # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627 # setting these as kwargs for now as functional API is compiled by torch/distributed/optim has_sparse_grad: bool = False, @@ -225,15 +269,28 @@ def adagrad( "API has changed, `state_steps` argument must contain a list of singleton tensors" ) - if foreach is None: + # Respect when the user inputs False/True for foreach or fused. We only want to change + # the default when neither have been user-specified. Note that we default to foreach + # and pass False to use_fused. This is not a mistake--we want to give the fused impl + # bake-in time before making it the default, even if it is typically faster. + if fused is None and foreach is None: _, foreach = _default_to_fused_or_foreach( params, differentiable, use_fused=False ) + if fused is None: + fused = False + if foreach is None: + foreach = False + if foreach and torch.jit.is_scripting(): raise RuntimeError("torch.jit.script not supported with foreach optimizers") + if fused and torch.jit.is_scripting(): + raise RuntimeError("torch.jit.script not supported with fused optimizers") - if foreach and not torch.jit.is_scripting(): + if fused and not torch.jit.is_scripting(): + func = _fused_adagrad + elif foreach and not torch.jit.is_scripting(): func = _multi_tensor_adagrad else: func = _single_tensor_adagrad @@ -251,6 +308,8 @@ def adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, + grad_scale=grad_scale, + found_inf=found_inf, ) @@ -266,6 +325,8 @@ def _single_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], + grad_scale: Optional[Tensor], + found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -276,6 +337,7 @@ def _single_tensor_adagrad( differentiable: bool, has_complex: bool, ): + assert grad_scale is None and found_inf is None for param, grad, state_sum, step_t in zip(params, grads, state_sums, state_steps): # update step step_t += 1 @@ -324,6 +386,8 @@ def _multi_tensor_adagrad( grads: List[Tensor], state_sums: List[Tensor], state_steps: List[Tensor], + grad_scale: Optional[Tensor], + found_inf: Optional[Tensor], *, lr: float, weight_decay: float, @@ -335,6 +399,7 @@ def _multi_tensor_adagrad( has_complex: bool, ): assert not differentiable, "_foreach ops don't support autograd" + assert grad_scale is None and found_inf is None # Foreach functions will throw errors if given empty lists if len(params) == 0: @@ -367,6 +432,8 @@ def _multi_tensor_adagrad( maximize=maximize, differentiable=differentiable, has_complex=has_complex, + grad_scale=grad_scale, + found_inf=found_inf, ) continue @@ -414,3 +481,76 @@ def _multi_tensor_adagrad( numerator = torch._foreach_mul(device_grads, minus_clr) # type: ignore[assignment] torch._foreach_addcdiv_(device_params, numerator, std) + + +def _fused_adagrad( + params: List[Tensor], + grads: List[Tensor], + state_sums: List[Tensor], + state_steps: List[Tensor], + grad_scale: Optional[Tensor], + found_inf: Optional[Tensor], + *, + lr: float, + weight_decay: float, + lr_decay: float, + eps: float, + has_sparse_grad: bool, + maximize: bool, + differentiable: bool, + has_complex: bool, +) -> None: 
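+    # The fused path is selected by constructing the optimizer with fused=True + # (CPU-only here, per the device check in __init__); an illustrative use: + #     optim = torch.optim.Adagrad(model.parameters(), lr=1e-2, fused=True) + #     loss.backward(); optim.step() + # grad_scale and found_inf are only populated when stepping under an AMP GradScaler.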
+    if not params: + return + if has_sparse_grad or has_complex: + raise RuntimeError("`fused` does not support sparse grad or complex param") + + if differentiable: + raise RuntimeError( + "adagrad with fused=True does not support differentiable=True" + ) + + grad_scale_dict = ( + {grad_scale.device: grad_scale} if grad_scale is not None else None + ) + found_inf_dict = {found_inf.device: found_inf} if found_inf is not None else None + + grouped_tensors = Optimizer._group_tensors_by_device_and_dtype( + [params, grads, state_sums, state_steps] + ) + for (device, _), ( + ( + device_params, + device_grads, + device_state_sums, + device_state_steps, + ), + _, + ) in grouped_tensors.items(): + device_grad_scale, device_found_inf = None, None + if grad_scale is not None and grad_scale_dict is not None: + if device not in grad_scale_dict: + grad_scale_dict[device] = grad_scale.to(device, non_blocking=True) # type: ignore[index] + device_grad_scale = grad_scale_dict[device] # type: ignore[index] + if found_inf is not None and found_inf_dict is not None: + if device not in found_inf_dict: + found_inf_dict[device] = found_inf.to(device, non_blocking=True) # type: ignore[index] + device_found_inf = found_inf_dict[device] # type: ignore[index] + torch._foreach_add_(device_state_steps, 1) + torch._fused_adagrad_( + device_params, + device_grads, + device_state_sums, + device_state_steps, + lr=lr, + lr_decay=lr_decay, + weight_decay=weight_decay, + eps=eps, + maximize=maximize, + grad_scale=device_grad_scale, + found_inf=device_found_inf, + ) + if device_found_inf is not None: + torch._foreach_sub_( + device_state_steps, [device_found_inf] * len(device_state_steps) + ) diff --git a/torch/optim/asgd.py b/torch/optim/asgd.py index a87aadc81803..f53f8b427e9f 100644 --- a/torch/optim/asgd.py +++ b/torch/optim/asgd.py @@ -22,13 +22,6 @@ __all__ = ["ASGD", "asgd"] -def _to_tensor(x, device=None): - if not isinstance(x, torch.Tensor): - return torch.tensor(x, device=device) - - return x - - class ASGD(Optimizer): def __init__( self, @@ -264,9 +257,9 @@ def _single_tensor_asgd( mu.copy_(1 / torch.maximum(step_t - t0, torch.ones_like(step_t))) else: step = _get_value(step_t) - new_eta = _to_tensor(lr / ((1 + lambd * lr * step) ** alpha)) + new_eta = torch.as_tensor(lr / ((1 + lambd * lr * step) ** alpha)) eta.copy_(new_eta) - new_mu = _to_tensor(1 / max(1, step - t0)) + new_mu = torch.as_tensor(1 / max(1, step - t0)) mu.copy_(new_mu) @@ -381,27 +374,23 @@ def _multi_tensor_asgd( torch._foreach_copy_(grouped_mus, new_mus) del new_mus - # update eta = lr / (1 + lambd * lr * step^alpha) - new_etas = torch._foreach_pow(grouped_state_steps, alpha) - torch._foreach_mul_(new_etas, lambd) + # update eta = lr / ((1 + lambd * lr * step)^alpha) + new_etas = torch._foreach_mul(grouped_state_steps, lambd) torch._foreach_mul_(new_etas, lr) torch._foreach_add_(new_etas, 1) + torch._foreach_pow_(new_etas, alpha) torch._foreach_reciprocal_(new_etas) torch._foreach_mul_(new_etas, lr) torch._foreach_copy_(grouped_etas, new_etas) else: - step = grouped_state_steps[0].item() - new_etas = [] - new_mus = [] - - for i in range(len(grouped_mus)): - new_eta = _to_tensor( - lr / (1 + lambd * lr * step**alpha), device=device - ) - new_etas.append(new_eta) - new_mu = _to_tensor(1 / max(1, step - t0), device=device) - new_mus.append(new_mu) - + new_etas = [ + torch.as_tensor(lr / ((1 + lambd * lr * step) ** alpha), device=device) + for step in grouped_state_steps + ] + new_mus = [ + torch.as_tensor(1 / max(1, _get_value(step) - t0),
device=device) + for step in grouped_state_steps + ] torch._foreach_copy_(grouped_etas, new_etas) torch._foreach_copy_(grouped_mus, new_mus) diff --git a/torch/optim/optimizer.py b/torch/optim/optimizer.py index 1b76f6287af3..e0e86d588758 100644 --- a/torch/optim/optimizer.py +++ b/torch/optim/optimizer.py @@ -234,7 +234,7 @@ def _get_capturable_supported_devices(supports_xla: bool = True) -> List[str]: being a tensorlist vs just one tensor. If memory is prohibitive, batch fewer parameters through the optimizer at a time or switch this flag to False (default: None)""" -_fused_doc = r"""fused (bool, optional): whether the fused implementation (CUDA only) is used. +_fused_doc = r"""fused (bool, optional): whether the fused implementation is used. Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16` are supported. (default: None) diff --git a/torch/serialization.py b/torch/serialization.py index df839408ee77..a7703b9964d0 100644 --- a/torch/serialization.py +++ b/torch/serialization.py @@ -1,4 +1,5 @@ import difflib +import functools import os import io import shutil @@ -31,7 +32,7 @@ STORAGE_KEY_SEPARATOR = ',' FILE_LIKE: TypeAlias = Union[str, os.PathLike, BinaryIO, IO[bytes]] -MAP_LOCATION: TypeAlias = Optional[Union[Callable[[torch.Tensor, str], torch.Tensor], torch.device, str, Dict[str, str]]] +MAP_LOCATION: TypeAlias = Optional[Union[Callable[[Storage, str], Storage], torch.device, str, Dict[str, str]]] STORAGE: TypeAlias = Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage] IS_WINDOWS = sys.platform == "win32" @@ -58,6 +59,9 @@ 'LoadEndianness', 'get_default_load_endianness', 'set_default_load_endianness', + 'clear_safe_globals', + 'get_safe_globals', + 'add_safe_globals', ] @@ -147,6 +151,27 @@ def set_default_mmap_options(flags: int): f"expected mmap.MAP_PRIVATE or mmap.MAP_SHARED, but got {flags}") _default_mmap_options = flags +def clear_safe_globals() -> None: + ''' + Clears the list of globals that are safe for ``weights_only`` load. + ''' + _weights_only_unpickler._clear_safe_globals() + +def get_safe_globals() -> List[Any]: + ''' + Returns the list of user-added globals that are safe for ``weights_only`` load. + ''' + return _weights_only_unpickler._get_safe_globals() + +def add_safe_globals(safe_globals: List[Any]) -> None: + ''' + Marks the given globals as safe for ``weights_only`` load. + + Args: + safe_globals (List[Any]): list of globals to mark as safe + ''' + _weights_only_unpickler._add_safe_globals(safe_globals) + def _is_zipfile(f) -> bool: # This is a stricter implementation than zipfile.is_zipfile(). 
# zipfile.is_zipfile() is True if the magic number appears anywhere in the @@ -252,14 +277,6 @@ def _cpu_tag(obj): return 'cpu' -def _cuda_tag(obj): - if obj.device.type == 'cuda': - return 'cuda:' + str(obj.device.index) - -def _hpu_tag(obj): - if obj.device.type == 'hpu': - return 'hpu:' + str(obj.device.index) - def _mps_tag(obj): if obj.device.type == 'mps': return 'mps' @@ -270,8 +287,9 @@ def _meta_tag(obj): return 'meta' -def _privateuse1_tag(obj): - backend_name = torch._C._get_privateuse1_backend_name() +def _backend_tag(backend_name, obj): + if backend_name == 'privateuse1': + backend_name = torch._C._get_privateuse1_backend_name() if obj.device.type == backend_name: if obj.device.index is None: return backend_name @@ -284,66 +302,6 @@ def _cpu_deserialize(obj, location): return obj -def validate_cuda_device(location): - device = torch.cuda._utils._get_device_index(location, True) - - if not torch.cuda.is_available(): - raise RuntimeError('Attempting to deserialize object on a CUDA ' - 'device but torch.cuda.is_available() is False. ' - 'If you are running on a CPU-only machine, ' - 'please use torch.load with map_location=torch.device(\'cpu\') ' - 'to map your storages to the CPU.') - device_count = torch.cuda.device_count() - if device >= device_count: - raise RuntimeError('Attempting to deserialize object on CUDA device ' - f'{device} but torch.cuda.device_count() is {device_count}. Please use ' - 'torch.load with map_location to map your storages ' - 'to an existing device.') - return device - - -def _cuda_deserialize(obj, location): - if location.startswith('cuda'): - device = validate_cuda_device(location) - if getattr(obj, "_torch_load_uninitialized", False): - with torch.cuda.device(device): - return torch.UntypedStorage(obj.nbytes(), device=torch.device(location)) - else: - return obj.cuda(device) - - -def validate_hpu_device(location): - hpu = getattr(torch, "hpu", None) - assert hpu is not None, "HPU device module is not loaded" - device = hpu._utils._get_device_index(location, optional=True) - - if not hpu.is_available(): - raise RuntimeError('Attempting to deserialize object on a HPU ' - 'device but torch.hpu.is_available() is False. ' - 'If you are running on a CPU-only machine, ' - 'please use torch.load with map_location=torch.device(\'cpu\') ' - 'to map your storages to the CPU.') - device_count = hpu.device_count() - if device >= device_count: - raise RuntimeError('Attempting to deserialize object on HPU device ' - f'{device} but torch.hpu.device_count() is {device_count}. 
Please use ' - 'torch.load with map_location to map your storages ' - 'to an existing device.') - return device - - -def _hpu_deserialize(obj, location): - if location.startswith('hpu'): - hpu = getattr(torch, "hpu", None) - assert hpu is not None, "HPU device module is not loaded" - device = validate_hpu_device(location) - if getattr(obj, "_torch_load_uninitialized", False): - with hpu.device(device): - return torch.UntypedStorage(obj.nbytes(), device=torch.device(location)) - else: - return obj.hpu(device) - - def _mps_deserialize(obj, location): if location.startswith('mps'): return obj.mps() @@ -354,18 +312,18 @@ def _meta_deserialize(obj, location): return torch.UntypedStorage(obj.nbytes(), device='meta') -def _validate_privateuse1_device(location, backend_name): +def _validate_device(location, backend_name): ''' - Check whether the device index of privateuse1 is valid + Check whether the device index of the specified backend is valid - Register a device_module of privateuse1 by torch._register_device_module. - Implement the following methods in device_module like cuda: - device_module._utils._get_device_index(location, True), + In the case of the privateuse1 backend, you must first register a device_module for + privateuse1 using torch._register_device_module. Implement the following + methods in device_module like cuda: device_module._utils._get_device_index(location, True), device_module.device_count(). Args: location: string of device - backend_name: the name of privateuse1, which can be renamed + backend_name: the backend name or the name of privateuse1, which can be renamed Returns: device_index: int @@ -378,6 +336,7 @@ def _validate_privateuse1_device(location, backend_name): device_module = getattr(torch, backend_name) if hasattr(device_module, '_utils') and hasattr(device_module._utils, '_get_device_index'): device_index = device_module._utils._get_device_index(location, True) + device = torch.device(backend_name, device_index) else: device = torch.device(location) device_index = device.index if device.index else 0 @@ -394,29 +353,32 @@ def _validate_privateuse1_device(location, backend_name): f'{device_index} but torch.{backend_name}.device_count() is {device_count}. ' 'Please use torch.load with map_location to map your storages ' 'to an existing device.') - return device_index + return device + + +def validate_cuda_device(location): + return _validate_device(location, 'cuda').index + +def validate_hpu_device(location): + return _validate_device(location, 'hpu').index -def _privateuse1_deserialize(obj, location): - backend_name = torch._C._get_privateuse1_backend_name() + +def _deserialize(backend_name, obj, location): + if backend_name == 'privateuse1': + backend_name = torch._C._get_privateuse1_backend_name() if location.startswith(backend_name): - if not hasattr(obj, backend_name): - raise RuntimeError(f'Attempting to load the storages to the {backend_name.upper()} device ' - f'but torch.storage._StorageBase.{backend_name}() or ' - f'torch.storage.TypedStorage.{backend_name}() is not generated.
' - 'Please use torch.utils.generate_methods_for_privateuse1_backend ' - f'to generate storage.{backend_name}() method first.') - device_index = _validate_privateuse1_device(location, backend_name) - return getattr(obj, backend_name)(device_index) + device = _validate_device(location, backend_name) + return obj.to(device=device) register_package(10, _cpu_tag, _cpu_deserialize) -register_package(20, _cuda_tag, _cuda_deserialize) +register_package(20, functools.partial(_backend_tag, 'cuda'), functools.partial(_deserialize, 'cuda')) register_package(21, _mps_tag, _mps_deserialize) register_package(22, _meta_tag, _meta_deserialize) -register_package(23, _privateuse1_tag, _privateuse1_deserialize) -register_package(24, _hpu_tag, _hpu_deserialize) - +register_package(23, functools.partial(_backend_tag, 'privateuse1'), functools.partial(_deserialize, 'privateuse1')) +register_package(24, functools.partial(_backend_tag, 'hpu'), functools.partial(_deserialize, 'hpu')) +register_package(25, functools.partial(_backend_tag, 'xpu'), functools.partial(_deserialize, 'xpu')) def location_tag(storage: Union[Storage, torch.storage.TypedStorage, torch.UntypedStorage]): for _, tagger, _ in _package_registry: @@ -1014,7 +976,9 @@ def load( UNSAFE_MESSAGE = ( "Weights only load failed. Re-running `torch.load` with `weights_only` set to `False`" " will likely succeed, but it can result in arbitrary code execution." - "Do it only if you get the file from a trusted source. WeightsUnpickler error: " + " Do it only if you get the file from a trusted source. Alternatively, to load" + " with `weights_only` please check the recommended steps in the following error message." + " WeightsUnpickler error: " ) # Add ability to force safe only weight loads via environment variable if os.getenv("TORCH_FORCE_WEIGHTS_ONLY_LOAD", "0").lower() in ['1', 'y', 'yes', 'true']: diff --git a/torch/storage.py b/torch/storage.py index 306dd99a93ad..32070783f494 100644 --- a/torch/storage.py +++ b/torch/storage.py @@ -1,7 +1,7 @@ import io import torch -from ._utils import _type, _cuda, _hpu +from ._utils import _type, _to from torch.types import Storage from typing import cast, Any, Dict as _Dict, Optional as _Optional, TypeVar, Type, Union import copy @@ -38,8 +38,37 @@ def size(self) -> int: return self.nbytes() def type(self, dtype: _Optional[str] = None, non_blocking: bool = False) -> T: ... # type: ignore[empty-body, misc, type-var] # noqa: E704 - def cuda(self, device=None, non_blocking=False, **kwargs) -> T: ... # type: ignore[empty-body, misc, type-var] # noqa: E704 - def hpu(self, device=None, non_blocking=False, **kwargs) -> T: ... # type: ignore[empty-body, misc, type-var] # noqa: E704 + + def cuda(self, device=None, non_blocking=False) -> T: # type: ignore[type-var] # noqa: E704 + """Returns a copy of this object in CUDA memory. + + If this object is already in CUDA memory and on the correct device, then + no copy is performed and the original object is returned. + + Args: + device (int): The destination GPU id. Defaults to the current device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. Otherwise, + the argument has no effect. + """ + device2 = torch.device('cuda', device) if device else torch.device('cuda') + return self.to(device=device2, non_blocking=non_blocking) + + def hpu(self, device=None, non_blocking=False) -> T: # type: ignore[type-var] # noqa: E704 + """Returns a copy of this object in HPU memory. 
+ + If this object is already in HPU memory and on the correct device, then + no copy is performed and the original object is returned. + + Args: + device (int): The destination HPU id. Defaults to the current device. + non_blocking (bool): If ``True`` and the source is in pinned memory, + the copy will be asynchronous with respect to the host. Otherwise, + the argument has no effect. + """ + device2 = torch.device('hpu', device) if device else torch.device('hpu') + return self.to(device=device2, non_blocking=non_blocking) + def element_size(self) -> int: ... # type: ignore[empty-body, type-var] # noqa: E704 def get_device(self) -> int: @@ -153,6 +182,9 @@ def _to(self, dtype): storage = storage.clone() return storage + def to(self, *, device: torch.device, non_blocking: bool = False) -> T: # type: ignore[type-var] # noqa: E704 + return _to(self, device, non_blocking) + def double(self): """Casts this storage to double type.""" return self._to(torch.double) @@ -382,8 +414,6 @@ def _load_from_bytes(b): _StorageBase.type = _type # type: ignore[assignment] -_StorageBase.cuda = _cuda # type: ignore[assignment] -_StorageBase.hpu = _hpu # type: ignore[assignment] @lru_cache(maxsize=None) @@ -812,20 +842,27 @@ def type(self, dtype: _Optional[str] = None, non_blocking: bool = False) -> Unio else: return self._untyped_storage.type(dtype, non_blocking) - def cuda(self, device=None, non_blocking=False, **kwargs) -> T: # type: ignore[misc, type-var] + def cuda(self, device=None, non_blocking=False) -> T: # type: ignore[misc, type-var] _warn_typed_storage_removal() if self.dtype in [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8]: raise RuntimeError("Cannot create CUDA storage with quantized dtype") - cuda_storage: torch.UntypedStorage = self._untyped_storage.cuda(device, non_blocking, **kwargs) + cuda_storage: torch.UntypedStorage = self._untyped_storage.cuda(device, non_blocking) return self._new_wrapped_storage(cuda_storage) - def hpu(self, device=None, non_blocking=False, **kwargs) -> T: # type: ignore[misc, type-var] + def hpu(self, device=None, non_blocking=False) -> T: # type: ignore[misc, type-var] _warn_typed_storage_removal() if self.dtype in [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8]: raise RuntimeError("Cannot create HPU storage with quantized dtype") - hpu_storage: torch.UntypedStorage = self._untyped_storage.hpu(device, non_blocking, **kwargs) + hpu_storage: torch.UntypedStorage = self._untyped_storage.hpu(device, non_blocking) return self._new_wrapped_storage(hpu_storage) + def to(self, *, device: torch.device, non_blocking: bool = False) -> T: # type: ignore[type-var] + _warn_typed_storage_removal() + if self.dtype in [torch.quint8, torch.quint4x2, torch.quint2x4, torch.qint32, torch.qint8]: + raise RuntimeError(f"Cannot create {device.type.upper()} storage with quantized dtype") + to_storage: torch.UntypedStorage = self._untyped_storage.to(device=device, non_blocking=non_blocking) + return self._new_wrapped_storage(to_storage) + def element_size(self): _warn_typed_storage_removal() return self._element_size() @@ -1209,8 +1246,9 @@ def _get_legacy_storage_class(self): return None TypedStorage.type.__doc__ = _type.__doc__ -TypedStorage.cuda.__doc__ = _cuda.__doc__ -TypedStorage.hpu.__doc__ = _hpu.__doc__ +TypedStorage.cuda.__doc__ = _StorageBase.cuda.__doc__ +TypedStorage.hpu.__doc__ = _StorageBase.hpu.__doc__ +TypedStorage.to.__doc__ = _to.__doc__ class _LegacyStorageMeta(type): dtype: torch.dtype diff --git 
a/torch/testing/_internal/common_optimizers.py b/torch/testing/_internal/common_optimizers.py index 61396b622630..c81efb093cd8 100644 --- a/torch/testing/_internal/common_optimizers.py +++ b/torch/testing/_internal/common_optimizers.py @@ -590,6 +590,7 @@ def optim_inputs_func_asgd(device, dtype=None): ] return [ OptimizerInput(params=None, kwargs={}, desc="default"), + OptimizerInput(params=None, kwargs={"lambd": 0.1}, desc="non-default lambd"), OptimizerInput(params=None, kwargs={"lr": 0.02}, desc="non-default lr"), OptimizerInput(params=None, kwargs={"t0": 100}, desc="t0"), OptimizerInput(params=None, kwargs={"maximize": True}, desc="maximize"), @@ -1146,7 +1147,8 @@ def _get_optim_inputs_including_global_cliquey_kwargs( Adagrad, optim_inputs_func=optim_inputs_func_adagrad, optim_error_inputs_func=optim_error_inputs_func_adagrad, - supported_impls=("foreach", "differentiable"), + supported_impls=("foreach", "differentiable", "fused"), + supports_fused_on=("cpu",), supports_sparse=True, metadata_for_sparse=( {"lr": 0.1, "weight_decay": 0, "lr_decay": 0}, @@ -1155,6 +1157,23 @@ def _get_optim_inputs_including_global_cliquey_kwargs( lambda opt: ReduceLROnPlateau(opt, threshold=1e-4), ], ), + decorators=( + DecorateInfo( + # Note on tolerances: + # the difference comes from the fact that the non-fused kernel has + # more dtype cast operations. We have another test, test_fused_cpu_matches_cuda, + # to make sure there are no discrepancies between the cuda fused kernel + # and the cpu fused kernel + toleranceOverride( + { + torch.bfloat16: tol(atol=5e-3, rtol=5e-3), + torch.float16: tol(atol=5e-3, rtol=5e-3), + } + ), + "TestOptimRenewed", + "test_fused_matches_forloop", + ), + ), skips=( DecorateInfo( skipIfMps, # addcdiv doesn't work for non-contiguous, see #118115 @@ -1432,6 +1451,13 @@ def _get_optim_inputs_including_global_cliquey_kwargs( "TestOptimRenewed", "test_defaults_changed_to_foreach", ), + DecorateInfo( + unittest.skip( + "ASGD internally changes the weights even with zero grad" + ), + "TestOptimRenewed", + "test_step_is_noop_for_zero_grads", + ), ), ), OptimizerInfo( diff --git a/torch/testing/_internal/custom_op_db.py b/torch/testing/_internal/custom_op_db.py index 3177fb9c8bb5..ee170cc36058 100644 --- a/torch/testing/_internal/custom_op_db.py +++ b/torch/testing/_internal/custom_op_db.py @@ -458,7 +458,7 @@ def source1_fake(x): lib.define("source2(Tensor x) -> Tensor") -@torch.library.impl_abstract("_torch_testing::source2", lib=lib) +@torch.library.register_fake("_torch_testing::source2", lib=lib) def _(x): return x.clone() @@ -467,7 +467,7 @@ def _(x): def source3_fake(x): return x.clone() -torch.library.impl_abstract("_torch_testing::source3", source3_fake, lib=lib) +torch.library.register_fake("_torch_testing::source3", source3_fake, lib=lib) @torch.library.custom_op("_torch_testing::source4", mutates_args=()) diff --git a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py index bb32baf66890..1012d065e7ad 100644 --- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py +++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py @@ -35,7 +35,6 @@ DEVICE_TYPE = ( "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu" ) -PG_BACKEND = "nccl" if DEVICE_TYPE == "cuda" else "gloo" NUM_DEVICES = 4 @@ -298,10 +297,11 @@ def world_size(self) -> int: @property def backend(self) -> str: - return PG_BACKEND + backend = "nccl" if self.device_type == "cuda" else "gloo" + return
a/torch/testing/_internal/distributed/_tensor/common_dtensor.py b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
index bb32baf66890..1012d065e7ad 100644
--- a/torch/testing/_internal/distributed/_tensor/common_dtensor.py
+++ b/torch/testing/_internal/distributed/_tensor/common_dtensor.py
@@ -35,7 +35,6 @@
 DEVICE_TYPE = (
     "cuda" if torch.cuda.is_available() and torch.cuda.device_count() > 1 else "cpu"
 )
-PG_BACKEND = "nccl" if DEVICE_TYPE == "cuda" else "gloo"

 NUM_DEVICES = 4

@@ -298,10 +297,11 @@ def world_size(self) -> int:

     @property
     def backend(self) -> str:
-        return PG_BACKEND
+        backend = "nccl" if self.device_type == "cuda" else "gloo"
+        return backend

     def build_device_mesh(self) -> DeviceMesh:
-        return DeviceMesh(DEVICE_TYPE, list(range(self.world_size)))
+        return DeviceMesh(self.device_type, list(range(self.world_size)))

     def init_pg(self) -> None:
         if "nccl" in self.backend and torch.cuda.device_count() < self.world_size:
@@ -359,11 +359,11 @@ def with_comms(func: TestFunc) -> TestFunc:
     def wrapper(
         self, *args: Tuple[object], **kwargs: Dict[str, Any]  # type: ignore[misc]
     ) -> None:
-        # if backend not specified, and cuda available, then use nccl, else gloo
-        if torch.cuda.is_available() and torch.cuda.device_count() >= self.world_size:
-            self.device_type = "cuda"
-        else:
+        # if there are enough GPUs, use the GPU device type; otherwise fall back to CPU
+        if not torch.cuda.is_available() or torch.cuda.device_count() < self.world_size:
             self.device_type = "cpu"
+        else:
+            self.device_type = DEVICE_TYPE

         self.init_pg()
         func(self, *args, **kwargs)  # type: ignore[misc]
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 180513413093..b9873b9950fa 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -48,6 +48,10 @@
     _verify_param_shape_across_processes,
     _sync_module_states,
 )
+from torch.profiler import (
+    ExecutionTraceObserver,
+    ProfilerActivity,
+)
 from torch.nn.parallel import DistributedDataParallel
 from torch.nn.parallel.distributed import _dump_DDP_relevant_env_vars, _MixedPrecision
@@ -6867,7 +6871,20 @@ def test_ddp_grad_div_uneven_inputs(self):
             net.zero_grad()
             torch.cuda.synchronize(device=self.rank)

-    def _test_ddp_profiling(self, profiler_ctx):
+    def _test_ddp_profiling(self, profiler_ctx, profiler_ctx2=None):
+        """Runs DDP-based model training and captures profiles.
+
+        This test does two profiler runs:
+        1. An initial basic run to check that profiler events are correctly captured.
+        2. A second profiling pass after running some iterations of DDP, to check
+           the robustness of thread-local state.
+
+        Args:
+            profiler_ctx: Profiler context manager for pass 1.
+            profiler_ctx2: Profiler context manager for pass 2. Can be left as
+                None, in which case a deepcopy of profiler_ctx is used.
+        Returns:
+            prof: Instantiated profiler object that can be used for post-analysis.
+        """
         batch = 3
         dim = 10
         num_iters = 6
@@ -6878,7 +6895,8 @@ def _test_ddp_profiling(self, profiler_ctx):
             model.cuda(self.rank),
             device_ids=[self.rank],
         )
-        profiler_ctx_copy = copy.deepcopy(profiler_ctx)
+        if profiler_ctx2 is None:
+            profiler_ctx2 = copy.deepcopy(profiler_ctx)

         with profiler_ctx as prof:
             for i in range(num_iters):
@@ -6913,7 +6931,7 @@ def _test_ddp_profiling(self, profiler_ctx):
             loss = net(inp).sum()
             loss.backward()
         # Now enable the profiler.
-        with profiler_ctx_copy as prof:
+        with profiler_ctx2 as prof:
             loss = net(inp).sum()
             loss.backward()

@@ -6971,6 +6989,90 @@ def test_ddp_profiling_torch_profiler(self):
         self.assertEqual(a1["Out msg nelems"], 1, msg=f"{a1}")
         self.assertEqual(a1["dtype"], "Int", msg=f"{a1}")

+    def _validate_execution_trace_nccl(self, et_file: str) -> None:
+        """The torch profiler includes NCCL metadata in an inserted operator
+        called "record_param_comms"; we test for basic fields in these nodes
+        in the Execution Trace.
+        """
+        with open(et_file) as f:
+            et = json.load(f)
+
+        nccl_meta_nodes = [n for n in et["nodes"] if n["name"] == "record_param_comms"]
+        self.assertEqual(len(nccl_meta_nodes), 3)
+        per_coll_meta = defaultdict(list)
+
+        # Sanity check NCCL metadata nodes
+        for n in nccl_meta_nodes:
+            attrs_list = n.get("attrs", [])
+            self.assertGreater(len(attrs_list), 0)
+            attrs = {a["name"]: a["value"] for a in attrs_list}
+
+            collname = attrs.get("collective_name", "")
+            self.assertNotEqual(collname, "")
+            self.assertNotEqual(attrs.get("dtype", ""), "")
+
+            per_coll_meta[collname].append(attrs)
+            if collname in {"wait"}:
+                continue
+
+            self.assertEqual(attrs["pg_name"], "0")  # yes, this is a string
+            self.assertEqual(attrs["pg_desc"], "default_pg")
+            self.assertEqual(attrs["pg_size"], 2)
+
+            self.assertGreaterEqual(attrs.get("in_msg_nelems", -1), 0)
+            self.assertGreaterEqual(attrs.get("out_msg_nelems", -1), 0)
+            self.assertTrue("in_split_size" in attrs.keys())
+            self.assertTrue("out_split_size" in attrs.keys())
+            self.assertEqual(attrs.get("global_rank_start", -1), 0)
+            self.assertEqual(attrs.get("global_rank_stride", -1), 1)
+
+        # print(per_coll_meta)
+        self.assertEqual(len(per_coll_meta["allreduce"]), 2)
+        self.assertEqual(len(per_coll_meta["wait"]), 1)
+
+        # check allreduce message sizes
+        a0 = per_coll_meta["allreduce"][0]
+        self.assertEqual(a0["out_msg_nelems"], 100, msg=f"{a0}")
+        self.assertEqual(a0["dtype"], "Float", msg=f"{a0}")
+        a1 = per_coll_meta["allreduce"][1]
+        self.assertEqual(a1["out_msg_nelems"], 1, msg=f"{a1}")
+        self.assertEqual(a1["dtype"], "Int", msg=f"{a1}")
+
+
+    @require_backend_is_available(DistTestCases.backend_feature["gpu"])
+    @skip_if_lt_x_gpu(2)
+    @skip_but_pass_in_sandcastle_if(IS_FBCODE, "Kineto in fbcode code causes hang")
+    @skip_but_pass_in_sandcastle_if(
+        IS_MACOS or IS_WINDOWS,
+        "torch.profiler not enabled for mac/windows: https://github.com/pytorch/pytorch/pull/56124",
+    )
+    @unittest.skipIf(BACKEND != "nccl", "Tests nccl metadata primarily.")
+    def test_ddp_profiling_execution_trace(self):
+        self.assertEqual(dist.get_backend(), "nccl")
+        # Create a temp file to save execution trace data
+        fp = tempfile.NamedTemporaryFile("w+t", suffix=".et.json", delete=False)
+        fp.close()
+        et_file = fp.name
+
+        et = ExecutionTraceObserver().register_callback(et_file)
+
+        # first profiler context need not have ET
+        torch_profiler_ctx1 = torch.profiler.profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+        )
+        # collect ET in second profiler pass
+        torch_profiler_ctx2 = torch.profiler.profile(
+            activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
+            execution_trace_observer=et
+        )
+        prof = self._test_ddp_profiling(
+            profiler_ctx=torch_profiler_ctx1,
+            profiler_ctx2=torch_profiler_ctx2,
+        )
+
+        print(f"Execution trace saved at {fp.name}")
+        self._validate_execution_trace_nccl(et_file)
+
+
     @skip_if_lt_x_gpu(2)
     @skip_but_pass_in_sandcastle_if(
         BACKEND not in DistTestCases.backend_feature["ddp"],
diff --git a/torch/testing/_internal/hop_db.py b/torch/testing/_internal/hop_db.py
index 4772fb42a963..1602c1ef6562 100644
--- a/torch/testing/_internal/hop_db.py
+++ b/torch/testing/_internal/hop_db.py
@@ -82,7 +82,7 @@ def foo_impl_cuda(x, z):
     return x, z, x + z

-@torch.library.impl_abstract("testlib::mutating_custom_op")
+@torch.library.register_fake("testlib::mutating_custom_op")
 def foo_impl_abstract(x, z):
     return x, z, x + z

@@ -118,9 +118,9 @@ def score_mod(score, b, h, m, n):
         return score + h

     yield SampleInput(
-        make_arg(2, 2, 64, 8, low=0.1, high=2),
-        make_arg(2, 2, 64, 8, low=0.1, high=2),
-        make_arg(2, 2, 64, 8, low=0.1, high=2),
+        make_arg(2, 2, 128, 8, low=0.1, high=2),
+        make_arg(2, 2, 128, 8, low=0.1, high=2),
+        make_arg(2, 2, 128, 8, low=0.1, high=2),
         score_mod,
     )

diff --git a/torch/testing/_internal/logging_tensor.py b/torch/testing/_internal/logging_tensor.py
index 5ddd53747440..8b7faf45b3c3 100644
--- a/torch/testing/_internal/logging_tensor.py
+++ b/torch/testing/_internal/logging_tensor.py
@@ -11,6 +11,7 @@
 import functools
 from torch._C._profiler import gather_traceback, symbolize_tracebacks

+logger = logging.getLogger("LoggingTensor")

 _dtype_abbrs = {
     torch.bfloat16: "bf16",
@@ -135,8 +136,8 @@ def emit(self, record):
         if self.tracebacks_list is not None:
             self.tracebacks_list.append(record.traceback)

-def log_input(name: str, var: object):
-    logging.getLogger("LoggingTensor").info("input", (name,), {}, var)  # noqa: PLE1205
+def log_input(name: str, var: object) -> None:
+    logger.info("input", (name,), {}, var)  # noqa: PLE1205

 class GatherTraceback(logging.Filter):
     def __init__(self, python=True, script=True, cpp=False):
@@ -151,7 +152,6 @@ def filter(self, record):
 @contextlib.contextmanager
 def capture_logs(is_mode=False, python_tb=False, script_tb=False, cpp_tb=False) -> Iterator[List[str]]:
     collect_traceback = python_tb or script_tb or cpp_tb
-    logger = logging.getLogger("LoggingTensor")
     log_list: List[str] = []
     tracebacks_list: List[str] = []
     handler = LoggingTensorHandler(
diff --git a/torch/testing/_internal/opinfo/definitions/sparse.py b/torch/testing/_internal/opinfo/definitions/sparse.py
index e6f0ad0e6f51..3e1f816d9f73 100644
--- a/torch/testing/_internal/opinfo/definitions/sparse.py
+++ b/torch/testing/_internal/opinfo/definitions/sparse.py
@@ -25,7 +25,7 @@ def _check_fail(sample):
         except sample.error_type:
             pass
         except Exception as msg:
-            raise AssertionError(  # noqa: TRY200
+            raise AssertionError(  # noqa: B904
                 f"{op_info.name} on {sample.sample_input=} expected exception "
                 f"{sample.error_type}: {sample.error_regex}, got {type(msg).__name__}: {msg}"
             )
@@ -39,7 +39,7 @@ def _check_success(sample):
     try:
         op_info(sample.input, *sample.args, **sample.kwargs)
     except Exception as msg:
-        raise AssertionError(  # noqa: TRY200
+        raise AssertionError(  # noqa: B904
            f"{op_info.name} on {sample=} expected to succeed "
            f", got {type(msg).__name__}: {msg}"
        )
diff --git a/torch/utils/_foreach_utils.py b/torch/utils/_foreach_utils.py
index 840df0432b53..6f8a9b5b7e23 100644
--- a/torch/utils/_foreach_utils.py
+++ b/torch/utils/_foreach_utils.py
@@ -15,6 +15,8 @@ def _get_fused_kernels_supported_devices() -> List[str]:
 TensorListList: TypeAlias = List[List[Optional[Tensor]]]
 Indices: TypeAlias = List[int]

+_foreach_supported_types = [torch.Tensor]
+

 # This util function splits tensors into groups by device and dtype, which is useful before sending
 # tensors off to a foreach implementation, which requires tensors to be on one device and dtype.
@@ -44,4 +46,4 @@ def _device_has_foreach_support(device: torch.device) -> bool:

 def _has_foreach_support(tensors: List[Tensor], device: torch.device) -> bool:
-    return _device_has_foreach_support(device) and all(t is None or type(t) == torch.Tensor for t in tensors)
+    return _device_has_foreach_support(device) and all(t is None or type(t) in _foreach_supported_types for t in tensors)
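The `_foreach_supported_types` hook above lets `Tensor` subclasses opt in to the foreach fast path. An illustrative sketch against this private API (the subclass and the registration are hypothetical, shown only to make the mechanism concrete):

```python
import torch
from torch.utils._foreach_utils import (
    _foreach_supported_types,
    _has_foreach_support,
)

# Hypothetical subclass opting in to the foreach fast path (private API).
class MyTensor(torch.Tensor):
    pass

_foreach_supported_types.append(MyTensor)
ts = [torch.randn(2), torch.randn(2).as_subclass(MyTensor)]
# True only when the device has foreach kernels AND every element's exact
# type is registered in _foreach_supported_types.
print(_has_foreach_support(ts, device=torch.device("cpu")))
```
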
diff --git a/torch/utils/_sympy/functions.py b/torch/utils/_sympy/functions.py
index 427333b07c16..e8c4a57d84c8 100644
--- a/torch/utils/_sympy/functions.py
+++ b/torch/utils/_sympy/functions.py
@@ -1,11 +1,21 @@
+import math
+
 import sympy
 from sympy import S
 from sympy.core.logic import fuzzy_and, fuzzy_not, fuzzy_or

-import math

 __all__ = [
-    "FloorDiv", "ModularIndexing", "CleanDiv", "CeilDiv", "Pow", "TrueDiv",
-    "LShift", "RShift", "IsNonOverlappingAndDenseIndicator", "Round", "RoundDecimal",
+    "FloorDiv",
+    "ModularIndexing",
+    "CleanDiv",
+    "CeilDiv",
+    "Pow",
+    "TrueDiv",
+    "LShift",
+    "RShift",
+    "IsNonOverlappingAndDenseIndicator",
+    "Round",
+    "RoundDecimal",
 ]
@@ -21,6 +31,7 @@ class FloorDiv(sympy.Function):
     1. We can use divisibility guards to simplify FloorDiv(a, b) to a / b.
     2. Printing out the expression is nicer (compared to say, representing a//b as (a - a % b) / b)
     """
+
     nargs = (2,)
     precedence = 50  # precedence of mul  # noqa: F811
@@ -53,11 +64,14 @@ def _eval_is_integer(self):
     @classmethod
     def eval(cls, base, divisor):
         def check_supported_type(x):
-            if (x.is_integer is False and x.is_real is False and x.is_complex) or x.is_Boolean:
+            if (
+                x.is_integer is False and x.is_real is False and x.is_complex
+            ) or x.is_Boolean:
                 raise TypeError(
                     f"unsupported operand type(s) for //: "
                     f"'{type(base).__name__}' and '{type(divisor).__name__}'"
-                    f", expected integer or real")
+                    f", expected integer or real"
+                )

         check_supported_type(base)
         check_supported_type(divisor)
@@ -77,7 +91,9 @@ def check_supported_type(x):
             return sympy.Mul(base, -1)
         if isinstance(base, sympy.Integer) and isinstance(divisor, sympy.Integer):
             return base // divisor
-        if isinstance(base, (sympy.Integer, sympy.Float)) and isinstance(divisor, (sympy.Integer, sympy.Float)):
+        if isinstance(base, (sympy.Integer, sympy.Float)) and isinstance(
+            divisor, (sympy.Integer, sympy.Float)
+        ):
             return sympy.floor(base / divisor)
         if isinstance(base, FloorDiv):
             return FloorDiv(base.args[0], base.args[1] * divisor)
@@ -125,7 +141,9 @@ def eval(cls, base, divisor, modulus):
                 gcd = sympy.gcd(base, divisor)
                 if gcd != 1:
                     return ModularIndexing(
-                        sympy.simplify(base / gcd), sympy.simplify(divisor / gcd), modulus
+                        sympy.simplify(base / gcd),
+                        sympy.simplify(divisor / gcd),
+                        modulus,
                     )
         except sympy.PolynomialError:
             pass  # https://github.com/pytorch/pytorch/issues/108276
@@ -178,6 +196,7 @@ def eval(cls, c, p, q):
         elif c == sympy.false:
             return q

+
 class Mod(sympy.Function):
     """
     We maintain this so that we avoid SymPy correctness issues, such as:
@@ -263,16 +282,17 @@ class LShift(sympy.Function):
     @classmethod
     def eval(cls, base, shift):
         if shift < 0:
-            raise ValueError('negative shift count')
-        return base * 2 ** shift
+            raise ValueError("negative shift count")
+        return base * 2**shift


 class RShift(sympy.Function):
     @classmethod
     def eval(cls, base, shift):
         if shift < 0:
-            raise ValueError('negative shift count')
-        return base // 2 ** shift
+            raise ValueError("negative shift count")
+        return base // 2**shift
+

 # Overloaded to be compatible with regular Python.
 # https://github.com/pytorch/pytorch/issues/90900
@@ -284,7 +304,8 @@ def eval(cls, base, exp):
         elif base.is_zero and exp < 0:
             raise ZeroDivisionError(f"{base} cannot be raised to a negative power")
         else:
-            return base ** exp
+            return base**exp
+

 # Overloaded to be compatible with regular Python.
 # https://github.com/pytorch/pytorch/issues/90900
@@ -317,13 +338,14 @@ def eval(cls, *args):
         # in dim 0.
         if all(isinstance(a, sympy.Integer) for a in args):
             # sym_node imported in torch.__init__. Local import to avoid an import cycle
-            from torch.fx.experimental.symbolic_shapes import eval_is_non_overlapping_and_dense
+            from torch.fx.experimental.symbolic_shapes import (
+                eval_is_non_overlapping_and_dense,
+            )

             size_args = args[0:dim]
             stride_args = args[dim:]
             return eval_is_non_overlapping_and_dense(
-                [int(a) for a in size_args],
-                [int(a) for a in stride_args]
+                [int(a) for a in size_args], [int(a) for a in stride_args]
             )

         return None
@@ -361,7 +383,11 @@ def eval(cls, number, ndigits):
         if number.is_integer and ndigits >= 0:
             return number
         elif isinstance(number, sympy.Number) and isinstance(ndigits, sympy.Integer):
-            value_type, output_type = (int, sympy.Integer) if isinstance(number, sympy.Integer) else (float, sympy.Float)
+            value_type, output_type = (
+                (int, sympy.Integer)
+                if isinstance(number, sympy.Integer)
+                else (float, sympy.Float)
+            )
             return output_type(round(value_type(number), int(ndigits)))
@@ -401,6 +427,7 @@ def eval(cls, a):

     return OpaqueUnaryFn

+
 # Keep in sync with math_op_names in torch/fx/experimental/sym_node.py
 OpaqueUnaryFn_sqrt = make_opaque_unary_fn("sqrt")
 OpaqueUnaryFn_cos = make_opaque_unary_fn("cos")
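The `_sympy/functions.py` hunks above are formatting-only, but the `FloorDiv` semantics documented in its docstring are easy to check interactively. A small worked example, based on the `eval` branches visible in this diff (exact simplifications may vary by version):

```python
import sympy
from torch.utils._sympy.functions import FloorDiv

# Integer constants fold through the explicit Integer branch of eval();
# a symbolic quotient stays an opaque FloorDiv node rather than being
# rewritten as (a - a % b) / b.
a, b = sympy.symbols("a b", integer=True, positive=True)
print(FloorDiv(sympy.Integer(7), sympy.Integer(2)))  # 3
print(FloorDiv(a, b))                                # FloorDiv(a, b)
```
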
diff --git a/torch/utils/_sympy/value_ranges.py b/torch/utils/_sympy/value_ranges.py
index eae126b1b4dc..504fe757d4f2 100644
--- a/torch/utils/_sympy/value_ranges.py
+++ b/torch/utils/_sympy/value_ranges.py
@@ -137,8 +137,8 @@ def __init__(self, lower: AllIn, upper: AllIn) -> None:
         try:
             if not sympy_generic_le(lower, upper):
                 raise ValueRangeError(f"Invalid ranges [{lower}:{upper}]")
-        except TypeError:
-            raise TypeError(f"Could not compare {lower} <= {upper}")  # noqa: TRY200
+        except TypeError as e:
+            raise TypeError(f"Could not compare {lower} <= {upper}") from e
         # Because this is a frozen class
         object.__setattr__(self, "lower", lower)
         object.__setattr__(self, "upper", upper)
diff --git a/torch/utils/_traceback.py b/torch/utils/_traceback.py
index fa73b9f41cd6..9f4d04c55105 100644
--- a/torch/utils/_traceback.py
+++ b/torch/utils/_traceback.py
@@ -128,7 +128,7 @@ def report_compile_source_on_error():
                 tb.tb_next = tb_next
                 tb_next = tb

-            raise exc.with_traceback(tb_next)  # noqa: TRY200
+            raise exc.with_traceback(tb_next)  # noqa: B904

 def shorten_filename(fn, *, base=None):
     """Shorten a source filepath, with the assumption that torch/ subdirectories don't need to be shown to user."""
diff --git a/torch/utils/module_tracker.py b/torch/utils/module_tracker.py
index b79d1432bb1b..f2d83fb36f92 100644
--- a/torch/utils/module_tracker.py
+++ b/torch/utils/module_tracker.py
@@ -10,6 +10,8 @@
 )
 from torch.utils._pytree import tree_flatten

+__all__ = ["ModuleTracker"]
+

 class ModuleTracker:
     """
diff --git a/torch/utils/tensorboard/writer.py b/torch/utils/tensorboard/writer.py
index cd281bc0d3fc..c646ce0c0c11 100644
--- a/torch/utils/tensorboard/writer.py
+++ b/torch/utils/tensorboard/writer.py
@@ -916,7 +916,7 @@ def add_embedding(
                 "warning: Embedding dir exists, did you set global_step for add_embedding()?"
             )
         else:
-            raise FileExistsError(
+            raise NotADirectoryError(
                 f"Path: `{save_path}` exists, but is a file. Cannot proceed."
             )
     else:
diff --git a/torch/xpu/streams.py b/torch/xpu/streams.py
index 2c3c3a63d58b..f4e35a376e7c 100644
--- a/torch/xpu/streams.py
+++ b/torch/xpu/streams.py
@@ -2,6 +2,7 @@
 import torch
 from torch._streambase import _EventBase, _StreamBase
+
 from .._utils import _dummy_type
@@ -34,7 +35,7 @@ def __new__(cls, device=None, priority=0, **kwargs):
         with torch.xpu.device(device):
             return super().__new__(cls, priority=priority, **kwargs)

-    def wait_event(self, event):
+    def wait_event(self, event) -> None:
         r"""Make all future work submitted to the stream wait for an event.

         Args:
@@ -42,7 +43,7 @@ def wait_event(self, event):
         """
         event.wait(self)

-    def wait_stream(self, stream):
+    def wait_stream(self, stream) -> None:
         r"""Synchronize with another stream.

         All future work submitted to this stream will wait until all kernels
@@ -68,7 +69,7 @@ def record_event(self, event=None):
             event.record(self)
         return event

-    def query(self):
+    def query(self) -> bool:
         r"""Check if all the work submitted has been completed.

         Returns:
@@ -76,7 +77,7 @@ def query(self):
         """
         return super().query()

-    def synchronize(self):
+    def synchronize(self) -> None:
         r"""Wait for all the kernels in this stream to complete."""
         super().synchronize()

@@ -114,7 +115,7 @@ class Event(torch._C._XpuEventBase, _EventBase):
     def __new__(cls, enable_timing=False):
         return super().__new__(cls, enable_timing=enable_timing)

-    def record(self, stream=None):
+    def record(self, stream=None) -> None:
         r"""Record the event in a given stream.

         Uses ``torch.xpu.current_stream()`` if no stream is specified. The
@@ -124,7 +125,7 @@ def record(self, stream=None):
             stream = torch.xpu.current_stream()
         super().record(stream)

-    def wait(self, stream=None):
+    def wait(self, stream=None) -> None:
         r"""Make all future work submitted to the given stream wait for this event.

         Use ``torch.xpu.current_stream()`` if no stream is specified.
@@ -133,7 +134,7 @@ def wait(self, stream=None):
             stream = torch.xpu.current_stream()
         super().wait(stream)

-    def query(self):
+    def query(self) -> bool:
         r"""Check if all work currently captured by event has completed.

         Returns:
@@ -150,7 +151,7 @@ def elapsed_time(self, end_event):
         """
         return super().elapsed_time(end_event)

-    def synchronize(self):
+    def synchronize(self) -> None:
         r"""Wait for the event to complete.

         Waits until the completion of all work currently captured in this event.
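A brief usage sketch of the newly annotated XPU stream/event API (guarded, since it requires an XPU-enabled build; the return-type annotations added above match what these calls yield):

```python
import torch

# Stream/event round trip on XPU; skipped entirely on builds without XPU.
if torch.xpu.is_available():
    s = torch.xpu.Stream()
    e = torch.xpu.Event(enable_timing=False)
    with torch.xpu.stream(s):
        x = torch.ones(4, device="xpu") * 2
    e.record(s)       # -> None, per the new annotations
    e.synchronize()   # blocks until the event completes
    assert e.query()  # -> bool
```
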
diff --git a/torchgen/aoti/fallback_ops.py b/torchgen/aoti/fallback_ops.py
index aba595e14192..4a300c3cc301 100644
--- a/torchgen/aoti/fallback_ops.py
+++ b/torchgen/aoti/fallback_ops.py
@@ -25,6 +25,8 @@
     "aten.avg_pool2d.default",
     "aten.avg_pool3d_backward.default",
     "aten.avg_pool3d.default",
+    "aten.bernoulli_.float",
+    "aten.bernoulli_.Tensor",
     "aten.bmm.out",
     "aten.bucketize.Tensor",
     "aten.cat.default",
@@ -65,6 +67,7 @@
     "aten.histogram.bin_ct",
     "aten._histogramdd_bin_edges.default",
     "aten._histogramdd_from_bin_cts.default",
+    "aten.index_put.default",
     "aten.index_reduce.default",
     "aten.index.Tensor",
     "aten.kthvalue.default",
@@ -82,7 +85,9 @@
     "aten.mm.out",
     "aten.mode.default",
     "aten.mul.Scalar",
+    "aten.mul.Tensor",
     "aten.nanmedian.default",
+    "aten.native_dropout.default",
     "aten.nonzero.default",
     "aten.ormqr.default",
     "aten._pdist_backward.default",
@@ -93,6 +98,8 @@
     "aten.rand.default",
     "aten.rand.generator",
     "aten.randint.default",
+    "aten.randint.generator",
+    "aten.randint.low_out",
     "aten.randn.default",
     "aten.randn.generator",
     "aten.randperm.default",
@@ -110,9 +117,11 @@
     "aten._scaled_mm.default",
     "aten.scatter_reduce.two_out",
     "aten.scatter.src_out",
+    "aten.scatter.value_out",
     "aten.searchsorted.default",
     "aten._segment_reduce_backward.default",
     "aten.segment_reduce.default",
+    "aten.slice.Tensor",
     "aten.soft_margin_loss_backward.default",
     "aten.sort.default",
     "aten.sort.stable",
diff --git a/torchgen/gen.py b/torchgen/gen.py
index 28e46c3536e6..d715361146ea 100644
--- a/torchgen/gen.py
+++ b/torchgen/gen.py
@@ -49,8 +49,8 @@
 from torchgen.gen_aoti_c_shim import (
     gen_aoti_c_shim,
     gen_static_dispatch_backend_call_signature,
-    get_backend_index_for_aoti,
     get_fallback_op_name,
+    get_header_for_aoti,
 )
 from torchgen.gen_functionalization_type import (
     gen_functionalization_definition,
@@ -2353,54 +2353,28 @@ def operator_headers() -> List[str]:
             else:
                 raise AssertionError(f"unrecognized {dispatch_key} for ufunc")

-        structured_func_group_dict = {
-            f"{func_group.functional.namespace}.{func_group.functional.func.name}": func_group
-            for func_group in structured_native_functions
-        }
+        structured_func_group_dict = dict()
+        for func_group in structured_native_functions:
+            for func in func_group.functions():
+                if func.structured_delegate is not None:
+                    structured_func_group_dict[func.structured_delegate] = func_group
+                    break
+
         if dispatch_key in (DispatchKey.CPU, DispatchKey.CUDA):
             fallbacks = dict()
             for func in native_functions:
                 op_name = get_fallback_op_name(func)
                 if op_name in inductor_fallback_ops:
-                    fallbacks[op_name] = (
-                        func,
-                        structured_func_group_dict.get(
-                            f"{func.namespace}.{func.func.name.name}", None
-                        ),
-                    )
+                    fallbacks[op_name] = func
             fallback_native_functions = tuple(
                 value for _, value in sorted(fallbacks.items())
             )

-            def get_header(
-                func: NativeFunction,
-                func_group: Optional[NativeFunctionsGroup],
-            ) -> Optional[str]:
-                backend_index = get_backend_index_for_aoti(
-                    func, func_group, dispatch_key, backend_indices
-                )
-                return (
-                    None
-                    if backend_index is None
-                    else f"#include "
-                )
-
-            def headers_for_aoti() -> str:
-                headers = []
-                for func, func_group in fallback_native_functions:
-                    header = get_header(func, func_group)
-                    if header is not None:
-                        headers.append(header)
-                return "\n".join(sorted(set(headers)))
-
-            extra_headers = (
-                extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else ""
-            )
-
             # header files were checked in for ABI-compatibility checking
             header_file_name = f"c_shim_{dispatch_key.lower()}.h"
             new_header = gen_aoti_c_shim(
                 fallback_native_functions,
+                structured_func_group_dict,
                 dispatch_key,
                 backend_indices,
                 header=True,
@@ -2442,10 +2416,25 @@ def headers_for_aoti() -> str:
             )

             # cpp files are always generated on-the-fly
+            def headers_for_aoti() -> str:
+                headers = []
+                for func in fallback_native_functions:
+                    header = get_header_for_aoti(
+                        func, structured_func_group_dict, dispatch_key, backend_indices
+                    )
+                    if header is not None:
+                        headers.append(header)
+                return "\n".join(sorted(set(headers)))
+
+            extra_headers = (
+                extra_cuda_headers if is_cuda_dispatch_key(dispatch_key) else ""
+            )
+
             aoti_fm.write(
                 f"c_shim_{dispatch_key.lower()}.cpp",
                 lambda: gen_aoti_c_shim(
                     fallback_native_functions,
+                    structured_func_group_dict,
                     dispatch_key,
                     backend_indices,
                     header=False,
diff --git a/torchgen/gen_aoti_c_shim.py b/torchgen/gen_aoti_c_shim.py
index 0d31bd14a5e6..f123bc879cd3 100644
--- a/torchgen/gen_aoti_c_shim.py
+++ b/torchgen/gen_aoti_c_shim.py
@@ -16,6 +16,7 @@
     ListType,
     NativeFunction,
     NativeFunctionsGroup,
+    OperatorName,
     OptionalType,
     Type,
 )
@@ -33,6 +34,7 @@
     BaseTy.Layout: "int32_t",  # Represent enum as int
     BaseTy.MemoryFormat: "int32_t",  # Represent enum as int
     BaseTy.ScalarType: "int32_t",  # Represent enum as int
+    BaseTy.Generator: "AtenGeneratorHandle",
 }

 base_type_to_aten_type = {
@@ -47,6 +49,7 @@
     BaseTy.Layout: "c10::Layout",
     BaseTy.MemoryFormat: "c10::MemoryFormat",
     BaseTy.ScalarType: "c10::ScalarType",
+    BaseTy.Generator: "at::Generator",
 }

 base_type_to_callsite_expr = {
@@ -61,6 +64,7 @@
     BaseTy.Layout: "static_cast",
     BaseTy.MemoryFormat: "static_cast",
     BaseTy.ScalarType: "static_cast",
+    BaseTy.Generator: "*generator_handle_to_generator_pointer",
 }
@@ -88,7 +92,7 @@ def convert_arg_type_and_name(typ: Type, name: str) -> Tuple[List[str], List[str
             ],
         )
         else:
-            # TODO: BaseTy.Dimname, BaseTy.Generator, etc.
+            # TODO: BaseTy.Dimname, etc.
             raise NotImplementedError(f"TODO: add support for arg type {repr(typ)}")
     elif isinstance(typ, OptionalType):
         c_types, names, aten_types, callsite_exprs = convert_arg_type_and_name(
@@ -209,7 +213,11 @@ def convert_return(typ: BaseType, val: str) -> str:

     ret_pointer_can_be_null = False
     unambiguous_name = schema.name.unambiguous_name()
-    for name in ["_scaled_dot_product_flash_attention", "convolution_backward"]:
+    for name in [
+        "_scaled_dot_product_flash_attention",
+        "_scaled_dot_product_efficient_attention",
+        "convolution_backward",
+    ]:
         if name in unambiguous_name:
             ret_pointer_can_be_null = True
             break
@@ -241,18 +249,18 @@ def gen_declaration_and_definition(
         return declaration_definition_cache[(func_name, device, backend_call)]

     if schema.is_out_fn():
-        # out_variant has out arguments in the front, and it's ok to ignore return value
+        # out_variant has out arguments in the front, and it's ok to ignore return values
         # because C shim functions only return AOTITorchError
-        # Somehow at::native out-variant functions have out arguments in the back
         args, callsite_exprs = gen_arguments(
-            [*schema.arguments.flat_non_out, *schema.arguments.out]
-            if "at::native" in backend_call
-            else [*schema.arguments.out, *schema.arguments.flat_non_out],
+            [*schema.arguments.out, *schema.arguments.flat_non_out]
         )
         ret_assignments: List[str] = []
     else:
         args, callsite_exprs = gen_arguments(schema.arguments.flat_all)
-        ret_declarations, ret_assignments = gen_returns(schema)
+        # ignore return values for inplace ops
+        ret_declarations, ret_assignments = (
+            ([], []) if schema.name.name.inplace else gen_returns(schema)
+        )
         args.extend(ret_declarations)

     declaration = f"AOTITorchError aoti_torch_{device}_{func_name}({', '.join(args)})"
@@ -302,15 +310,17 @@ def gen_static_dispatch_backend_call(

 def get_backend_index_for_aoti(
     func: NativeFunction,
-    func_group: Optional[NativeFunctionsGroup],
+    func_group_mapping: Dict[OperatorName, NativeFunctionsGroup],
     dispatch_key: DispatchKey,
     backend_indices: Dict[DispatchKey, BackendIndex],
 ) -> Optional[BackendIndex]:
     backend_index = None
     if backend_indices[dispatch_key].has_kernel(func) or (
         func.structured_delegate is not None
-        and func_group is not None
-        and backend_indices[dispatch_key].has_kernel(func_group)
+        and func.structured_delegate in func_group_mapping
+        and backend_indices[dispatch_key].has_kernel(
+            func_group_mapping[func.structured_delegate]
+        )
     ):
         backend_index = backend_indices[dispatch_key]
     elif backend_indices[DispatchKey.CompositeExplicitAutograd].has_kernel(func):
@@ -327,6 +337,22 @@ def get_backend_index_for_aoti(
     return backend_index


+def get_header_for_aoti(
+    func: NativeFunction,
+    func_group_mapping: Dict[OperatorName, NativeFunctionsGroup],
+    dispatch_key: DispatchKey,
+    backend_indices: Dict[DispatchKey, BackendIndex],
+) -> Optional[str]:
+    backend_index = get_backend_index_for_aoti(
+        func, func_group_mapping, dispatch_key, backend_indices
+    )
+    return (
+        None
+        if backend_index is None
+        else f"#include "
+    )
+
+
 def get_fallback_op_name(func: NativeFunction) -> str:
     return (
         f"{func.namespace}.{func.func.name.name}.{func.func.name.overload_name}"
@@ -337,13 +363,13 @@ def get_fallback_op_name(func: NativeFunction) -> str:

 def gen_c_shim(
     func: NativeFunction,
-    func_group: Optional[NativeFunctionsGroup],
+    func_group_mapping: Dict[OperatorName, NativeFunctionsGroup],
     dispatch_key: DispatchKey,
     backend_indices: Dict[DispatchKey, BackendIndex],
     header: bool,
 ) -> Optional[str]:
     backend_index = get_backend_index_for_aoti(
-        func, func_group, dispatch_key, backend_indices
+        func, func_group_mapping, dispatch_key, backend_indices
     )
     if backend_index is None:
         return None
@@ -371,7 +397,7 @@ def gen_c_shim(

 @dataclass(frozen=True)
 class ShimGenerator:
-    func_group_mapping: Dict[str, Optional[NativeFunctionsGroup]]
+    func_group_mapping: Dict[OperatorName, NativeFunctionsGroup]
     dispatch_key: DispatchKey
     backend_indices: Dict[DispatchKey, BackendIndex]
     header: bool  # True to generate .h and False to generate .cpp
@@ -383,7 +409,7 @@ def __call__(
     ) -> Optional[str]:
         result = gen_c_shim(
             func,
-            self.func_group_mapping.get(get_fallback_op_name(func), None),
+            self.func_group_mapping,
             self.dispatch_key,
             self.backend_indices,
             self.header,
@@ -392,22 +418,20 @@ def __call__(

 def gen_aoti_c_shim(
-    native_functions: Sequence[Tuple[NativeFunction, Optional[NativeFunctionsGroup]]],
+    native_functions: Sequence[NativeFunction],
+    func_group_mapping: Dict[OperatorName, NativeFunctionsGroup],
     dispatch_key: DispatchKey,
     backend_indices: Dict[DispatchKey, BackendIndex],
     header: bool,
     includes: str = "",
 ) -> str:
-    func_group_mapping = {
-        get_fallback_op_name(func): func_group for func, func_group in native_functions
-    }
     body = "\n".join(
         list(
             mapMaybe(
                 ShimGenerator(
                     func_group_mapping, dispatch_key, backend_indices, header
                 ),
-                [func for func, _ in native_functions],
+                native_functions,
             )
         )
    )