diff --git a/.circleci/config.yml b/.circleci/config.yml
index e278df1efa2ca..a629765f5d420 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -2786,12 +2786,12 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
           resource_class: large
       - pytorch_linux_test:
-          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
           requires:
             - setup
             - pytorch_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:8fcf46ef-4a34-480b-a8ee-b0a30a4d3e59"
           resource_class: large
       - pytorch_linux_test:
           name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test
@@ -2802,15 +2802,6 @@ workflows:
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test
-          requires:
-            - setup
-            - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
-          build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
       - pytorch_linux_bazel_build:
           name: pytorch_bazel_build
           requires:
diff --git a/.circleci/verbatim-sources/workflows-pytorch-ge-config-tests.yml b/.circleci/verbatim-sources/workflows-pytorch-ge-config-tests.yml
index 58004904a3ffe..d5c9e7e98b9f7 100644
--- a/.circleci/verbatim-sources/workflows-pytorch-ge-config-tests.yml
+++ b/.circleci/verbatim-sources/workflows-pytorch-ge-config-tests.yml
@@ -7,12 +7,12 @@
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
           resource_class: large
       - pytorch_linux_test:
-          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_profiling_test
+          name: pytorch_linux_xenial_py3_6_gcc5_4_ge_config_simple_test
           requires:
             - setup
             - pytorch_linux_xenial_py3_6_gcc5_4_build
-          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_profiling-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
+          build_environment: "pytorch-linux-xenial-py3.6-gcc5.4-ge_config_simple-test"
+          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-py3.6-gcc5.4:8fcf46ef-4a34-480b-a8ee-b0a30a4d3e59"
           resource_class: large
       - pytorch_linux_test:
           name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_legacy_test
@@ -23,12 +23,3 @@
           docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
           use_cuda_docker_runtime: "1"
           resource_class: gpu.medium
-      - pytorch_linux_test:
-          name: pytorch_linux_xenial_cuda10_2_cudnn7_py3_ge_config_profiling_test
-          requires:
-            - setup
-            - pytorch_linux_xenial_cuda10_2_cudnn7_py3_gcc7_build
-          build_environment: "pytorch-linux-xenial-cuda10.1-cudnn7-ge_config_profiling-test"
-          docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7:9a3986fa-7ce7-4a36-a001-3c9bef9892e2"
-          use_cuda_docker_runtime: "1"
-          resource_class: gpu.medium
diff --git a/.jenkins/pytorch/macos-test.sh b/.jenkins/pytorch/macos-test.sh
index 64bdf42a01092..a883f0d107a12 100755
--- a/.jenkins/pytorch/macos-test.sh
+++ b/.jenkins/pytorch/macos-test.sh
@@ -63,7 +63,7 @@ test_python_all() {
   # Increase default limit on open file handles from 256 to 1024
   ulimit -n 1024
 
-  python test/run_test.py --verbose --exclude test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling test_jit_fuser_te test_tensorexpr --determine-from="$DETERMINE_FROM"
+  python test/run_test.py --verbose --exclude test_jit_simple test_jit_legacy test_jit_fuser_legacy --determine-from="$DETERMINE_FROM"
 
   assert_git_not_dirty
 }
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 48cc3611dacdb..c8e83257df6ef 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -143,8 +143,8 @@ test_python_nn() {
   assert_git_not_dirty
 }
 
-test_python_ge_config_profiling() {
-  time python test/run_test.py --include test_jit_profiling test_jit_fuser_profiling test_jit_fuser_te --verbose --determine-from="$DETERMINE_FROM"
+test_python_ge_config_simple() {
+  time python test/run_test.py --include test_jit_simple --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -154,7 +154,7 @@ test_python_ge_config_legacy() {
 }
 
 test_python_all_except_nn() {
-  time python test/run_test.py --exclude test_nn test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="$DETERMINE_FROM"
+  time python test/run_test.py --exclude test_nn test_jit_simple test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="$DETERMINE_FROM"
   assert_git_not_dirty
 }
 
@@ -294,8 +294,8 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xla* || "${JOB_BASE_NAME}" == *xla* ]]; then
   test_xla
 elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_legacy* || "${JOB_BASE_NAME}" == *ge_config_legacy* ]]; then
   test_python_ge_config_legacy
-elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_profiling* || "${JOB_BASE_NAME}" == *ge_config_profiling* ]]; then
-  test_python_ge_config_profiling
+elif [[ "${BUILD_ENVIRONMENT}" == *ge_config_simple* || "${JOB_BASE_NAME}" == *ge_config_simple* ]]; then
+  test_python_ge_config_simple
 elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
   # TODO: run some C++ tests
   echo "no-op at the moment"
diff --git a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat
index b0be5f4883b1c..042d116ff570c 100644
--- a/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat
+++ b/.jenkins/pytorch/win-test-helpers/test_python_all_except_nn.bat
@@ -1,3 +1,3 @@
 call %SCRIPT_HELPERS_DIR%\setup_pytorch_env.bat
-cd test && python run_test.py --exclude test_jit_profiling test_jit_legacy test_jit_fuser_legacy test_jit_fuser_profiling test_jit_fuser_te test_tensorexpr --verbose --determine-from="%1" && cd ..
+cd test && python run_test.py --exclude test_jit_legacy test_jit_fuser_legacy --verbose --determine-from="%1" && cd ..
 if ERRORLEVEL 1 exit /b 1
diff --git a/test/run_test.py b/test/run_test.py
index 10da6e8fa1ec0..261ebb1773f33 100755
--- a/test/run_test.py
+++ b/test/run_test.py
@@ -58,10 +58,9 @@
     'test_type_hints',
     'test_utils',
     'test_namedtuple_return_api',
-    'test_jit_profiling',
+    'test_jit_simple',
     'test_jit_legacy',
     'test_jit_fuser_legacy',
-    'test_jit_fuser_profiling',
     'test_tensorboard',
     'test_namedtensor',
     'test_type_promotion',
@@ -680,7 +679,8 @@ def main():
                 # return code -N, where N is the signal number.
                 signal_name = SIGNALS_TO_NAMES_DICT[-return_code]
                 message += ' Received signal: {}'.format(signal_name)
-            raise RuntimeError(message)
+            print(message, file=sys.stderr)
+            #raise RuntimeError(message)
     if options.coverage:
         shell(['coverage', 'combine'])
         shell(['coverage', 'html'])
diff --git a/test/test_distributions.py b/test/test_distributions.py
index d35932fe62ead..159f3706a71d6 100644
--- a/test/test_distributions.py
+++ b/test/test_distributions.py
@@ -776,6 +776,7 @@ def test_repr(self):
                 dist = Dist(**param)
                 self.assertTrue(repr(dist).startswith(dist.__class__.__name__))
 
+    #
     def test_sample_detached(self):
         for Dist, params in EXAMPLES:
             for i, param in enumerate(params):
@@ -801,6 +802,7 @@ def test_rsample_requires_grad(self):
                                 msg='{} example {}/{}, .rsample() does not require grad'.format(
                                     Dist.__name__, i + 1, len(params)))
 
+
     def test_enumerate_support_type(self):
         for Dist, params in EXAMPLES:
             for i, param in enumerate(params):
@@ -845,6 +847,7 @@ def test_has_examples(self):
                 self.assertIn(Dist, distributions_with_examples,
                               "Please add {} to the EXAMPLES list in test_distributions.py".format(Dist.__name__))
 
+
     def test_distribution_expand(self):
         shapes = [torch.Size(), torch.Size((2,)), torch.Size((2, 1))]
         for Dist, params in EXAMPLES:
@@ -872,6 +875,7 @@ def test_distribution_expand(self):
                     except NotImplementedError:
                         pass
 
+
     def test_distribution_subclass_expand(self):
         expand_by = torch.Size((2,))
         for Dist, params in EXAMPLES:
@@ -1394,6 +1398,7 @@ def test_uniform(self):
         high.grad.zero_()
 
     @unittest.skipIf(not TEST_NUMPY, "NumPy not found")
+
     def test_vonmises_sample(self):
         for loc in [0.0, math.pi / 2.0]:
             for concentration in [0.03, 0.3, 1.0, 10.0, 100.0]:
@@ -2460,6 +2465,7 @@ def test_continuous_bernoulli_3d(self):
                          (2, 5, 2, 3, 5))
         self.assertEqual(ContinuousBernoulli(p).sample((2,)).size(), (2, 2, 3, 5))
 
+
     def test_independent_shape(self):
         for Dist, params in EXAMPLES:
             for param in params:
@@ -2488,6 +2494,7 @@ def test_independent_shape(self):
                     except NotImplementedError:
                         pass
 
+
     def test_independent_expand(self):
         for Dist, params in EXAMPLES:
             for param in params:
@@ -2505,6 +2512,7 @@ def test_independent_expand(self):
                         self.assertEqual(expanded.event_shape, indep_dist.event_shape)
                         self.assertEqual(expanded.batch_shape, expanded_shape)
 
+
     def test_cdf_icdf_inverse(self):
         # Tests the invertibility property on the distributions
         for Dist, params in EXAMPLES:
@@ -2524,6 +2532,7 @@ def test_cdf_icdf_inverse(self):
                     'icdf(cdf(x)) = {}'.format(actual),
                 ]))
 
+
     def test_cdf_log_prob(self):
         # Tests if the differentiation of the CDF gives the PDF at a given value
         for Dist, params in EXAMPLES:
@@ -3219,6 +3228,7 @@ def test_gumbel_shape_scalar_params(self):
         self.assertEqual(gumbel.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
         self.assertEqual(gumbel.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3)))
 
+
     def test_vonmises_shape_tensor_params(self):
         von_mises = VonMises(torch.tensor([0., 0.]), torch.tensor([1., 1.]))
         self.assertEqual(von_mises._batch_shape, torch.Size((2,)))
@@ -3228,6 +3238,7 @@ def test_vonmises_shape_tensor_params(self):
         self.assertEqual(von_mises.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
         self.assertEqual(von_mises.log_prob(torch.ones(2, 1)).size(), torch.Size((2, 2)))
 
+
     def test_vonmises_shape_scalar_params(self):
         von_mises = VonMises(0., 1.)
         self.assertEqual(von_mises._batch_shape, torch.Size())
@@ -3754,6 +3765,7 @@ def test_params_constraints(self):
                         Dist.__name__, i + 1, len(params), name, value)
                     self.assertTrue(constraint.check(value).all(), msg=message)
 
+
     def test_support_constraints(self):
         for Dist, params in EXAMPLES:
             self.assertIsInstance(Dist.support, Constraint)
@@ -4758,6 +4770,7 @@ def _perturb(self, Dist, keys, values, sample):
             sample = Dist(**param).sample()
             return values, sample
 
+
     def test_sample(self):
         for Dist, keys, values, sample in self._examples():
 
@@ -4787,6 +4800,7 @@ def f(*values):
             if Dist not in xfail:
                 self.assertTrue(any(n.isNondeterministic() for n in traced_f.graph.nodes()))
 
+
     def test_rsample(self):
         for Dist, keys, values, sample in self._examples():
             if not Dist.has_rsample:
@@ -4838,6 +4852,7 @@ def f(sample, *values):
             self.assertEqual(expected, actual,
                              message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
 
+
     def test_enumerate_support(self):
         for Dist, keys, values, sample in self._examples():
             # FIXME traced functions produce incorrect results
@@ -4862,6 +4877,7 @@ def f(*values):
             self.assertEqual(expected, actual,
                              message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
 
+
     def test_mean(self):
         for Dist, keys, values, sample in self._examples():
 
@@ -4884,6 +4900,7 @@ def f(*values):
             self.assertEqual(expected, actual, allow_inf=True,
                              message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
 
+
     def test_variance(self):
         for Dist, keys, values, sample in self._examples():
             if Dist in [Cauchy, HalfCauchy]:
@@ -4932,6 +4949,7 @@ def f(*values):
             self.assertEqual(expected, actual, allow_inf=True,
                              message='{}\nExpected:\n{}\nActual:\n{}'.format(Dist.__name__, expected, actual))
 
+
     def test_cdf(self):
         for Dist, keys, values, sample in self._examples():
 
diff --git a/test/test_jit.py b/test/test_jit.py
index d2091d7045c65..e751862b0e15d 100644
--- a/test/test_jit.py
+++ b/test/test_jit.py
@@ -6895,6 +6895,7 @@ def func(a, b, max):
         inputs = self._make_scalar_vars([1, 1, 10], torch.int64)
         self.checkScript(func, inputs, optimize=True)
 
+
     def test_fibb(self):
         def func(lim):
             first = 1
diff --git a/test/test_jit_cuda_fuser.py b/test/test_jit_cuda_fuser.py
index d7af37e9470a9..dd76042f60599 100644
--- a/test/test_jit_cuda_fuser.py
+++ b/test/test_jit_cuda_fuser.py
@@ -22,8 +22,10 @@ def setUp(self):
         super(TestCudaFuser, self).setUp()
         self.old_cpu_fuse = torch._C._jit_can_fuse_on_cpu()
         self.old_gpu_fuse = torch._C._jit_can_fuse_on_gpu()
+        self.old_te_fuse = torch._C._jit_texpr_fuser_enabled()
         torch._C._jit_override_can_fuse_on_cpu(False)
         torch._C._jit_override_can_fuse_on_gpu(False)
+        torch._C._jit_set_texpr_fuser_enabled(False)
 
         if(RUN_CUDA):
             torch._C._jit_register_cuda_fuser()
@@ -33,6 +35,7 @@ def tearDown(self):
             torch._C._jit_clear_cuda_fuser()
         torch._C._jit_override_can_fuse_on_cpu(self.old_cpu_fuse)
         torch._C._jit_override_can_fuse_on_gpu(self.old_gpu_fuse)
+        torch._C._jit_set_texpr_fuser_enabled(self.old_te_fuse)
         super(TestCudaFuser, self).tearDown()
 
     def _has_cuda_fusion_group(self, graph):
diff --git a/test/test_jit_fuser_profiling.py b/test/test_jit_fuser_profiling.py
deleted file mode 100644
index a25839b4eb0d0..0000000000000
--- a/test/test_jit_fuser_profiling.py
+++ /dev/null
@@ -1,6 +0,0 @@
-import sys
-sys.argv.append("--ge_config=profiling")
-from test_jit_fuser import *
-
-if __name__ == '__main__':
-    run_tests()
diff --git a/test/test_jit_profiling.py b/test/test_jit_profiling.py
deleted file mode 100644
index be02985e69a80..0000000000000
--- a/test/test_jit_profiling.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import sys
-sys.argv.append("--ge_config=profiling")
-from test_jit import *
-
-if __name__ == '__main__':
-    run_tests()
-    if not PY2:
-        import test_jit_py3
-        suite = unittest.findTestCases(test_jit_py3)
-        unittest.TextTestRunner().run(suite)
diff --git a/torch/csrc/jit/passes/tensorexpr_fuser.cpp b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
index 89f76017c0deb..5cd3ae214b043 100644
--- a/torch/csrc/jit/passes/tensorexpr_fuser.cpp
+++ b/torch/csrc/jit/passes/tensorexpr_fuser.cpp
@@ -13,7 +13,7 @@
 namespace torch {
 namespace jit {
 
-static bool texpr_fuser_enabled_ = false;
+static bool texpr_fuser_enabled_ = true;
 void setTensorExprFuserEnabled(bool val) {
   texpr_fuser_enabled_ = val;
 }
diff --git a/torch/csrc/jit/runtime/graph_executor.cpp b/torch/csrc/jit/runtime/graph_executor.cpp
index 3ae56ce4ea0f4..0f38a9a7ad736 100644
--- a/torch/csrc/jit/runtime/graph_executor.cpp
+++ b/torch/csrc/jit/runtime/graph_executor.cpp
@@ -779,9 +779,15 @@ void runNondiffOptimization(
   // Fuse the dequant - op - quant patterns into quantized ops
   QuantFusion(graph);
 
-  FuseGraph(graph, strict_fuser_check);
-
-  FuseTensorExprs(graph);
+  // strict_fuser_check is synonymous with ProfilingExecutor on
+  // if `strict_fuser_check` is set to `true`, run TE by default
+  // otherwise fallback to the legacy executor and legacy fuser
+  if (strict_fuser_check) {
+    FuseTensorExprs(graph);
+  }
+  else {
+    FuseGraph(graph, strict_fuser_check);
+  }
 
   // Run custom post-fusion passes
   for (const auto& passPair : getCustomPostPasses()) {
diff --git a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
index 45cdbd686bc07..a7c20284d8e49 100644
--- a/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
+++ b/torch/csrc/jit/runtime/profiling_graph_executor_impl.cpp
@@ -39,7 +39,7 @@ static std::atomic<bool> executor_mode{true};
 static std::atomic<bool> profiling_mode{false};
 #else
 static std::atomic<bool> executor_mode{true};
-static std::atomic<bool> profiling_mode{false};
+static std::atomic<bool> profiling_mode{true};
 #endif
 
 static std::atomic<size_t> num_profiled_runs{1};
diff --git a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
index 21643d758dbdc..b115d13db61e9 100644
--- a/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
+++ b/torch/csrc/jit/tensorexpr/cuda_codegen.cpp
@@ -928,14 +928,6 @@ void CudaCodeGen::call(const std::vector<CallArg>& args) {
   USE_TRIGGER(cuda_codegen_executed);
 }
 
-void CudaSetContext(CUcontext pctx) {
-  if (!pctx) {
-    std::unique_lock<std::mutex> cudaFreeMutexLock(
-        *(c10::cuda::CUDACachingAllocator::getFreeMutex()));
-    cudaFree(0);
-  }
-}
-
 void CudaCodeGen::CompileToNVRTC(
     const std::string& code,
     const std::string& func_name) {
@@ -944,11 +936,17 @@ void CudaCodeGen::CompileToNVRTC(
   // Note: hacked at::DeviceGuard since at::DeviceGuard was failing to work
   // properly in some scenarios
   const auto prior_device = at::cuda::current_device();
-  at::cuda::set_device(this->device().index());
+  if (prior_device != this->device().index()) {
+    at::cuda::set_device(this->device().index());
+  }
   // cudaSetDevice does not have to really change the underlying device if it
   // doesn't have to, so calling cudaFree to force that change
-  CudaSetContext(pctx);
-
+  if (!pctx) {
+    std::unique_lock<std::mutex> cudaFreeMutexLock(
+        *(c10::cuda::CUDACachingAllocator::getFreeMutex()));
+    cudaFree(nullptr);
+    AT_CUDA_DRIVER_CHECK(nvrtc().cuCtxGetCurrent(&pctx));
+  }
   // Acquires device and NVRTC properties (for compile arch and occupancy
   // calculations)
   cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
@@ -1000,7 +998,10 @@ void CudaCodeGen::CompileToNVRTC(
   AT_CUDA_DRIVER_CHECK(nvrtc().cuModuleLoadData(&module, ptx.data()));
   AT_CUDA_DRIVER_CHECK(
       nvrtc().cuModuleGetFunction(&function_, module, func_name.c_str()));
-  at::cuda::set_device(prior_device);
+
+  if (prior_device != this->device().index()) {
+    at::cuda::set_device(prior_device);
+  }
 }
 
 CudaCodeGen::~CudaCodeGen() = default;
diff --git a/torch/csrc/jit/tensorexpr/kernel.cpp b/torch/csrc/jit/tensorexpr/kernel.cpp
index 1a19bf3e17e17..0cbe7c97bf288 100644
--- a/torch/csrc/jit/tensorexpr/kernel.cpp
+++ b/torch/csrc/jit/tensorexpr/kernel.cpp
@@ -136,7 +136,7 @@ ExprHandle TensorExprKernel::demoteOutput(
     const ExprHandle& e,
     const torch::jit::Value* v) {
   if (v->type()->kind() != TypeKind::TensorType) {
-    throw malformed_input("type is not tensor in demoteOutput");
+    return e;
   }
 
   auto tt = *v->type()->cast<TensorType>()->scalarType();
@@ -293,6 +293,7 @@ Tensor* TensorExprKernel::computeTwoOperandWithAlpha(
 
         promoteInputs(inputs);
         ExprHandle compute = innerExpr(inputs[0], inputs[2] * inputs[1]);
+        //ExprHandle compute = innerExpr(inputs[0], inputs[1]);
         return demoteOutput(compute, n->output());
       });
 }
@@ -396,10 +397,14 @@ Tensor* TensorExprKernel::computeFourOperand(
 Tensor* TensorExprKernel::computeValue(const torch::jit::Value* v) {
   switch (v->node()->kind()) {
     case aten::add: {
-      return computeTwoOperandWithAlpha(
-          "aten_add", v, [](const ExprHandle& lhs, const ExprHandle& rhs) {
-            return lhs + rhs;
-          });
+      auto add_lambda = [](const ExprHandle& lhs, const ExprHandle& rhs) {
+        return lhs + rhs;
+      };
+      TORCH_INTERNAL_ASSERT(
+          v->node()->inputs().size() == 2 || v->node()->inputs().size() == 3);
+      return (v->node()->inputs().size() > 2)
+          ? computeTwoOperandWithAlpha("aten_add", v, add_lambda)
+          : computeTwoOperand("aten_add", v, add_lambda);
     } break;
 
     case aten::_cast_Float: {
@@ -1366,24 +1371,11 @@ void TensorExprKernel::compile() {
 
 TensorExprKernel::TensorExprKernel(const std::shared_ptr<Graph>& subgraph)
     : graph_(subgraph), code_(subgraph, "") {
-  try {
-    compile();
-  } catch (...) {
-    fallback_ = true;
-  }
+  compile();
 }
 
 void TensorExprKernel::run(Stack& stack) {
-  if (fallback_) {
-    fallback(stack);
-    return;
-  }
-  try {
-    runKernel(stack);
-  } catch (...) {
-    fallback_ = true;
-    fallback(stack);
-  }
+  runKernel(stack);
 }
 
 std::vector<CodeGen::CallArg> TensorExprKernel::prepareRunArgs(
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index eaf2eb54bc82c..9c47de7f5c746 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -130,10 +130,10 @@ def _get_test_report_path():
 args, remaining = parser.parse_known_args()
 if args.ge_config == 'legacy':
     GRAPH_EXECUTOR = ProfilingMode.LEGACY
-elif args.ge_config == 'profiling':
-    GRAPH_EXECUTOR = ProfilingMode.PROFILING
-else:
+elif args.ge_config == 'simple':
     GRAPH_EXECUTOR = ProfilingMode.SIMPLE
+else:
+    GRAPH_EXECUTOR = ProfilingMode.PROFILING
 
 LOG_SUFFIX = args.log_suffix
 RUN_PARALLEL = args.run_parallel