diff --git a/test/onnx/expect/TestOperators.test_view_flatten.expect b/test/onnx/expect/TestOperators.test_view_flatten.expect
index 07667797e2cf..5ae9c0576c7a 100644
--- a/test/onnx/expect/TestOperators.test_view_flatten.expect
+++ b/test/onnx/expect/TestOperators.test_view_flatten.expect
@@ -65,60 +65,40 @@ graph {
     }
   }
   node {
-    input: "6"
     output: "7"
-    name: "Cast_6"
-    op_type: "Cast"
-    attribute {
-      name: "to"
-      i: 11
-      type: INT
-    }
-  }
-  node {
-    output: "8"
-    name: "Constant_7"
+    name: "Constant_6"
     op_type: "Constant"
     attribute {
       name: "value"
       t {
-        data_type: 11
-        raw_data: "\000\000\000\000\000\000\360?"
+        data_type: 7
+        raw_data: "\030\000\000\000\000\000\000\000"
       }
       type: TENSOR
     }
   }
   node {
-    input: "8"
     input: "7"
-    output: "9"
-    name: "Div_8"
+    input: "6"
+    output: "8"
+    name: "Div_7"
     op_type: "Div"
   }
   node {
-    output: "10"
-    name: "Constant_9"
-    op_type: "Constant"
+    input: "8"
+    output: "9"
+    name: "Cast_8"
+    op_type: "Cast"
     attribute {
-      name: "value"
-      t {
-        data_type: 11
-        raw_data: "\000\000\000\000\000\0008@"
-      }
-      type: TENSOR
+      name: "to"
+      i: 7
+      type: INT
     }
   }
   node {
     input: "9"
-    input: "10"
-    output: "11"
-    name: "Mul_10"
-    op_type: "Mul"
-  }
-  node {
-    input: "11"
-    output: "12"
-    name: "Cast_11"
+    output: "10"
+    name: "Cast_9"
     op_type: "Cast"
     attribute {
       name: "to"
@@ -128,8 +108,8 @@ graph {
   }
   node {
     input: "3"
-    output: "13"
-    name: "Unsqueeze_12"
+    output: "11"
+    name: "Unsqueeze_10"
     op_type: "Unsqueeze"
     attribute {
       name: "axes"
@@ -138,9 +118,9 @@ graph {
     }
   }
   node {
-    input: "12"
-    output: "14"
-    name: "Unsqueeze_13"
+    input: "10"
+    output: "12"
+    name: "Unsqueeze_11"
     op_type: "Unsqueeze"
     attribute {
       name: "axes"
@@ -149,10 +129,10 @@ graph {
     }
   }
   node {
-    input: "13"
-    input: "14"
-    output: "15"
-    name: "Concat_14"
+    input: "11"
+    input: "12"
+    output: "13"
+    name: "Concat_12"
     op_type: "Concat"
     attribute {
       name: "axis"
@@ -162,9 +142,9 @@ graph {
     }
   }
   node {
     input: "0"
-    input: "15"
-    output: "16"
-    name: "Reshape_15"
+    input: "13"
+    output: "14"
+    name: "Reshape_13"
     op_type: "Reshape"
   }
   name: "torch-jit-export"
@@ -191,7 +171,7 @@ graph {
     }
   }
   output {
-    name: "16"
+    name: "14"
     type {
       tensor_type {
         elem_type: 1
diff --git a/test/test_torch.py b/test/test_torch.py
index 09bef13243cc..0633ae2ee7f5 100644
--- a/test/test_torch.py
+++ b/test/test_torch.py
@@ -14264,28 +14264,268 @@ def test_binary_op_scalar_device_unspecified(self, devices):
             self.assertEqual(y1.device, device_obj)
             self.assertEqual(y0, y1)
 
-    # Tests that CPU scalars (including zero dim tensors) can be used in
-    # binary operations with CUDA tensors.
-    @onlyCUDA
-    def test_cuda_cpu_scalar_binary_ops(self, device):
-        val_scalar = math.pi
-        val_tensor = torch.tensor(val_scalar)
-        for op in (operator.add, torch.add,
-                   operator.sub, torch.sub,
-                   operator.mul, torch.mul,
-                   operator.truediv, torch.true_divide,
-                   operator.floordiv, torch.floor_divide):
-            for tensor_val in (1, (1,)):
-                t_cuda = torch.tensor(tensor_val, device=device)
-                t_cpu = t_cuda.cpu()
-                for val in (val_scalar, val_tensor):
-                    cpu_result = op(t_cpu, val)
-                    cuda_result = op(t_cuda, val)
-                    self.assertEqual(cpu_result, cuda_result)
-
-                    reverse_cpu_result = op(val, t_cpu)
-                    reverse_cuda_result = op(val, t_cuda)
-                    self.assertEqual(reverse_cpu_result, reverse_cuda_result)
+    def test_div_and_floordiv_vs_python(self, device):
+        # Tests torch division ops which can handle both arguments being
+        #   scalars.
+        # NOTE: torch.floor_divide currently truncates instead of flooring
+        #   the quotient. See https://github.com/pytorch/pytorch/issues/43874.
+        def _scalar_helper(python_op, torch_op):
+            for a, b in product(range(-10, 10), range(-10, 10)):
+                for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                    a = op(a)
+                    b = op(b)
+
+                    # Skips zero divisors
+                    if b == 0:
+                        continue
+
+                    expected_div = python_op(a, b)
+
+                    for op in (operator.truediv, torch.true_divide):
+                        actual_scalar = torch_op(a, b)
+
+                        a_t = torch.tensor(a, device=device)
+                        b_t = torch.tensor(b, device=device)
+
+                        actual_tensor = torch_op(a_t, b_t)
+                        actual_first_tensor = torch_op(a_t, b)
+                        actual_second_tensor = torch_op(a, b_t)
+
+                        self.assertEqual(actual_scalar, expected_div)
+                        self.assertEqual(actual_tensor.item(), expected_div)
+                        self.assertEqual(actual_first_tensor, actual_tensor)
+                        self.assertEqual(actual_second_tensor, actual_tensor)
+
+        _scalar_helper(operator.truediv, operator.truediv)
+        _scalar_helper(operator.truediv, torch.true_divide)
+        _scalar_helper(lambda a, b: math.trunc(a / b), operator.floordiv)
+        _scalar_helper(lambda a, b: math.trunc(a / b), torch.floor_divide)
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring.
+    # See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_div_and_floordiv_script_vs_python(self, device):
+        # Creates jitted functions of two tensors
+        def _wrapped_div(a, b):
+            return a / b
+
+        def _wrapped_floordiv(a, b):
+            return a // b
+
+        scripted_div = torch.jit.script(_wrapped_div)
+        scripted_floordiv = torch.jit.script(_wrapped_floordiv)
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_div = a / b
+                expected_truncdiv = math.trunc(a / b)
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                self.assertEqual(scripted_div(a_t, b_t), expected_div)
+                self.assertEqual(scripted_floordiv(a_t, b_t), expected_truncdiv)
+
+        # Creates jitted functions of one tensor
+        def _wrapped_div_scalar(a):
+            return a / 5
+
+        # NOTE: this will fail when given an integer input, since
+        #   the JIT implements division as
+        #   torch.reciprocal(a) * 5, and reciprocal is only
+        #   implemented for float types.
+        def _wrapped_rdiv_scalar(a):
+            return 5 / a
+
+        def _wrapped_floordiv_scalar(a):
+            return a // 5
+
+        # NOTE: this fails if the input is not an integer tensor
+        # See https://github.com/pytorch/pytorch/issues/45199
+        def _wrapped_rfloordiv_scalar(a):
+            return 5 // a
+
+        scripted_div_scalar = torch.jit.script(_wrapped_div_scalar)
+        scripted_rdiv_scalar = torch.jit.script(_wrapped_rdiv_scalar)
+        scripted_floordiv_scalar = torch.jit.script(_wrapped_floordiv_scalar)
+        scripted_rfloordiv_scalar = torch.jit.script(_wrapped_rfloordiv_scalar)
+
+        for a in range(-10, 10):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+
+                a_t = torch.tensor(a, device=device)
+
+                self.assertEqual(a / 5, scripted_div_scalar(a_t))
+                self.assertEqual(math.trunc(a / 5), scripted_floordiv_scalar(a_t))
+
+                # Skips zero divisors
+                if a == 0:
+                    continue
+
+                if a_t.is_floating_point():
+                    self.assertEqual(5 / a, scripted_rdiv_scalar(a_t))
+                else:
+                    with self.assertRaises(RuntimeError):
+                        scripted_rdiv_scalar(a_t)
+
+
+                # Handles Issue 45199 (see comment above)
+                if a_t.is_floating_point():
+                    with self.assertRaises(RuntimeError):
+                        scripted_rfloordiv_scalar(a_t)
+                else:
+                    self.assertEqual(5 // a, scripted_rfloordiv_scalar(a_t))
+
+    # NOTE: torch.floor_divide currently truncates instead of flooring
+    #   the quotient. See https://github.com/pytorch/pytorch/issues/43874.
+    @onlyOnCPUAndCUDA
+    def test_idiv_and_ifloordiv_vs_python(self, device):
+        def _wrapped_idiv_tensor(a, b):
+            a /= b
+            return a
+
+        def _wrapped_idiv_scalar(a):
+            a /= 5
+            return a
+
+        def _wrapped_true_divide__tensor(a, b):
+            a.true_divide_(b)
+            return a
+
+        def _wrapped_true_divide__scalar(a):
+            a.true_divide_(5)
+            return a
+
+        def _wrapped_floor_divide__tensor(a, b):
+            a.floor_divide_(b)
+            return a
+
+        def _wrapped_floor_divide__scalar(a):
+            a.floor_divide_(5)
+            return a
+
+        # The following functions are unsupported by the JIT
+        def _wrapped_ifloordiv_tensor(a, b):
+            a //= b
+            return a
+
+        def _wrapped_ifloordiv_scalar(a):
+            a //= 5
+            return a
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_tensor = torch.jit.script(_wrapped_ifloordiv_tensor)
+
+        with self.assertRaises(torch.jit.frontend.NotSupportedError):
+            scripted_ifloordiv_scalar = torch.jit.script(_wrapped_ifloordiv_scalar)
+
+        scripted_idiv_tensor = torch.jit.script(_wrapped_idiv_tensor)
+        scripted_idiv_scalar = torch.jit.script(_wrapped_idiv_scalar)
+        scripted_true_divide__tensor = torch.jit.script(_wrapped_true_divide__tensor)
+        scripted_true_divide__scalar = torch.jit.script(_wrapped_true_divide__scalar)
+        scripted_floor_divide__tensor = torch.jit.script(_wrapped_floor_divide__tensor)
+        scripted_floor_divide__scalar = torch.jit.script(_wrapped_floor_divide__scalar)
+
+        for a, b in product(range(-10, 10), range(-10, 10)):
+            for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                a = op(a)
+                b = op(b)
+
+                # Skips zero divisors
+                if b == 0:
+                    continue
+
+                expected_idiv = a / b
+                expected_ifloordiv = a // b
+                expected_itruncdiv = math.trunc(a / b)
+
+                a_t = torch.tensor(a, device=device)
+                b_t = torch.tensor(b, device=device)
+
+                if a_t.is_floating_point():
+                    tmp0 = a_t.clone()
+                    tmp0 /= b
+
+                    tmp1 = a_t.clone()
+                    tmp1 /= b_t
+
+                    self.assertEqual(tmp0.item(), expected_idiv)
+                    self.assertEqual(tmp1.item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__tensor(a_t.clone(), b_t).item(), expected_idiv)
+                    self.assertEqual(scripted_true_divide__scalar(a_t.clone()).item(), a / 5)
+                else:
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b
+                    with self.assertRaises(RuntimeError):
+                        tmp /= b_t
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__tensor(tmp, b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_true_divide__scalar(tmp)
+
+
+                if not a_t.is_floating_point() and b_t.is_floating_point():
+                    # Inplace modification fails because a float tensor is required
+                    #   if the divisor is a float tensor
+                    with self.assertRaises(RuntimeError):
+                        a_t.clone().floor_divide_(b_t)
+                    with self.assertRaises(RuntimeError):
+                        scripted_floor_divide__tensor(a_t.clone(), b_t)
+                    tmp = a_t.clone()
+                    with self.assertRaises(RuntimeError):
+                        tmp //= b_t
+                else:
+                    # Inplace modification is OK when both or neither tensor is
+                    #   a float tensor
+                    self.assertEqual(a_t.clone().floor_divide_(b_t).item(), expected_itruncdiv)
+                    self.assertEqual(scripted_floor_divide__tensor(a_t.clone(), b_t).item(), expected_itruncdiv)
+                    tmp = a_t.clone()
+                    tmp //= b_t
+                    self.assertEqual(tmp.item(), expected_itruncdiv)
+
+                self.assertEqual(scripted_floor_divide__scalar(a_t), math.trunc(a / 5))
+
+    # Tests binary op equivalence with Python builtin ops
+    # Also tests that reverse operations are equivalent to forward ops
+    # NOTE: division ops are tested separately above
+    def test_binary_ops_with_scalars(self, device):
+        for ops in ((operator.add, torch.add),
+                    (operator.sub, torch.sub),
+                    (operator.mul, torch.mul),
+                    (operator.truediv, torch.div)):
+            python_op, torch_op = ops
+
+            for a, b in product(range(-10, 10), range(-10, 10)):
+                for op in (lambda x: x * .5, lambda x: math.floor(x)):
+                    a = op(a)
+                    b = op(b)
+
+                    # Skips zero divisors
+                    if b == 0 or a == 0:
+                        continue
+
+                    a_tensor = torch.tensor(a, device=device)
+                    b_tensor = torch.tensor(b, device=device)
+                    a_tensor_cpu = a_tensor.cpu()
+                    b_tensor_cpu = b_tensor.cpu()
+                    vals = (a, b, a_tensor, b_tensor, a_tensor_cpu, b_tensor_cpu)
+
+                    for args in product(vals, vals):
+                        first, second = args
+
+                        first_scalar = first if not isinstance(first, torch.Tensor) else first.item()
+                        second_scalar = second if not isinstance(second, torch.Tensor) else second.item()
+                        expected = python_op(first_scalar, second_scalar)
+
+                        self.assertEqual(expected, python_op(first, second))
+                        self.assertEqual(expected, torch_op(first, second))
 
     @onlyCUDA
     def test_ceil_out_mismatch(self, device):
@@ -16912,11 +17152,8 @@ def test_rdiv(self, device, dtype):
         else:
             x = torch.rand(100, device=device).add(1).mul(4).to(dtype)
         y = 30 / x
-        if dtype.is_floating_point or dtype.is_complex:
-            z = torch.tensor([30 / v.item() for v in x], dtype=dtype, device=device)
-        else:
-            z = torch.tensor([math.trunc(30. / v.item()) for v in x], dtype=dtype, device=device)
-        self.assertEqual(y, z)
+        z = torch.tensor([30 / v.item() for v in x], device=device)
+        self.assertEqual(y, z, exact_dtype=False)
 
     @onlyCPU
     @dtypes(*torch.testing.get_all_dtypes(include_bfloat16=False, include_bool=False, include_complex=False))
@@ -19865,7 +20102,7 @@ def inner(self, device, dtype):
        torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types,
        True, [], 0, True),
     ('addmv', 'scalar', _medium_1d,
-        lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4, 
+        lambda t, d: [_number(0.4, 2, t), _medium_2d(t, d), _medium_1d(t, d)], 1e-2, 1e-1, 1e-4,
        torch.testing.get_all_fp_dtypes(include_bfloat16=AMPERE_OR_ROCM) + _complex_types_skip_rocm, _cpu_types, True,
        [_wrap_maybe_warns("This overload of addmv_? is deprecated")]),
is deprecated")]), ('addmv', 'two_scalars', _medium_1d, @@ -20105,7 +20342,7 @@ def inner(self, device, dtype): ('sigmoid', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('logit', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes()), ('sqrt', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes(), [torch.bfloat16]), - ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, + ('tanh', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, torch.testing.get_all_fp_dtypes() + _complex_types, [torch.bfloat16]), ('asin', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), ('atan', '', _small_3d, lambda t, d: [], 1e-3, 1e-2, 1e-5, _float_types, [torch.bfloat16]), diff --git a/torch/tensor.py b/torch/tensor.py index 3eadb4667e87..9709c146c815 100644 --- a/torch/tensor.py +++ b/torch/tensor.py @@ -518,7 +518,7 @@ def __rdiv__(self, other): if self.dtype.is_floating_point or self.dtype.is_complex: return self.reciprocal() * other else: - return (self.double().reciprocal() * other).type_as(self) + return self.to(torch.get_default_dtype()).reciprocal() * other __rtruediv__ = __rdiv__ __itruediv__ = _C._TensorBase.__idiv__ @@ -552,10 +552,7 @@ def __floordiv__(self, other): @_wrap_type_error_to_not_implemented def __rfloordiv__(self, other): - result = other / self - if result.dtype.is_floating_point: - result = result.trunc() - return result + return torch.floor_divide(other, self) __neg__ = _C._TensorBase.neg