diff --git a/tests/fakelowp/test_batchmatmul_nnpi_fp16.py b/tests/fakelowp/test_batchmatmul_nnpi_fp16.py
deleted file mode 100644
index fd8b7f07d6..0000000000
--- a/tests/fakelowp/test_batchmatmul_nnpi_fp16.py
+++ /dev/null
@@ -1,120 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import numpy as np
-import glow.fb.test.init_shared_libs  # noqa
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core, workspace
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from glow.fb.test.test_utils import print_test_debug_info
-from hypothesis import given
-import hypothesis.strategies as st
-import caffe2.python.hypothesis_test_util as hu
-import caffe2.python.serialized_test.serialized_test_util as serial
-
-core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
-
-GLOW_MATMUL_RTOL = 1e-3
-
-
-class TestBatchMatMul(serial.SerializedTestCase):
-    # @settings(max_examples=30)
-    @given(
-        #C=0, #st.integers(min_value=0, max_value=3),  # number of batch dims
-        M=st.integers(min_value=1, max_value=10),
-        K=st.integers(min_value=1, max_value=10),
-        N=st.integers(min_value=1, max_value=10),
-        trans_a=st.booleans(),
-        trans_b=st.booleans(),
-        run_ints=st.booleans(),
-        **hu.gcs
-    )
-    def test_batch_matmul(self, M, K, N, trans_a, trans_b, run_ints, gc, dc):
-        workspace.ResetWorkspace()
-        C = 0  # TODO
-        batch_dims = np.random.randint(
-            low=1,
-            high=3,
-            size=C,
-            dtype=np.int64).tolist()
-
-        if run_ints:
-            X = np.random.randint(low=1, high=3, size=((1, M, K))).astype(np.float32)
-        else:
-            X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(np.float32) - 0.5)
-        if trans_a:
-            X = X.swapaxes(-1, -2)
-
-        if run_ints:
-            Y = np.random.randint(low=1, high=3, size=((1, K, N))).astype(np.float32)
-        else:
-            Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(np.float32) - 0.5)
-        if trans_b:
-            Y = Y.swapaxes(-1, -2)
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", "Y"])
-        pred_net.external_output.append("out")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
-            )
-        )
-
-        pred_net_ref = core.Net("pred_net_ref")
-        pred_net_ref.BatchMatMulFP16Acc16Fake(
-            ["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)
-
-        print("dims", batch_dims, X.shape, Y.shape)
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                {"X": X.shape, "Y": Y.shape},
-                                                debug=True,
-                                                adjust_batch=False,
-                                                use_onnx=False)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        workspace.FeedBlob("X", X)
-        workspace.FeedBlob("Y", Y)
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(pred_net_ref)
-
-        # Run Glow net
-        workspace.RunNet(pred_net_onnxified.name)
-        out_glow = workspace.FetchBlob('out')
-
-        # Run caffe2 net
-        workspace.RunNet(pred_net_ref)
-        out_c2_fakefp16 = workspace.FetchBlob('out')
-
-        diff = np.abs((out_c2_fakefp16 - out_glow) / (out_c2_fakefp16 + 1e-8))
-        rowdiff = np.max(diff, axis=1)
-
-        success = True
-        if run_ints:
-            if not np.allclose(out_glow, out_c2_fakefp16):
-                success = False
-        else:
-            n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
-            # Find the max difference per row, if more than 10% of the rows
-            # are bigger, consider it a failure.
-            if n_offenders * 10 > rowdiff.shape[0]:
-                success = False
-
-        if not success:
-            print_test_debug_info("bmm",
-                {"m": M, "k": K, "n": N, "X": X, "Y": Y,
-                 "out_glow": out_glow,
-                 "out_c2_fakefp16": out_c2_fakefp16,
-                 "diff": diff})
-            assert(0)
-
-
-if __name__ == "__main__":
-    import unittest
-    unittest.main()
diff --git a/tests/fakelowp/test_batchnorm_nnpi_fp16.py b/tests/fakelowp/test_batchnorm_nnpi_fp16.py
deleted file mode 100644
index 1d45c2c465..0000000000
--- a/tests/fakelowp/test_batchnorm_nnpi_fp16.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import numpy as np
-
-import glow.fb.test.init_shared_libs  # noqa
-import time
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core
-from caffe2.python import workspace
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from caffe2.python.onnx.tests.test_utils import TestCase
-from glow.fb.test.test_utils import print_test_debug_info
-
-core.GlobalInit(["caffe2", "--glow_global_fp16=1",
-                      "--glow_global_fused_scale_offset_fp16=1",
-                      "--glow_global_force_sls_fp16_accum=1"])
-
-GLOW_LOWERED_BATCHNORM = False
-
-
-def reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order):
-    X = X.astype(np.float16)
-    scale = scale.astype(np.float16)
-    bias = bias.astype(np.float16)
-    mean = mean.astype(np.float16)
-    #var = var.astype(np.float16)
-    assert(order == "NCHW")
-
-    scale = scale[np.newaxis, :, np.newaxis, np.newaxis]
-    bias = bias[np.newaxis, :, np.newaxis, np.newaxis]
-    mean = mean[np.newaxis, :, np.newaxis, np.newaxis]
-    var = var[np.newaxis, :, np.newaxis, np.newaxis]
-    Y = ((X - mean) * (scale / np.sqrt(var + epsilon).astype(np.float16))) + bias
-    return Y.astype(np.float32)
-
-
-# Test the lowered BN op
-class BatchnormTest(TestCase):
-    # TODO: replace with hypothesis
-    def test_bn(self):
-        size = 30
-        input_channels = 20
-        batch_size = 40
-        seed = int(time.time())
-        np.random.seed(seed)
-
-        order = "NCHW"
-        epsilon = 1e-3
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", "scale", "bias", "mean", "var"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SpatialBN",
-                ["X", "scale", "bias", "mean", "var"],
-                ["Y"],
-                order=order,
-                is_test=True,
-                epsilon=epsilon
-            )
-        )
-
-        if GLOW_LOWERED_BATCHNORM:
-            refopname = "SpatialBNFakeLoweredFp16NNPI"
-        else:
-            refopname = "SpatialBNFakeFp16NNPI"
-
-        pred_net_ref = caffe2_pb2.NetDef()
-        pred_net_ref.name = "pred"
-        pred_net_ref.external_input.extend(["X", "scale", "bias", "mean", "var"])
-        pred_net_ref.external_output.append("X")
-        pred_net_ref.op.add().CopyFrom(
-            core.CreateOperator(
-                refopname,
-                ["X", "scale", "bias", "mean", "var"],
-                ["Y"],
-                order=order,
-                is_test=True,
-                epsilon=epsilon
-            )
-        )
-
-        scale = np.random.rand(input_channels).astype(np.float32) + 0.5
-        bias = np.random.rand(input_channels).astype(np.float32) - 0.5
-        mean = np.random.randn(input_channels).astype(np.float32)
-        var = np.random.rand(input_channels).astype(np.float32) + 0.5
-        X = np.random.rand(
-            batch_size, input_channels, size, size).astype(np.float32) - 0.5
-
-        workspace.FeedBlob("scale", scale)
-        workspace.FeedBlob("bias", bias)
-        workspace.FeedBlob("mean", mean)
-        workspace.FeedBlob("var", var)
-
-        # Use for reference to debug
-        # Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)
-
-        pred_net_onnxified = onnxifi_caffe2_net(
-            pred_net,
-            {"X": [batch_size, input_channels, size, size],
-             "scale": [input_channels],
-             "bias": [input_channels],
-             "mean": [input_channels],
-             "var": [input_channels]},
-            debug=True,
-            adjust_batch=False,
-            use_onnx=False
-        )
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        workspace.FeedBlob("X", X)
-
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(pred_net_ref)
-
-        workspace.RunNet(pred_net_ref.name)
-        Y_c2 = workspace.FetchBlob("Y")
-
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
-            diff = np.abs(Y_glow - Y_c2).astype(np.float16)
-            print_test_debug_info(
-                "bn",
-                {"seed": seed,
-                "scale": scale,
-                "bias": bias,
-                "mean": mean,
-                "var": var,
-                "Y_np": Y_c2.shape,
-                "Y_glow": Y_glow.shape,
-                "diff": diff,
-                "rowwise_diff": np.max(np.abs(diff), -1)})
-            assert(0)
diff --git a/tests/fakelowp/test_fc_nnpi_fp16.py b/tests/fakelowp/test_fc_nnpi_fp16.py
deleted file mode 100644
index 456834c433..0000000000
--- a/tests/fakelowp/test_fc_nnpi_fp16.py
+++ /dev/null
@@ -1,324 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import numpy as np
-
-import glow.fb.test.init_shared_libs  # noqa
-import time
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core
-from caffe2.python import workspace
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from caffe2.python.onnx.tests.test_utils import TestCase
-from glow.fb.test.test_utils import print_test_debug_info
-
-core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
-
-GLOW_MATMUL_RTOL = 0
-
-
-class FCTest(TestCase):
-    def test_clip(self):
-        m, n, k = 8, 8, 8
-        dtype = np.float32
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", "W0", "b0", "W1", "b1"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "FC",
-                ["X", "W0", "b0"],
-                ["X1"],
-            )
-        )
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "FC",
-                ["X1", "W1", "b1"],
-                ["Y"],
-            )
-        )
-        workspace.GlobalInit(
-            ['caffe2', '--caffe2_log_level=0', '--glow_global_fp16=1',
-             '--glow_clip_fp16'])
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.ResetWorkspace()
-        W0 = np.full((n, k), 65536.0, dtype)
-        b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
-        W1 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
-        b1 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
-        workspace.FeedBlob("W0", W0)
-        workspace.FeedBlob("b0", b0)
-        workspace.FeedBlob("W1", W1)
-        workspace.FeedBlob("b1", b1)
-
-        pred_net_onnxified = onnxifi_caffe2_net(
-            pred_net,
-            {"X": (m, k)},
-            debug=True,
-            adjust_batch=False,
-            use_onnx=False
-        )
-
-        X = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
-        workspace.FeedBlob("X", X)
-        workspace.CreateNet(pred_net_onnxified)
-
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob("Y")
-        np.testing.assert_allclose(Y_glow, np.full((m, n), 65504.0, dtype))
-
-    def test_fc_exercise(self):
-        """ Test that the matmul engine is working, this doesn't test
-            precision
-        """
-        m = np.random.randint(low=4, high=50)
-        k = np.random.randint(low=4, high=50)
-        n = np.random.randint(low=4, high=50)
-        dtype = np.float32
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", "W0", "b0"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "FC",
-                ["X", "W0", "b0"],
-                ["Y"],
-            )
-        )
-
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.ResetWorkspace()
-        W0 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
-        b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
-        workspace.FeedBlob("W0", W0)
-        workspace.FeedBlob("b0", b0)
-
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                {"X": (m, k)},
-                                                debug=True,
-                                                adjust_batch=False,
-                                                use_onnx=False)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
-        workspace.FeedBlob("X", X0)
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(pred_net)
-
-        num_iterations = 2
-        for _ in range(num_iterations):
-            X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
-            workspace.FeedBlob("X", X0)
-            # Run Glow net
-            workspace.RunNet(pred_net_onnxified.name)
-            Y_glow = workspace.FetchBlob('Y')
-            # Run caffe2 net
-            workspace.RunNet(pred_net.name)
-            Y_c2 = workspace.FetchBlob('Y')
-            if not np.allclose(Y_c2, Y_glow):
-                print_test_debug_info("fc",
-                    {"m": m, "k": k, "n": n, "X": X0, "W0": W0, "b0": b0,
-                     "Y_glow": Y_glow, "Y_c2": Y_c2,
-                     "diff": np.abs((Y_c2 - Y_glow) / Y_c2)})
-                assert(0)
-
-    def test_fc_numeric_cases(self):
-        """ Test numerics, use examples found from the unit test.
-            Use Fp16FCAcc16NNPI as a reference.
-        """
-        m = 1
-        k = 20
-        n = 1
-        dtype = np.float32
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", "W0", "b0"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "FC",
-                ["X", "W0", "b0"],
-                ["Y"],
-            )
-        )
-        pred_net_ref = caffe2_pb2.NetDef()
-        pred_net_ref.name = "pred"
-        pred_net_ref.external_input.extend(["X", "W0", "b0"])
-        pred_net_ref.external_output.append("Y")
-        pred_net_ref.op.add().CopyFrom(
-            core.CreateOperator(
-                "Fp16FCAcc16NNPI",
-                ["X", "W0", "b0"],
-                ["Y"],
-            )
-        )
-
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.ResetWorkspace()
-
-        W0 = np.array([[0.04882812, 0.21520996, 0.1027832, 0.04489136,
-                        -0.07635498, 0.14587402,
-                        -0.06240845, 0.3918457, 0.46362305, -0.11657715,
-                        0.29174805, 0.02890015,
-                        0.0680542, 0.4255371, -0.42895508, -0.4128418,
-                        -0.47973633, 0.33251953,
-                        0.27807617, 0.3701172]], dtype=np.float32)
-        b0 = [0.47851562]
-        b0 = np.array(b0, dtype=np.float32)
-
-        workspace.FeedBlob("W0", W0)
-        workspace.FeedBlob("b0", b0)
-
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                {"X": (m, k)},
-                                                debug=True,
-                                                adjust_batch=False,
-                                                use_onnx=False)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        X0 = np.random.rand(m, k).astype(dtype) - 0.5
-        workspace.FeedBlob("X", X0)
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(pred_net_ref)
-
-        X_inputs = [
-            np.array([[-2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
-                        2.81250000e-01,
-                       -1.30126953e-01, 2.32696533e-02, -4.55566406e-01,
-                        -2.31811523e-01,
-                       -1.95190430e-01, -7.76977539e-02, -1.29394531e-01,
-                        2.94677734e-01,
-                       8.96453857e-04, 4.97314453e-01, -6.07604980e-02,
-                        2.55371094e-01,
-                       3.49853516e-01, -1.37695312e-01, 2.95410156e-01,
-                        -3.67187500e-01]], dtype=np.float32),
-            np.array([[-0.4494629, -0.22192383, -0.1640625, 0.11480713,
-                        -0.09851074, -0.02084351,
-                       0.19091797, -0.17468262, -0.47485352, 0.07489014,
-                        0.03897095, 0.00197601,
-                       0.02835083, -0.27294922, 0.26757812, -0.20996094,
-                       -0.31103516, -0.41601562,
-                       0.09918213, -0.07696533]], dtype=np.float32),
-            np.array([[0.01150513, -0.20507812, 0.46704102, 0.00906372,
-                        0.19848633, 0.3720703,
-                       0.46557617, -0.47436523, -0.35107422, -0.0362854,
-                        -0.20812988, 0.41918945,
-                       0.09716797, 0.19897461, 0.3876953, -0.0165863,
-                        0.23535156, 0.29956055,
-                       0.24389648, -0.23486328]], dtype=np.float32)
-        ]
-
-        for i in range(len(X_inputs)):
-            workspace.FeedBlob("X", X_inputs[i])
-            # Run Glow net
-            workspace.RunNet(pred_net_onnxified.name)
-            Y_glow = workspace.FetchBlob('Y')
-            workspace.RunNet(pred_net_ref.name)
-            Y_c2 = workspace.FetchBlob('Y')
-
-            diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
-            rowdiff = np.max(diff, axis=1)
-
-            n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
-            if n_offenders > 0:
-                print_test_debug_info("fc",
-                    {"iter": i, "m": m, "k": k, "n": n, "X": X0, "W0": W0, "b0": b0,
-                     "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff,
-                     "rowdiff": rowdiff})
-                assert(0)
-
-    def test_fc_num0(self):
-        """ Test numerics, fix a dimension and determine the ranges of error.
-            Use Fp16FCAcc16 as a reference.
-        """
-        np.random.seed(int(time.time()))
-        m = np.random.randint(low=4, high=50)
-        k = np.random.randint(low=4, high=1000)
-        n = np.random.randint(low=4, high=50)
-        use_packed = np.random.randint(2)
-        W = "W_packed" if use_packed else "W0"
-        dtype = np.float32
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X", W, "b0"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "FbFCPacked" if use_packed else "FC",
-                ["X", W, "b0"],
-                ["Y"],
-            )
-        )
-        pred_net_ref = caffe2_pb2.NetDef()
-        pred_net_ref.name = "pred"
-        pred_net_ref.external_input.extend(["X", W, "b0"])
-        pred_net_ref.external_output.append("Y")
-        pred_net_ref.op.add().CopyFrom(
-            core.CreateOperator(
-                "Fp16FCAcc16NNPI",
-                ["X", W, "b0"],
-                ["Y"],
-            )
-        )
-
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.ResetWorkspace()
-        W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32)
-        b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32)
-
-        workspace.FeedBlob("W0", W0)
-        workspace.FeedBlob("b0", b0)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FbGemmPack",
-                ['W0'],
-                ['W_packed'],
-                no_packing=True,
-            )
-        )
-
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                {"X": (m, k)},
-                                                debug=True,
-                                                adjust_batch=False,
-                                                use_onnx=False)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        X0 = np.random.rand(m, k).astype(dtype) - 0.5
-        workspace.FeedBlob("X", X0)
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(pred_net_ref)
-
-        num_iterations = 10
-        for _ in range(num_iterations):
-            X0 = 100 * (np.random.rand(m, k) - 0.5).\
-                astype(np.float16).astype(np.float32)
-            workspace.FeedBlob("X", X0)
-            # Run Glow net
-            workspace.RunNet(pred_net_onnxified.name)
-            Y_glow = workspace.FetchBlob('Y')
-            # Run caffe2 net
-            workspace.RunNet(pred_net_ref.name)
-            Y_c2 = workspace.FetchBlob('Y')
-
-            diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
-            rowdiff = np.max(diff, axis=1)
-
-            n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
-            if n_offenders > 0:
-                print_test_debug_info("fc",
-                    {"iter": _, "m": m, "k": k, "n": n, "X": X0, "W0": W0,
-                     "b0": b0, "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff,
-                     "rowdiff": rowdiff})
-                assert(0)
diff --git a/tests/fakelowp/test_op_nnpi_fp16.py b/tests/fakelowp/test_op_nnpi_fp16.py
deleted file mode 100644
index c5c4509e8f..0000000000
--- a/tests/fakelowp/test_op_nnpi_fp16.py
+++ /dev/null
@@ -1,235 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import ctypes
-import numpy as np
-import os
-
-import glow.fb.test.init_shared_libs  # noqa
-
-import caffe2.python.hypothesis_test_util as hu
-from hypothesis import given
-
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import dyndep
-from caffe2.python import core
-from caffe2.python import workspace
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from caffe2.python.onnx.tests.test_utils import TestCase
-from glow.fb.test.test_utils import print_test_debug_info
-
-core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
-
-kEpsilon = 1e-8
-
-
-class ArithmeticOpsTest(TestCase):
-    def _test_binary_op_graph(self, name):
-        # First dimension is the batch size
-        dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))
-        A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
-        B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
-        print(A.shape, B.shape)
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["A", "B"])
-        pred_net.external_output.append("C")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                name,
-                ["A", "B"],
-                ["C"]
-            )
-        )
-        pred_net_ref = caffe2_pb2.NetDef()
-        pred_net_ref.name = "ref"
-        pred_net_ref.external_input.extend(["A", "B"])
-        pred_net_ref.external_output.append("C_ref")
-        pred_net_ref.op.add().CopyFrom(
-            core.CreateOperator(
-                name + "FakeFp16",
-                ["A", "B"],
-                ["C_ref"],
-            )
-        )
-
-        shape_hints = {"A": A.shape, "B": B.shape}
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                shape_hints,
-                                                debug=True,
-                                                adjust_batch=True,
-                                                use_onnx=False)
-        print(pred_net_onnxified)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.FeedBlob("A", A)
-        workspace.FeedBlob("B", B)
-
-        workspace.CreateNet(pred_net_ref)
-        workspace.CreateNet(pred_net_onnxified)
-        num_iterations = 10
-        for _ in range(num_iterations):
-            A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
-            B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
-            workspace.FeedBlob("A", A)
-            workspace.FeedBlob("B", B)
-            # Run caffe2 net
-            workspace.RunNet(pred_net_ref.name)
-            Y_c2 = workspace.FetchBlob("C_ref")
-
-            # Run Glow net
-            workspace.RunNet(pred_net_onnxified.name)
-            Y_glow = workspace.FetchBlob("C")
-
-            # Results should be identical since we are comparing with the C2 emulation
-            if not np.allclose(Y_c2, Y_glow):
-                diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
-                print_test_debug_info(name, {
-                    "dims": dims, "A": A, "B": B,
-                    "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
-                assert(0)
-
-    def test_add_graph(self):
-        self._test_binary_op_graph("Add")
-
-    def test_sub_graph(self):
-        self._test_binary_op_graph("Sub")
-
-    def test_mul_graph(self):
-        self._test_binary_op_graph("Mul")
-
-    def test_div_graph(self):
-        self._test_binary_op_graph("Div")
-
-
-class UnaryOpTest(TestCase):
-    def _test_unary_op(self, opname):
-        workspace.ResetWorkspace()
-        n = 1
-        m = 10000
-        X = np.linspace(-20, 20, num=m, dtype=np.float32)
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.append("X")
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                opname,
-                ['X'],
-                ['Y'])
-        )
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.append("X")
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                opname + 'FakeFp16NNPI',
-                ['X'],
-                ['Y'])
-        )
-
-        shape_hints = {"X": (n, m)}
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                shape_hints,
-                                                debug=True,
-                                                adjust_batch=False,
-                                                use_onnx=False)
-        print(pred_net_onnxified)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.FeedBlob("X", X)
-        workspace.CreateNet(ref_net)
-        workspace.CreateNet(pred_net_onnxified)
-        # Run Glow net
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob('Y')
-        # Run caffe2 reference net
-        workspace.RunNet(ref_net.name)
-        Y_c2 = workspace.FetchBlob('Y')
-
-        if not np.allclose(Y_c2, Y_glow):
-            diff = np.abs(Y_c2 - Y_glow)
-            np.save('/tmp/' + opname + 'diff', diff)
-            print_test_debug_info(opname,
-                {"X": X,
-                "Y_c2": Y_c2,
-                "Y_glow": Y_glow,
-                "diff": diff,
-                "maxdiff": np.max(diff)})
-            assert(0)
-
-    def test_sigmoid(self):
-        self._test_unary_op("Sigmoid")
-
-    def test_tanh(self):
-        self._test_unary_op("Tanh")
-
-
-class ReluTest(hu.HypothesisTestCase):
-    @given(inputs=hu.tensors(n=1, min_dim=1, max_dim=3, dtype=np.float32))
-    def relu_test(self, inputs, gc, dc):
-        X = inputs[0]
-        # First dimension is the batch size
-        print(X.shape)
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["X"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "Relu",
-                ["X"],
-                ["Y"]
-            )
-        )
-        pred_net_ref = caffe2_pb2.NetDef()
-        pred_net_ref.name = "ref"
-        pred_net_ref.external_input.extend(["X"])
-        pred_net_ref.external_output.append("Y_ref")
-        pred_net_ref.op.add().CopyFrom(
-            core.CreateOperator(
-                "ReluFakeFp16",
-                ["X"],
-                ["Y_ref"],
-            )
-        )
-
-        shape_hints = {"X": X.shape}
-        pred_net_onnxified = onnxifi_caffe2_net(pred_net,
-                                                shape_hints,
-                                                debug=True,
-                                                adjust_batch=True,
-                                                use_onnx=False)
-        print(pred_net_onnxified)
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-        workspace.SwitchWorkspace("glow_test_ws", True)
-        workspace.FeedBlob("X", X)
-
-        workspace.CreateNet(pred_net_ref)
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.FeedBlob("X", X)
-        # Run caffe2 net
-        workspace.RunNet(pred_net_ref.name)
-        Y_c2 = workspace.FetchBlob("Y_ref")
-
-        # Run Glow net
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        # Results should be identical since we are comparing with the C2 emulation
-        if not np.allclose(Y_c2, Y_glow):
-            diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
-            print_test_debug_info("Relu", {
-                "X": X,
-                "Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
-            assert(0)
diff --git a/tests/fakelowp/test_sls_4bit_nnpi_fp16.py b/tests/fakelowp/test_sls_4bit_nnpi_fp16.py
deleted file mode 100644
index 4a3689eaf7..0000000000
--- a/tests/fakelowp/test_sls_4bit_nnpi_fp16.py
+++ /dev/null
@@ -1,219 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from __future__ import unicode_literals
-
-import numpy as np
-import time
-
-# Must happen before importing caffe2.python.*
-import glow.fb.test.init_shared_libs  # noqa
-
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core, workspace, dyndep
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from caffe2.python.onnx.tests.test_utils import TestCase
-from glow.fb.test.test_utils import print_test_debug_info
-
-workspace.GlobalInit(["caffe2", "--glow_global_fp16=1",
-                      "--glow_global_fused_scale_offset_fp16=1",
-                      "--glow_global_force_sls_fp16_accum=1"])
-
-
-class SparseLengthsSumTest(TestCase):
-    def test_slws_fused_4bit_rowwise_all_same(self):
-        # Comment out for predictable debugging
-        seed = int(time.time())
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-        n = 1
-        m = 2
-        data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
-
-        max_segments = 5
-        max_segment_length = 100
-        num_lengths = np.random.randint(1, max_segments + 1)
-        # number of segments to run
-        lengths = np.random.randint(0, max_segment_length + 1,
-                                    size=num_lengths).astype(np.int32)
-        num_indices = np.sum(lengths)
-        indices = np.zeros(num_indices, dtype=np.int64)
-        weights = np.random.uniform(low=-0.5, high=0.5,
-            size=[len(indices)]).astype(np.float32)
-        weights = np.ones(len(indices)).astype(np.float32)
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"])
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused4BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"])
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused4BitRowwiseQuantized",
-                ['data'],
-                ['quantized_data']
-            )
-        )
-
-        print("quantized", workspace.FetchBlob("quantized_data"))
-        pred_net_onnxified = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=max_segments,
-            max_seq_size=max_segments * max_segment_length,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False
-        )
-
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob('Y')
-
-        workspace.RunNet(ref_net.name)
-        Y_c2 = workspace.FetchBlob('Y')
-
-        if not np.allclose(Y_c2, Y_glow):
-            print_test_debug_info(
-                "slws_fused_4bit_rowwise",
-                {"seed": seed,
-                 "indices": indices,
-                 "data": data,
-                 "lengths": lengths,
-                 "weights": weights,
-                 "Y_c2": Y_c2,
-                 "Y_glow": Y_glow,
-                 "diff": Y_glow - Y_c2,
-                 "rowwise_diff": (Y_glow - Y_c2)[:, 0]})
-            assert(0)
-
-    def test_slws_fused_4bit_rowwise(self):
-        # Comment out for predictable debugging
-        seed = int(time.time() * 1000) % 2 ** 16
-        print(seed)
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-
-        n = 20000
-        DIM = 6
-        data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32)
-
-        max_segments = 200
-        max_segment_length = 200
-        num_lengths = np.random.randint(0, max_segments + 1)
-        # number of segments to run
-        lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype(
-            np.int32
-        )
-        num_indices = np.sum(lengths)
-        indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64)
-        weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype(
-            np.float32
-        )
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused4BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused4BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=max_segments,
-            max_seq_size=max_segments * max_segment_length,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(onnxified_net)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(onnxified_net.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_ref = workspace.FetchBlob("Y")
-
-        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
-        max_err = np.max(diff, axis=1)
-        num_offenders = (max_err > 0).sum()
-        if num_offenders > 0:
-            print_test_debug_info(
-                "slws_fused_4bit",
-                {
-                    "indices": indices,
-                    "data": data.shape,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_glow": Y_glow,
-                    "Y_ref": Y_ref,
-                    "diff": diff,
-                    "rowwise_diff": np.max(diff, axis=1),
-                },
-            )
-            assert 0
diff --git a/tests/fakelowp/test_sls_nnpi_fp16.py b/tests/fakelowp/test_sls_nnpi_fp16.py
deleted file mode 100644
index 7e09610622..0000000000
--- a/tests/fakelowp/test_sls_nnpi_fp16.py
+++ /dev/null
@@ -1,616 +0,0 @@
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import time
-
-# Must happen before importing caffe2.python.*
-import glow.fb.test.init_shared_libs  # noqa
-import numpy as np
-from caffe2.proto import caffe2_pb2
-from caffe2.python import core, workspace
-from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
-from caffe2.python.onnx.tests.test_utils import TestCase
-from glow.fb.test.test_utils import print_test_debug_info
-
-
-workspace.GlobalInit(
-    [
-        "caffe2",
-        "--glow_global_fp16=1",
-        "--glow_global_fused_scale_offset_fp16=1",
-        "--glow_global_force_sls_fp16_accum=1",
-    ]
-)
-GLOW_MATMUL_ATOL = 1e-5
-GLOW_MATMUL_RTOL = 1e-3
-
-
-class SparseLengthsSumTest(TestCase):
-    def Test_SLS_NonQuantized_fp16(self):
-        N = 20000
-        DIM = 64
-        D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
-        I = (np.random.randint(0, N, size=12)).astype(np.int64)
-        L = np.asarray([4, 4, 4]).astype(np.int32)
-        workspace.FeedBlob("D", D)
-
-        ref_c2_net = core.Net("test_ref_c2")
-        ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
-        ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
-        ref_c2_net.Proto().external_output.extend(["ref_out"])
-
-        fp16_c2_net = core.Net("test_fp16_c2")
-        fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"], "fp16_out")
-
-        input_dict = {}
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(["D", "I", "L"])
-        pred_net.external_output.append("glow_out")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator("SparseLengthsSum", ["D", "I", "L"], ["glow_out"])
-        )
-
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            input_dict,
-            max_batch_size=3,
-            max_seq_size=16,
-            debug=True,
-            adjust_batch=False,
-            use_onnx=False,
-        )
-
-        num_onnxified_ops = sum(
-            1 if op.type == "Onnxifi" else 0 for op in onnxified_net.op
-        )
-        print(onnxified_net)
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        workspace.FeedBlob("I", I)
-        workspace.FeedBlob("L", L)
-
-        workspace.RunNetOnce(ref_c2_net)
-        ref_c2_out = workspace.FetchBlob("ref_out")
-
-        workspace.RunNetOnce(fp16_c2_net)
-        fp16_c2_out = workspace.FetchBlob("fp16_out")
-
-        np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)
-
-        workspace.RunNetOnce(onnxified_net)
-        fp16_glow_out = workspace.FetchBlob("glow_out")
-
-        if not np.allclose(fp16_glow_out, fp16_c2_out):
-            diff = np.abs(fp16_glow_out - fp16_c2_out)
-            print_test_debug_info(
-                "sls",
-                {
-                    "indices": I,
-                    "data": D,
-                    "lengths": L,
-                    "Y_c2": fp16_c2_out,
-                    "Y_glow": fp16_glow_out,
-                    "diff": diff,
-                    "rowwise_diff": diff[:, 0],
-                },
-            )
-            assert 0
-
-    def test_slws_fused_8bit_rowwise_all_same(self):
-        # Comment out for predictable debugging
-        np.random.seed(int(time.time()))
-        workspace.ResetWorkspace()
-        n = 1
-        m = 2
-        data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
-
-        max_segments = 5
-        max_segment_length = 200
-        num_lengths = np.random.randint(1, max_segments + 1)
-        # number of segments to run
-        lengths = np.random.randint(0, max_segment_length + 1, size=num_lengths).astype(
-            np.int32
-        )
-        num_indices = np.sum(lengths)
-        indices = np.zeros(num_indices, dtype=np.int64)
-        weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)]).astype(
-            np.float32
-        )
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-        pred_net_onnxified = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=max_segments,
-            max_seq_size=max_segments * max_segment_length,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-
-        num_onnxified_ops = sum(
-            1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
-        )
-        np.testing.assert_equal(num_onnxified_ops, 1)
-
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(pred_net_onnxified)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(pred_net_onnxified.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_c2 = workspace.FetchBlob("Y")
-
-        if not np.allclose(Y_c2, Y_glow):
-            print_test_debug_info(
-                "slws_fused_8bit_rowwise",
-                {
-                    "indices": indices,
-                    "data": data,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_c2": Y_c2,
-                    "Y_glow": Y_glow,
-                    "diff": Y_glow - Y_c2,
-                    "rowwise_diff": (Y_glow - Y_c2)[:, 0],
-                },
-            )
-            assert 0
-
-    def test_slws_fused_8bit_rowwise_turkey(self):
-        # Comment out for predictable debugging
-        seed = int(time.time() * 1000) % 2 ** 16
-        print(seed)
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-
-        n = 20000
-        DIM = 6
-        data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32)
-
-        max_segments = 200
-        max_segment_length = 200
-        num_lengths = np.random.randint(0, max_segments + 1)
-        # number of segments to run
-        lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype(
-            np.int32
-        )
-        num_indices = np.sum(lengths)
-        indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64)
-        weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype(
-            np.float32
-        )
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=max_segments,
-            max_seq_size=max_segments * max_segment_length,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(onnxified_net)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(onnxified_net.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_ref = workspace.FetchBlob("Y")
-
-        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
-        max_err = np.max(diff, axis=1)
-        num_offenders = (max_err > 0).sum()
-        if num_offenders > 0:
-            print_test_debug_info(
-                "slws_fused_8bit_rowwise_inv_scale",
-                {
-                    "indices": indices,
-                    "data": data.shape,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_glow": Y_glow,
-                    "Y_ref": Y_ref,
-                    "diff": diff,
-                    "rowwise_diff": np.max(diff, axis=1),
-                },
-            )
-            assert 0
-
-    # Simple test to aid debugging order of operations
-    # Minimize the case to an SLS that adds two rows
-    def test_small_sls(self):
-        seed = int(time.time() * 1000) % 2 ** 16
-        print(seed)
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-
-        n = 2
-        DIM = 3
-        data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
-
-        lengths = np.array([n], dtype=np.int32)
-        indices = np.array(range(n), dtype=np.int64)
-        weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-
-        quantized_data = workspace.FetchBlob("quantized_data")
-
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=1,
-            max_seq_size=n,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(onnxified_net)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(onnxified_net.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_ref = workspace.FetchBlob("Y")
-
-        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
-        max_err = np.max(diff, axis=1)
-        num_offenders = (max_err > 0).sum()
-        if num_offenders > 0:
-            np.set_printoptions(precision=12)
-            print(
-                "ref",
-                Y_ref.astype(np.float16).astype(np.float32),
-                "glow",
-                Y_glow.astype(np.float16).astype(np.float32),
-            )
-            print_test_debug_info(
-                "slws_fused_8bit_rowwise_inv_scale",
-                {
-                    "seed": seed,
-                    "indices": indices,
-                    "data": data,
-                    "quantized_data": quantized_data,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_glow": Y_glow,
-                    "Y_ref": Y_ref,
-                    "diff": diff,
-                    "rowwise_diff": np.max(diff, axis=1),
-                },
-            )
-            assert 0
-
-    def test_small_sls_acc32(self):
-
-        workspace.GlobalInit(
-            [
-                "caffe2",
-                "--glow_global_fp16=0",
-                "--glow_global_fused_scale_offset_fp16=0",
-                "--glow_global_force_sls_fp16_accum=0",
-            ]
-        )
-        seed = int(time.time() * 1000) % 2 ** 16
-        print(seed)
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-
-        n = 2
-        DIM = 3
-        data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
-
-        lengths = np.array([n], dtype=np.int32)
-        indices = np.array(range(n), dtype=np.int64)
-        weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-
-        quantized_data = workspace.FetchBlob("quantized_data")
-
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=1,
-            max_seq_size=n,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(onnxified_net)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(onnxified_net.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_ref = workspace.FetchBlob("Y")
-
-        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
-        max_err = np.max(diff, axis=1)
-        num_offenders = (max_err > 0).sum()
-        if num_offenders > 0:
-            np.set_printoptions(precision=12)
-            print(
-                "ref",
-                Y_ref.astype(np.float16).astype(np.float32),
-                "glow",
-                Y_glow.astype(np.float16).astype(np.float32),
-            )
-            print_test_debug_info(
-                "test_small_sls_acc32",
-                {
-                    "seed": seed,
-                    "indices": indices,
-                    "data": data,
-                    "quantized_data": quantized_data,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_glow": Y_glow,
-                    "Y_ref": Y_ref,
-                    "diff": diff,
-                    "rowwise_diff": np.max(diff, axis=1),
-                },
-            )
-            assert 0
-
-    def test_slws_fused_8bit_rowwise_acc32_nnpi(self):
-        workspace.GlobalInit(
-            [
-                "caffe2",
-                "--glow_global_fp16=0",
-                "--glow_global_fused_scale_offset_fp16=0",
-                "--glow_global_force_sls_fp16_accum=0",
-            ]
-        )
-        # Comment out for predictable debugging
-        seed = int(time.time() * 1000) % 2 ** 16
-        print(seed)
-        np.random.seed(seed)
-        workspace.ResetWorkspace()
-
-        n = 20000
-        DIM = 6
-        data = (4 * np.random.random_sample((n, DIM)) + 1).astype(np.float32)
-
-        max_segments = 200
-        max_segment_length = 200
-        num_lengths = np.random.randint(0, max_segments + 1)
-        # number of segments to run
-        lengths = np.random.randint(2, max_segment_length + 1, size=num_lengths).astype(
-            np.int32
-        )
-        num_indices = np.sum(lengths)
-        indices = np.random.randint(low=0, high=n, size=num_indices, dtype=np.int64)
-        weights = np.random.uniform(low=0.01, high=0.5, size=[len(indices)]).astype(
-            np.float32
-        )
-
-        pred_net = caffe2_pb2.NetDef()
-        pred_net.name = "pred"
-        pred_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        pred_net.external_output.append("Y")
-        pred_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwise",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        ref_net = caffe2_pb2.NetDef()
-        ref_net.name = "ref"
-        ref_net.external_input.extend(
-            ["quantized_data", "weights", "indices", "lengths"]
-        )
-        ref_net.external_output.append("Y")
-        ref_net.op.add().CopyFrom(
-            core.CreateOperator(
-                "SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
-                ["quantized_data", "weights", "indices", "lengths"],
-                ["Y"],
-            )
-        )
-
-        workspace.FeedBlob("data", data)
-        workspace.RunOperatorOnce(
-            core.CreateOperator(
-                "FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
-            )
-        )
-        onnxified_net = onnxifi_caffe2_net(
-            pred_net,
-            {},
-            max_batch_size=max_segments,
-            max_seq_size=max_segments * max_segment_length,
-            debug=True,
-            adjust_batch=True,
-            use_onnx=False,
-        )
-        workspace.FeedBlob("indices", indices)
-        workspace.FeedBlob("lengths", lengths)
-        workspace.FeedBlob("weights", weights)
-
-        workspace.CreateNet(onnxified_net)
-        workspace.CreateNet(ref_net)
-
-        workspace.RunNet(onnxified_net.name)
-        Y_glow = workspace.FetchBlob("Y")
-
-        workspace.RunNet(ref_net.name)
-        Y_ref = workspace.FetchBlob("Y")
-
-        diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
-        max_err = np.max(diff, axis=1)
-        num_offenders = (max_err > 0).sum()
-        if num_offenders > 0:
-            print_test_debug_info(
-                "test_slws_fused_8bit_rowwise_acc32_nnpi",
-                {
-                    "indices": indices,
-                    "data": data.shape,
-                    "lengths": lengths,
-                    "weights": weights,
-                    "Y_glow": Y_glow,
-                    "Y_ref": Y_ref,
-                    "diff": diff,
-                    "rowwise_diff": np.max(diff, axis=1),
-                },
-            )
-            assert 0